From ec76f6e1d2b6798e898a8ddc3ef4651bc250c561 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Thu, 3 Dec 2020 22:29:38 -0500 Subject: [PATCH] Mirror over a few more AES fixes. --- core/AES.cpp | 22 ++++++---------------- core/AES_aesni.cpp | 22 ++++++++++++++++++++++ core/AES_armcrypto.cpp | 17 ++++++++--------- 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/core/AES.cpp b/core/AES.cpp index 8402fc908..8f2f30d20 100644 --- a/core/AES.cpp +++ b/core/AES.cpp @@ -149,22 +149,12 @@ void AES::GMAC::update(const void *const data, unsigned int len) noexcept } } - if (likely(((uintptr_t)in & 7U) == 0U)) { - while (len >= 16) { - y0 ^= *reinterpret_cast(in); - y1 ^= *reinterpret_cast(in + 8); - in += 16; - s_gfmul(h0, h1, y0, y1); - len -= 16; - } - } else { - while (len >= 16) { - y0 ^= Utils::loadMachineEndian< uint64_t >(in); - y1 ^= Utils::loadMachineEndian< uint64_t >(in + 8); - in += 16; - s_gfmul(h0, h1, y0, y1); - len -= 16; - } + while (len >= 16) { + y0 ^= Utils::loadMachineEndian< uint64_t >(in); + y1 ^= Utils::loadMachineEndian< uint64_t >(in + 8); + in += 16; + s_gfmul(h0, h1, y0, y1); + len -= 16; } _y[0] = y0; diff --git a/core/AES_aesni.cpp b/core/AES_aesni.cpp index 6fe705ecf..a185b1b36 100644 --- a/core/AES_aesni.cpp +++ b/core/AES_aesni.cpp @@ -26,7 +26,9 @@ namespace { const __m128i s_sseSwapBytes = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +#ifdef __GNUC__ __attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,pclmul"))) +#endif __m128i p_gmacPCLMUL128(const __m128i h, __m128i y) noexcept { y = _mm_shuffle_epi8(y, s_sseSwapBytes); @@ -57,7 +59,9 @@ __m128i p_gmacPCLMUL128(const __m128i h, __m128i y) noexcept #define ZT_AES_VAES512 1 +#ifdef __GNUC__ __attribute__((__target__("sse4,aes,avx,avx2,vaes,avx512f,avx512bw"))) +#endif void p_aesCtrInnerVAES512(unsigned int &len, const uint64_t c0, uint64_t &c1, const uint8_t *&in, uint8_t *&out, const __m128i *const k) noexcept { const __m512i kk0 = _mm512_broadcast_i32x4(k[0]); @@ -107,7 +111,9 @@ void p_aesCtrInnerVAES512(unsigned int &len, const uint64_t c0, uint64_t &c1, co #define ZT_AES_VAES256 1 +#ifdef __GNUC__ __attribute__((__target__("sse4,aes,avx,avx2,vaes"))) +#endif void p_aesCtrInnerVAES256(unsigned int &len, const uint64_t c0, uint64_t &c1, const uint8_t *&in, uint8_t *&out, const __m128i *const k) noexcept { const __m256i kk0 = _mm256_broadcastsi128_si256(k[0]); @@ -175,7 +181,9 @@ void p_aesCtrInnerVAES256(unsigned int &len, const uint64_t c0, uint64_t &c1, co #endif // does compiler support AVX2 and AVX512 AES intrinsics? +#ifdef __GNUC__ __attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,aes,pclmul"))) +#endif __m128i p_init256_1_aesni(__m128i a, __m128i b) noexcept { __m128i x, y; @@ -190,7 +198,9 @@ __m128i p_init256_1_aesni(__m128i a, __m128i b) noexcept return x; } +#ifdef __GNUC__ __attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,aes,pclmul"))) +#endif __m128i p_init256_2_aesni(__m128i a, __m128i b) noexcept { __m128i x, y, z; @@ -208,7 +218,9 @@ __m128i p_init256_2_aesni(__m128i a, __m128i b) noexcept } // anonymous namespace +#ifdef __GNUC__ __attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,pclmul"))) +#endif void AES::GMAC::p_aesNIUpdate(const uint8_t *in, unsigned int len) noexcept { __m128i y = _mm_loadu_si128(reinterpret_cast(_y)); @@ -274,7 +286,9 @@ void AES::GMAC::p_aesNIUpdate(const uint8_t *in, unsigned int len) noexcept _rp = len; // len is always less than 16 here } +#ifdef __GNUC__ __attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,pclmul,aes"))) +#endif void AES::GMAC::p_aesNIFinish(uint8_t tag[16]) noexcept { __m128i y = _mm_loadu_si128(reinterpret_cast(_y)); @@ -345,7 +359,9 @@ void AES::GMAC::p_aesNIFinish(uint8_t tag[16]) noexcept _mm_storeu_si128(reinterpret_cast<__m128i *>(tag), _mm_xor_si128(_mm_shuffle_epi8(t4, s_sseSwapBytes), encIV)); } +#ifdef __GNUC__ __attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,aes"))) +#endif void AES::CTR::p_aesNICrypt(const uint8_t *in, uint8_t *out, unsigned int len) noexcept { const __m128i dd = _mm_set_epi64x(0, (long long)_ctr[0]); @@ -542,7 +558,9 @@ void AES::CTR::p_aesNICrypt(const uint8_t *in, uint8_t *out, unsigned int len) n _ctr[1] = Utils::hton(c1); } +#ifdef __GNUC__ __attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,aes,pclmul"))) +#endif void AES::p_init_aesni(const uint8_t *key) noexcept { __m128i t1, t2, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13; @@ -604,7 +622,9 @@ void AES::p_init_aesni(const uint8_t *key) noexcept p_k.ni.h2[3] = _mm_xor_si128(_mm_shuffle_epi32(hhhh, 78), hhhh); } +#ifdef __GNUC__ __attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,aes,pclmul"))) +#endif void AES::p_encrypt_aesni(const void *const in, void *const out) const noexcept { __m128i tmp = _mm_loadu_si128((const __m128i *)in); @@ -625,7 +645,9 @@ void AES::p_encrypt_aesni(const void *const in, void *const out) const noexcept _mm_storeu_si128((__m128i *)out, _mm_aesenclast_si128(tmp, p_k.ni.k[14])); } +#ifdef __GNUC__ __attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,aes,pclmul"))) +#endif void AES::p_decrypt_aesni(const void *in, void *out) const noexcept { __m128i tmp = _mm_loadu_si128((const __m128i *)in); diff --git a/core/AES_armcrypto.cpp b/core/AES_armcrypto.cpp index 30a7ec35a..c77aa0779 100644 --- a/core/AES_armcrypto.cpp +++ b/core/AES_armcrypto.cpp @@ -131,7 +131,7 @@ void AES::CTR::p_armCrypt(const uint8_t *in, uint8_t *out, unsigned int len) noe uint8x16_t k14 = _aes.p_k.neon.ek[14]; unsigned int totalLen = _len; - if ((totalLen & 15U)) { + if ((totalLen & 15U) != 0) { for (;;) { if (unlikely(!len)) { vst1q_u8(reinterpret_cast(_ctr), vrev32q_u8(dd)); @@ -140,7 +140,7 @@ void AES::CTR::p_armCrypt(const uint8_t *in, uint8_t *out, unsigned int len) noe } --len; out[totalLen++] = *(in++); - if (!(totalLen & 15U)) { + if ((totalLen & 15U) == 0) { uint8_t *const otmp = out + (totalLen - 16); uint8x16_t d0 = vrev32q_u8(dd); uint8x16_t pt = vld1q_u8(otmp); @@ -180,7 +180,10 @@ void AES::CTR::p_armCrypt(const uint8_t *in, uint8_t *out, unsigned int len) noe uint8x16_t d2 = vrev32q_u8(dd2); uint8x16_t d3 = vrev32q_u8(dd3); uint8x16_t pt0 = vld1q_u8(in); - in += 16; + uint8x16_t pt1 = vld1q_u8(in + 16); + uint8x16_t pt2 = vld1q_u8(in + 32); + uint8x16_t pt3 = vld1q_u8(in + 48); + d0 = vaesmcq_u8(vaeseq_u8(d0, k0)); d1 = vaesmcq_u8(vaeseq_u8(d1, k0)); d2 = vaesmcq_u8(vaeseq_u8(d2, k0)); @@ -193,8 +196,6 @@ void AES::CTR::p_armCrypt(const uint8_t *in, uint8_t *out, unsigned int len) noe d1 = vaesmcq_u8(vaeseq_u8(d1, k2)); d2 = vaesmcq_u8(vaeseq_u8(d2, k2)); d3 = vaesmcq_u8(vaeseq_u8(d3, k2)); - uint8x16_t pt1 = vld1q_u8(in); - in += 16; d0 = vaesmcq_u8(vaeseq_u8(d0, k3)); d1 = vaesmcq_u8(vaeseq_u8(d1, k3)); d2 = vaesmcq_u8(vaeseq_u8(d2, k3)); @@ -207,8 +208,6 @@ void AES::CTR::p_armCrypt(const uint8_t *in, uint8_t *out, unsigned int len) noe d1 = vaesmcq_u8(vaeseq_u8(d1, k5)); d2 = vaesmcq_u8(vaeseq_u8(d2, k5)); d3 = vaesmcq_u8(vaeseq_u8(d3, k5)); - uint8x16_t pt2 = vld1q_u8(in); - in += 16; d0 = vaesmcq_u8(vaeseq_u8(d0, k6)); d1 = vaesmcq_u8(vaeseq_u8(d1, k6)); d2 = vaesmcq_u8(vaeseq_u8(d2, k6)); @@ -221,8 +220,6 @@ void AES::CTR::p_armCrypt(const uint8_t *in, uint8_t *out, unsigned int len) noe d1 = vaesmcq_u8(vaeseq_u8(d1, k8)); d2 = vaesmcq_u8(vaeseq_u8(d2, k8)); d3 = vaesmcq_u8(vaeseq_u8(d3, k8)); - uint8x16_t pt3 = vld1q_u8(in); - in += 16; d0 = vaesmcq_u8(vaeseq_u8(d0, k9)); d1 = vaesmcq_u8(vaeseq_u8(d1, k9)); d2 = vaesmcq_u8(vaeseq_u8(d2, k9)); @@ -253,7 +250,9 @@ void AES::CTR::p_armCrypt(const uint8_t *in, uint8_t *out, unsigned int len) noe vst1q_u8(out + 16, d1); vst1q_u8(out + 32, d2); vst1q_u8(out + 48, d3); + out += 64; + in += 64; dd = (uint8x16_t)vaddq_u32((uint32x4_t)dd, four); if (unlikely(len < 64))