Note: in the 16-byte tail loop, round keys 6..13 are held in const locals and
the final round key and input block use loop-local temporaries, so no round
key is overwritten between iterations. The "index" line below still carries
the blob IDs of the patch this was derived from.

diff --git a/node/AES.cpp b/node/AES.cpp
index 4b2cbb149..3df1474a4 100644
--- a/node/AES.cpp
+++ b/node/AES.cpp
@@ -478,7 +478,8 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
 		uint64_t c1 = Utils::ntoh(_ctr[1]);
 
 		// There are 16 XMM registers. We can reserve six of them for the
-		// first six parts of the expanded AES key.
+		// first six parts of the expanded AES key. The rest are used for
+		// other key material, counter, or data depending on the chunk size.
 		const __m128i k0 = _aes._k.ni.k[0];
 		const __m128i k1 = _aes._k.ni.k[1];
 		const __m128i k2 = _aes._k.ni.k[2];
@@ -642,41 +643,47 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
 			out += 64;
 		}
 
-		while (len >= 16) {
-			__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-			d0 = _mm_xor_si128(d0,k0);
-			d0 = _mm_aesenc_si128(d0,k1);
-			__m128i ka = _aes._k.ni.k[6];
-			d0 = _mm_aesenc_si128(d0,k2);
-			__m128i kb = _aes._k.ni.k[7];
-			d0 = _mm_aesenc_si128(d0,k3);
-			__m128i kc = _aes._k.ni.k[8];
-			d0 = _mm_aesenc_si128(d0,k4);
-			__m128i kd = _aes._k.ni.k[9];
-			d0 = _mm_aesenc_si128(d0,k5);
-			__m128i ke = _aes._k.ni.k[10];
-			d0 = _mm_aesenc_si128(d0,ka);
-			__m128i kf = _aes._k.ni.k[11];
-			d0 = _mm_aesenc_si128(d0,kb);
-			__m128i kg = _aes._k.ni.k[12];
-			d0 = _mm_aesenc_si128(d0,kc);
-			__m128i p0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
-			d0 = _mm_aesenc_si128(d0,kd);
-			__m128i kh = _aes._k.ni.k[13];
-			d0 = _mm_aesenc_si128(d0,ke);
-			ka = _aes._k.ni.k[14];
-			d0 = _mm_aesenc_si128(d0,kf);
-			d0 = _mm_aesenc_si128(d0,kg);
-			d0 = _mm_aesenc_si128(d0,kh);
-			d0 = _mm_aesenclast_si128(d0,ka);
-			p0 = _mm_xor_si128(d0,p0);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(out),p0);
+		{
+			// Round keys 6..13 are loaded once and kept const for the whole loop
+			// so that every 16-byte block is encrypted with the same key schedule.
+			const __m128i ka = _aes._k.ni.k[6];
+			const __m128i kb = _aes._k.ni.k[7];
+			const __m128i kc = _aes._k.ni.k[8];
+			const __m128i kd = _aes._k.ni.k[9];
+			const __m128i ke = _aes._k.ni.k[10];
+			const __m128i kf = _aes._k.ni.k[11];
+			const __m128i kg = _aes._k.ni.k[12];
+			const __m128i kh = _aes._k.ni.k[13];
+			while (len >= 16) {
+				__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
+				d0 = _mm_xor_si128(d0,k0);
+				d0 = _mm_aesenc_si128(d0,k1);
+				d0 = _mm_aesenc_si128(d0,k2);
+				d0 = _mm_aesenc_si128(d0,k3);
+				d0 = _mm_aesenc_si128(d0,k4);
+				d0 = _mm_aesenc_si128(d0,k5);
+				d0 = _mm_aesenc_si128(d0,ka);
+				d0 = _mm_aesenc_si128(d0,kb);
+				d0 = _mm_aesenc_si128(d0,kc);
+				d0 = _mm_aesenc_si128(d0,kd);
+				d0 = _mm_aesenc_si128(d0,ke);
+				d0 = _mm_aesenc_si128(d0,kf);
+				d0 = _mm_aesenc_si128(d0,kg);
+				d0 = _mm_aesenc_si128(d0,kh);
+				// The final round key and the input block live in loop-local
+				// temporaries; reusing ka/kb here would corrupt the next iteration.
+				const __m128i kl = _aes._k.ni.k[14];
+				__m128i p0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
+				d0 = _mm_aesenclast_si128(d0,kl);
+				p0 = _mm_xor_si128(d0,p0);
+				_mm_storeu_si128(reinterpret_cast<__m128i *>(out),p0);
 
-			in += 16;
-			len -= 16;
-			out += 16;
+				in += 16;
+				len -= 16;
+				out += 16;
 
-			if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
+				if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
+			}
 		}
 
 		// Any remaining input is placed in _out. This will be picked up and crypted