diff --git a/node/AES.cpp b/node/AES.cpp
index e27fe904b..7313bfaa4 100644
--- a/node/AES.cpp
+++ b/node/AES.cpp
@@ -508,6 +508,7 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
 				out[totalLen++] = *(in++);
 				if (!(totalLen & 15U)) {
 					__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
+					if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
 					d0 = _mm_xor_si128(d0,k0);
 					d0 = _mm_aesenc_si128(d0,k1);
 					d0 = _mm_aesenc_si128(d0,k2);
@@ -526,7 +527,6 @@
 					d0 = _mm_aesenc_si128(d0,k13);
 					d0 = _mm_aesenclast_si128(d0,k14);
 					_mm_storeu_si128(outblk,_mm_xor_si128(p0,d0));
-					if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
 					break;
 				}
 			}
@@ -535,147 +535,265 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
 	out += totalLen;
 	_len = (totalLen + len);
 
-	while (len >= 64) {
-		__m128i d0,d1,d2,d3;
-		if (likely(c1 < 0xfffffffffffffffcULL)) {
-			d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-			d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
-			d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
-			d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
+	if (likely((c1 + len) > c1)) { // it's incredibly likely that we can ignore carry in counter increment
+		while (len >= 64) {
+			__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
+			__m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
+			__m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
+			__m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
 			c1 += 4;
-		} else {
-			d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-			if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-			d1 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-			if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-			d2 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-			if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-			d3 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-			if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
+
+			d0 = _mm_xor_si128(d0,k0);
+			d1 = _mm_xor_si128(d1,k0);
+			d2 = _mm_xor_si128(d2,k0);
+			d3 = _mm_xor_si128(d3,k0);
+			d0 = _mm_aesenc_si128(d0,k1);
+			d1 = _mm_aesenc_si128(d1,k1);
+			d2 = _mm_aesenc_si128(d2,k1);
+			d3 = _mm_aesenc_si128(d3,k1);
+			__m128i ka = k[6];
+			d0 = _mm_aesenc_si128(d0,k2);
+			d1 = _mm_aesenc_si128(d1,k2);
+			d2 = _mm_aesenc_si128(d2,k2);
+			d3 = _mm_aesenc_si128(d3,k2);
+			__m128i kb = k[7];
+			d0 = _mm_aesenc_si128(d0,k3);
+			d1 = _mm_aesenc_si128(d1,k3);
+			d2 = _mm_aesenc_si128(d2,k3);
+			d3 = _mm_aesenc_si128(d3,k3);
+			__m128i kc = k[8];
+			d0 = _mm_aesenc_si128(d0,k4);
+			d1 = _mm_aesenc_si128(d1,k4);
+			d2 = _mm_aesenc_si128(d2,k4);
+			d3 = _mm_aesenc_si128(d3,k4);
+			__m128i kd = k[9];
+			d0 = _mm_aesenc_si128(d0,k5);
+			d1 = _mm_aesenc_si128(d1,k5);
+			d2 = _mm_aesenc_si128(d2,k5);
+			d3 = _mm_aesenc_si128(d3,k5);
+			__m128i ke = k[10];
+			d0 = _mm_aesenc_si128(d0,ka);
+			d1 = _mm_aesenc_si128(d1,ka);
+			d2 = _mm_aesenc_si128(d2,ka);
+			d3 = _mm_aesenc_si128(d3,ka);
+			__m128i kf = k[11];
+			d0 = _mm_aesenc_si128(d0,kb);
+			d1 = _mm_aesenc_si128(d1,kb);
+			d2 = _mm_aesenc_si128(d2,kb);
+			d3 = _mm_aesenc_si128(d3,kb);
+			ka = k[12];
+			d0 = _mm_aesenc_si128(d0,kc);
+			d1 = _mm_aesenc_si128(d1,kc);
+			d2 = _mm_aesenc_si128(d2,kc);
+			d3 = _mm_aesenc_si128(d3,kc);
+			kb = k[13];
+			d0 = _mm_aesenc_si128(d0,kd);
+			d1 = _mm_aesenc_si128(d1,kd);
+			d2 = _mm_aesenc_si128(d2,kd);
+			d3 = _mm_aesenc_si128(d3,kd);
+			kc = k[14];
+			d0 = _mm_aesenc_si128(d0,ke);
+			d1 = _mm_aesenc_si128(d1,ke);
+			d2 = _mm_aesenc_si128(d2,ke);
+			d3 = _mm_aesenc_si128(d3,ke);
+			kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
+			d0 = _mm_aesenc_si128(d0,kf);
+			d1 = _mm_aesenc_si128(d1,kf);
+			d2 = _mm_aesenc_si128(d2,kf);
+			d3 = _mm_aesenc_si128(d3,kf);
+			ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
+			d0 = _mm_aesenc_si128(d0,ka);
+			d1 = _mm_aesenc_si128(d1,ka);
+			d2 = _mm_aesenc_si128(d2,ka);
+			d3 = _mm_aesenc_si128(d3,ka);
+			kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
+			d0 = _mm_aesenc_si128(d0,kb);
+			d1 = _mm_aesenc_si128(d1,kb);
+			d2 = _mm_aesenc_si128(d2,kb);
+			d3 = _mm_aesenc_si128(d3,kb);
+			ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
+			d0 = _mm_aesenclast_si128(d0,kc);
+			d1 = _mm_aesenclast_si128(d1,kc);
+			d2 = _mm_aesenclast_si128(d2,kc);
+			d3 = _mm_aesenclast_si128(d3,kc);
+			kd = _mm_xor_si128(d0,kd);
+			ke = _mm_xor_si128(d1,ke);
+			kf = _mm_xor_si128(d2,kf);
+			ka = _mm_xor_si128(d3,ka);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
+
+			in += 64;
+			len -= 64;
+			out += 64;
 		}
-		d0 = _mm_xor_si128(d0,k0);
-		d1 = _mm_xor_si128(d1,k0);
-		d2 = _mm_xor_si128(d2,k0);
-		d3 = _mm_xor_si128(d3,k0);
-		d0 = _mm_aesenc_si128(d0,k1);
-		d1 = _mm_aesenc_si128(d1,k1);
-		d2 = _mm_aesenc_si128(d2,k1);
-		d3 = _mm_aesenc_si128(d3,k1);
-		__m128i ka = k[6];
-		d0 = _mm_aesenc_si128(d0,k2);
-		d1 = _mm_aesenc_si128(d1,k2);
-		d2 = _mm_aesenc_si128(d2,k2);
-		d3 = _mm_aesenc_si128(d3,k2);
-		__m128i kb = k[7];
-		d0 = _mm_aesenc_si128(d0,k3);
-		d1 = _mm_aesenc_si128(d1,k3);
-		d2 = _mm_aesenc_si128(d2,k3);
-		d3 = _mm_aesenc_si128(d3,k3);
-		__m128i kc = k[8];
-		d0 = _mm_aesenc_si128(d0,k4);
-		d1 = _mm_aesenc_si128(d1,k4);
-		d2 = _mm_aesenc_si128(d2,k4);
-		d3 = _mm_aesenc_si128(d3,k4);
-		__m128i kd = k[9];
-		d0 = _mm_aesenc_si128(d0,k5);
-		d1 = _mm_aesenc_si128(d1,k5);
-		d2 = _mm_aesenc_si128(d2,k5);
-		d3 = _mm_aesenc_si128(d3,k5);
-		__m128i ke = k[10];
-		d0 = _mm_aesenc_si128(d0,ka);
-		d1 = _mm_aesenc_si128(d1,ka);
-		d2 = _mm_aesenc_si128(d2,ka);
-		d3 = _mm_aesenc_si128(d3,ka);
-		__m128i kf = k[11];
-		d0 = _mm_aesenc_si128(d0,kb);
-		d1 = _mm_aesenc_si128(d1,kb);
-		d2 = _mm_aesenc_si128(d2,kb);
-		d3 = _mm_aesenc_si128(d3,kb);
-		ka = k[12];
-		d0 = _mm_aesenc_si128(d0,kc);
-		d1 = _mm_aesenc_si128(d1,kc);
-		d2 = _mm_aesenc_si128(d2,kc);
-		d3 = _mm_aesenc_si128(d3,kc);
-		kb = k[13];
-		d0 = _mm_aesenc_si128(d0,kd);
-		d1 = _mm_aesenc_si128(d1,kd);
-		d2 = _mm_aesenc_si128(d2,kd);
-		d3 = _mm_aesenc_si128(d3,kd);
-		kc = k[14];
-		d0 = _mm_aesenc_si128(d0,ke);
-		d1 = _mm_aesenc_si128(d1,ke);
-		d2 = _mm_aesenc_si128(d2,ke);
-		d3 = _mm_aesenc_si128(d3,ke);
-		kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
-		d0 = _mm_aesenc_si128(d0,kf);
-		d1 = _mm_aesenc_si128(d1,kf);
-		d2 = _mm_aesenc_si128(d2,kf);
-		d3 = _mm_aesenc_si128(d3,kf);
-		ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
-		d0 = _mm_aesenc_si128(d0,ka);
-		d1 = _mm_aesenc_si128(d1,ka);
-		d2 = _mm_aesenc_si128(d2,ka);
-		d3 = _mm_aesenc_si128(d3,ka);
-		kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
-		d0 = _mm_aesenc_si128(d0,kb);
-		d1 = _mm_aesenc_si128(d1,kb);
-		d2 = _mm_aesenc_si128(d2,kb);
-		d3 = _mm_aesenc_si128(d3,kb);
-		ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
-		d0 = _mm_aesenclast_si128(d0,kc);
-		d1 = _mm_aesenclast_si128(d1,kc);
-		d2 = _mm_aesenclast_si128(d2,kc);
-		d3 = _mm_aesenclast_si128(d3,kc);
-		kd = _mm_xor_si128(d0,kd);
-		ke = _mm_xor_si128(d1,ke);
-		kf = _mm_xor_si128(d2,kf);
-		ka = _mm_xor_si128(d3,ka);
-		_mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
-		_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
-		_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
-		_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
+		if (len >= 16) {
+			const __m128i k7 = k[7];
+			const __m128i k8 = k[8];
+			const __m128i k9 = k[9];
+			const __m128i k10 = k[10];
+			const __m128i k11 = k[11];
+			const __m128i k12 = k[12];
+			const __m128i k13 = k[13];
+			const __m128i k14 = k[14];
+			do {
+				__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+				d0 = _mm_xor_si128(d0,k0);
+				d0 = _mm_aesenc_si128(d0,k1);
+				d0 = _mm_aesenc_si128(d0,k2);
+				d0 = _mm_aesenc_si128(d0,k3);
+				d0 = _mm_aesenc_si128(d0,k4);
+				d0 = _mm_aesenc_si128(d0,k5);
+				d0 = _mm_aesenc_si128(d0,k[6]);
+				d0 = _mm_aesenc_si128(d0,k7);
+				d0 = _mm_aesenc_si128(d0,k8);
+				d0 = _mm_aesenc_si128(d0,k9);
+				d0 = _mm_aesenc_si128(d0,k10);
+				d0 = _mm_aesenc_si128(d0,k11);
+				d0 = _mm_aesenc_si128(d0,k12);
+				d0 = _mm_aesenc_si128(d0,k13);
+				d0 = _mm_aesenclast_si128(d0,k14);
+				_mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
-		in += 64;
-		len -= 64;
-		out += 64;
-	}
-
-	{
-		const __m128i k7 = k[7];
-		const __m128i k8 = k[8];
-		const __m128i k9 = k[9];
-		const __m128i k10 = k[10];
-		const __m128i k11 = k[11];
-		const __m128i k12 = k[12];
-		const __m128i k13 = k[13];
-		const __m128i k14 = k[14];
-		while (len >= 16) {
+				in += 16;
+				len -= 16;
+				out += 16;
+			} while (len >= 16);
+		}
+	} else {
+		while (len >= 64) {
 			__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
-			if (unlikely(c1 == 0)) {
-				c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-				d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-			}
-			d0 = _mm_xor_si128(d0,k0);
-			d0 = _mm_aesenc_si128(d0,k1);
-			d0 = _mm_aesenc_si128(d0,k2);
-			d0 = _mm_aesenc_si128(d0,k3);
-			d0 = _mm_aesenc_si128(d0,k4);
-			d0 = _mm_aesenc_si128(d0,k5);
-			d0 = _mm_aesenc_si128(d0,k[6]);
-			d0 = _mm_aesenc_si128(d0,k7);
-			d0 = _mm_aesenc_si128(d0,k8);
-			d0 = _mm_aesenc_si128(d0,k9);
-			d0 = _mm_aesenc_si128(d0,k10);
-			d0 = _mm_aesenc_si128(d0,k11);
-			d0 = _mm_aesenc_si128(d0,k12);
-			d0 = _mm_aesenc_si128(d0,k13);
-			d0 = _mm_aesenclast_si128(d0,k14);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
+			if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
+			__m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+			if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
+			__m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+			if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
+			__m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+			if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-			in += 16;
-			len -= 16;
-			out += 16;
+			d0 = _mm_xor_si128(d0,k0);
+			d1 = _mm_xor_si128(d1,k0);
+			d2 = _mm_xor_si128(d2,k0);
+			d3 = _mm_xor_si128(d3,k0);
+			d0 = _mm_aesenc_si128(d0,k1);
+			d1 = _mm_aesenc_si128(d1,k1);
+			d2 = _mm_aesenc_si128(d2,k1);
+			d3 = _mm_aesenc_si128(d3,k1);
+			__m128i ka = k[6];
+			d0 = _mm_aesenc_si128(d0,k2);
+			d1 = _mm_aesenc_si128(d1,k2);
+			d2 = _mm_aesenc_si128(d2,k2);
+			d3 = _mm_aesenc_si128(d3,k2);
+			__m128i kb = k[7];
+			d0 = _mm_aesenc_si128(d0,k3);
+			d1 = _mm_aesenc_si128(d1,k3);
+			d2 = _mm_aesenc_si128(d2,k3);
+			d3 = _mm_aesenc_si128(d3,k3);
+			__m128i kc = k[8];
+			d0 = _mm_aesenc_si128(d0,k4);
+			d1 = _mm_aesenc_si128(d1,k4);
+			d2 = _mm_aesenc_si128(d2,k4);
+			d3 = _mm_aesenc_si128(d3,k4);
+			__m128i kd = k[9];
+			d0 = _mm_aesenc_si128(d0,k5);
+			d1 = _mm_aesenc_si128(d1,k5);
+			d2 = _mm_aesenc_si128(d2,k5);
+			d3 = _mm_aesenc_si128(d3,k5);
+			__m128i ke = k[10];
+			d0 = _mm_aesenc_si128(d0,ka);
+			d1 = _mm_aesenc_si128(d1,ka);
+			d2 = _mm_aesenc_si128(d2,ka);
+			d3 = _mm_aesenc_si128(d3,ka);
+			__m128i kf = k[11];
+			d0 = _mm_aesenc_si128(d0,kb);
+			d1 = _mm_aesenc_si128(d1,kb);
+			d2 = _mm_aesenc_si128(d2,kb);
+			d3 = _mm_aesenc_si128(d3,kb);
+			ka = k[12];
+			d0 = _mm_aesenc_si128(d0,kc);
+			d1 = _mm_aesenc_si128(d1,kc);
+			d2 = _mm_aesenc_si128(d2,kc);
+			d3 = _mm_aesenc_si128(d3,kc);
+			kb = k[13];
+			d0 = _mm_aesenc_si128(d0,kd);
+			d1 = _mm_aesenc_si128(d1,kd);
+			d2 = _mm_aesenc_si128(d2,kd);
+			d3 = _mm_aesenc_si128(d3,kd);
+			kc = k[14];
+			d0 = _mm_aesenc_si128(d0,ke);
+			d1 = _mm_aesenc_si128(d1,ke);
+			d2 = _mm_aesenc_si128(d2,ke);
+			d3 = _mm_aesenc_si128(d3,ke);
+			kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
+			d0 = _mm_aesenc_si128(d0,kf);
+			d1 = _mm_aesenc_si128(d1,kf);
+			d2 = _mm_aesenc_si128(d2,kf);
+			d3 = _mm_aesenc_si128(d3,kf);
+			ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
+			d0 = _mm_aesenc_si128(d0,ka);
+			d1 = _mm_aesenc_si128(d1,ka);
+			d2 = _mm_aesenc_si128(d2,ka);
+			d3 = _mm_aesenc_si128(d3,ka);
+			kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
+			d0 = _mm_aesenc_si128(d0,kb);
+			d1 = _mm_aesenc_si128(d1,kb);
+			d2 = _mm_aesenc_si128(d2,kb);
+			d3 = _mm_aesenc_si128(d3,kb);
+			ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
+			d0 = _mm_aesenclast_si128(d0,kc);
+			d1 = _mm_aesenclast_si128(d1,kc);
+			d2 = _mm_aesenclast_si128(d2,kc);
+			d3 = _mm_aesenclast_si128(d3,kc);
+			kd = _mm_xor_si128(d0,kd);
+			ke = _mm_xor_si128(d1,ke);
+			kf = _mm_xor_si128(d2,kf);
+			ka = _mm_xor_si128(d3,ka);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
+
+			in += 64;
+			len -= 64;
+			out += 64;
+		}
+
+		if (len >= 16) {
+			const __m128i k7 = k[7];
+			const __m128i k8 = k[8];
+			const __m128i k9 = k[9];
+			const __m128i k10 = k[10];
+			const __m128i k11 = k[11];
+			const __m128i k12 = k[12];
+			const __m128i k13 = k[13];
+			const __m128i k14 = k[14];
+			do {
+				__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+				if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
+				d0 = _mm_xor_si128(d0,k0);
+				d0 = _mm_aesenc_si128(d0,k1);
+				d0 = _mm_aesenc_si128(d0,k2);
+				d0 = _mm_aesenc_si128(d0,k3);
+				d0 = _mm_aesenc_si128(d0,k4);
+				d0 = _mm_aesenc_si128(d0,k5);
+				d0 = _mm_aesenc_si128(d0,k[6]);
+				d0 = _mm_aesenc_si128(d0,k7);
+				d0 = _mm_aesenc_si128(d0,k8);
+				d0 = _mm_aesenc_si128(d0,k9);
+				d0 = _mm_aesenc_si128(d0,k10);
+				d0 = _mm_aesenc_si128(d0,k11);
+				d0 = _mm_aesenc_si128(d0,k12);
+				d0 = _mm_aesenc_si128(d0,k13);
+				d0 = _mm_aesenclast_si128(d0,k14);
+				_mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
+
+				in += 16;
+				len -= 16;
+				out += 16;
+			} while (len >= 16);
 		}
 	}
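Note on the counter handling above (not part of the patch itself): the 128-bit big-endian CTR counter is kept as two 64-bit halves, c0 holding the high half already in big-endian byte order and c1 holding the low half in host order, byte-swapped with Utils::hton() only when a keystream block is assembled. The fast path can drop the per-block carry checks because (c1 + len) > c1 guarantees the low half cannot wrap during this call (len counts bytes, which is always at least the number of 16-byte blocks processed). The stand-alone sketch below restates that split-counter logic in scalar C++; the names SplitCtr, bswap64() and can_skip_carry() are illustrative only and do not appear in AES.cpp:

#include <cstdint>

// Illustrative only: mirrors the split counter used in AES::CTR::crypt().
struct SplitCtr {
	uint64_t hi_be; // high 64 bits, kept pre-swapped to big-endian (like c0)
	uint64_t lo;    // low 64 bits, kept in host order (like c1)
};

// Assumes a GCC/Clang-style builtin; any 64-bit byte swap would do here.
static inline uint64_t bswap64(uint64_t x) noexcept { return __builtin_bswap64(x); }

// Advance the counter by one 16-byte block, carrying into the big-endian high
// half only when the low half wraps (the unlikely() case in the patch).
static inline void ctr_increment(SplitCtr &c) noexcept
{
	if (++c.lo == 0)
		c.hi_be = bswap64(bswap64(c.hi_be) + 1);
}

// True when processing 'len' bytes cannot wrap the low half, i.e. the condition
// the patch tests once up front so the hot loops can skip per-block carry checks.
static inline bool can_skip_carry(const SplitCtr &c, uint64_t len) noexcept
{
	return (c.lo + len) > c.lo; // overflow check; len (bytes) >= number of blocks
}

Keeping the low half in host order makes the common increment a plain 64-bit add, deferring the byte swap to the _mm_set_epi64x() that builds the block; only the rare wrap pays for the double swap of the high half.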