From d151d731a6f69abd25ade5b1b08fd56004ce4e94 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Thu, 26 Mar 2020 10:28:59 -0700 Subject: [PATCH] (1) Mask bit 31 of AES-CTR nonce to disallow uint32 overflow, (2) get rid of handling of uint32/uint64 overflow in our AES-CTR code to optimize, (3) optimize AES software a bit --- node/AES.cpp | 429 +++++++++++++++++---------------------------------- node/AES.hpp | 21 ++- 2 files changed, 165 insertions(+), 285 deletions(-) diff --git a/node/AES.cpp b/node/AES.cpp index 9fa6a9456..d606524c9 100644 --- a/node/AES.cpp +++ b/node/AES.cpp @@ -535,266 +535,132 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept out += totalLen; _len = (totalLen + len); - if (likely((c1 + len) > c1)) { // if this is true we can just increment c1 and ignore c0 - while (len >= 64) { - __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0); - __m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0); - __m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0); - __m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0); - c1 += 4; + while (len >= 64) { + __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0); + __m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0); + __m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0); + __m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0); + c1 += 4; - d0 = _mm_xor_si128(d0,k0); - d1 = _mm_xor_si128(d1,k0); - d2 = _mm_xor_si128(d2,k0); - d3 = _mm_xor_si128(d3,k0); - d0 = _mm_aesenc_si128(d0,k1); - d1 = _mm_aesenc_si128(d1,k1); - d2 = _mm_aesenc_si128(d2,k1); - d3 = _mm_aesenc_si128(d3,k1); - __m128i ka = k[6]; - d0 = _mm_aesenc_si128(d0,k2); - d1 = _mm_aesenc_si128(d1,k2); - d2 = _mm_aesenc_si128(d2,k2); - d3 = _mm_aesenc_si128(d3,k2); - __m128i kb = k[7]; - d0 = _mm_aesenc_si128(d0,k3); - d1 = _mm_aesenc_si128(d1,k3); - d2 = _mm_aesenc_si128(d2,k3); - d3 = _mm_aesenc_si128(d3,k3); - __m128i kc = k[8]; - d0 = _mm_aesenc_si128(d0,k4); - d1 = _mm_aesenc_si128(d1,k4); - d2 = _mm_aesenc_si128(d2,k4); - d3 = _mm_aesenc_si128(d3,k4); - __m128i kd = k[9]; - d0 = _mm_aesenc_si128(d0,k5); - d1 = _mm_aesenc_si128(d1,k5); - d2 = _mm_aesenc_si128(d2,k5); - d3 = _mm_aesenc_si128(d3,k5); - __m128i ke = k[10]; - d0 = _mm_aesenc_si128(d0,ka); - d1 = _mm_aesenc_si128(d1,ka); - d2 = _mm_aesenc_si128(d2,ka); - d3 = _mm_aesenc_si128(d3,ka); - __m128i kf = k[11]; - d0 = _mm_aesenc_si128(d0,kb); - d1 = _mm_aesenc_si128(d1,kb); - d2 = _mm_aesenc_si128(d2,kb); - d3 = _mm_aesenc_si128(d3,kb); - ka = k[12]; - d0 = _mm_aesenc_si128(d0,kc); - d1 = _mm_aesenc_si128(d1,kc); - d2 = _mm_aesenc_si128(d2,kc); - d3 = _mm_aesenc_si128(d3,kc); - kb = k[13]; - d0 = _mm_aesenc_si128(d0,kd); - d1 = _mm_aesenc_si128(d1,kd); - d2 = _mm_aesenc_si128(d2,kd); - d3 = _mm_aesenc_si128(d3,kd); - kc = k[14]; - d0 = _mm_aesenc_si128(d0,ke); - d1 = _mm_aesenc_si128(d1,ke); - d2 = _mm_aesenc_si128(d2,ke); - d3 = _mm_aesenc_si128(d3,ke); - kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in)); - d0 = _mm_aesenc_si128(d0,kf); - d1 = _mm_aesenc_si128(d1,kf); - d2 = _mm_aesenc_si128(d2,kf); - d3 = _mm_aesenc_si128(d3,kf); - ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16)); - d0 = _mm_aesenc_si128(d0,ka); - d1 = _mm_aesenc_si128(d1,ka); - d2 = _mm_aesenc_si128(d2,ka); - d3 = _mm_aesenc_si128(d3,ka); - kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32)); - d0 = _mm_aesenc_si128(d0,kb); - d1 = _mm_aesenc_si128(d1,kb); - d2 =
_mm_aesenc_si128(d2,kb); - d3 = _mm_aesenc_si128(d3,kb); - ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48)); - d0 = _mm_aesenclast_si128(d0,kc); - d1 = _mm_aesenclast_si128(d1,kc); - d2 = _mm_aesenclast_si128(d2,kc); - d3 = _mm_aesenclast_si128(d3,kc); - kd = _mm_xor_si128(d0,kd); - ke = _mm_xor_si128(d1,ke); - kf = _mm_xor_si128(d2,kf); - ka = _mm_xor_si128(d3,ka); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka); + d0 = _mm_xor_si128(d0,k0); + d1 = _mm_xor_si128(d1,k0); + d2 = _mm_xor_si128(d2,k0); + d3 = _mm_xor_si128(d3,k0); + d0 = _mm_aesenc_si128(d0,k1); + d1 = _mm_aesenc_si128(d1,k1); + d2 = _mm_aesenc_si128(d2,k1); + d3 = _mm_aesenc_si128(d3,k1); + __m128i ka = k[6]; + d0 = _mm_aesenc_si128(d0,k2); + d1 = _mm_aesenc_si128(d1,k2); + d2 = _mm_aesenc_si128(d2,k2); + d3 = _mm_aesenc_si128(d3,k2); + __m128i kb = k[7]; + d0 = _mm_aesenc_si128(d0,k3); + d1 = _mm_aesenc_si128(d1,k3); + d2 = _mm_aesenc_si128(d2,k3); + d3 = _mm_aesenc_si128(d3,k3); + __m128i kc = k[8]; + d0 = _mm_aesenc_si128(d0,k4); + d1 = _mm_aesenc_si128(d1,k4); + d2 = _mm_aesenc_si128(d2,k4); + d3 = _mm_aesenc_si128(d3,k4); + __m128i kd = k[9]; + d0 = _mm_aesenc_si128(d0,k5); + d1 = _mm_aesenc_si128(d1,k5); + d2 = _mm_aesenc_si128(d2,k5); + d3 = _mm_aesenc_si128(d3,k5); + __m128i ke = k[10]; + d0 = _mm_aesenc_si128(d0,ka); + d1 = _mm_aesenc_si128(d1,ka); + d2 = _mm_aesenc_si128(d2,ka); + d3 = _mm_aesenc_si128(d3,ka); + __m128i kf = k[11]; + d0 = _mm_aesenc_si128(d0,kb); + d1 = _mm_aesenc_si128(d1,kb); + d2 = _mm_aesenc_si128(d2,kb); + d3 = _mm_aesenc_si128(d3,kb); + ka = k[12]; + d0 = _mm_aesenc_si128(d0,kc); + d1 = _mm_aesenc_si128(d1,kc); + d2 = _mm_aesenc_si128(d2,kc); + d3 = _mm_aesenc_si128(d3,kc); + kb = k[13]; + d0 = _mm_aesenc_si128(d0,kd); + d1 = _mm_aesenc_si128(d1,kd); + d2 = _mm_aesenc_si128(d2,kd); + d3 = _mm_aesenc_si128(d3,kd); + kc = k[14]; + d0 = _mm_aesenc_si128(d0,ke); + d1 = _mm_aesenc_si128(d1,ke); + d2 = _mm_aesenc_si128(d2,ke); + d3 = _mm_aesenc_si128(d3,ke); + kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in)); + d0 = _mm_aesenc_si128(d0,kf); + d1 = _mm_aesenc_si128(d1,kf); + d2 = _mm_aesenc_si128(d2,kf); + d3 = _mm_aesenc_si128(d3,kf); + ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16)); + d0 = _mm_aesenc_si128(d0,ka); + d1 = _mm_aesenc_si128(d1,ka); + d2 = _mm_aesenc_si128(d2,ka); + d3 = _mm_aesenc_si128(d3,ka); + kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32)); + d0 = _mm_aesenc_si128(d0,kb); + d1 = _mm_aesenc_si128(d1,kb); + d2 = _mm_aesenc_si128(d2,kb); + d3 = _mm_aesenc_si128(d3,kb); + ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48)); + d0 = _mm_aesenclast_si128(d0,kc); + d1 = _mm_aesenclast_si128(d1,kc); + d2 = _mm_aesenclast_si128(d2,kc); + d3 = _mm_aesenclast_si128(d3,kc); + kd = _mm_xor_si128(d0,kd); + ke = _mm_xor_si128(d1,ke); + kf = _mm_xor_si128(d2,kf); + ka = _mm_xor_si128(d3,ka); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka); - in += 64; - len -= 64; - out += 64; - } + in += 64; + len -= 64; + out += 64; + } - if (len >= 16) { - const __m128i k7 = k[7]; - const __m128i k8 = k[8]; - const __m128i k9 = k[9]; - const __m128i k10 = k[10]; - const __m128i k11 = k[11]; - const __m128i k12 = k[12]; - const __m128i k13
= k[13]; - const __m128i k14 = k[14]; - do { - __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0); - d0 = _mm_xor_si128(d0,k0); - d0 = _mm_aesenc_si128(d0,k1); - d0 = _mm_aesenc_si128(d0,k2); - d0 = _mm_aesenc_si128(d0,k3); - d0 = _mm_aesenc_si128(d0,k4); - d0 = _mm_aesenc_si128(d0,k5); - d0 = _mm_aesenc_si128(d0,k[6]); - d0 = _mm_aesenc_si128(d0,k7); - d0 = _mm_aesenc_si128(d0,k8); - d0 = _mm_aesenc_si128(d0,k9); - d0 = _mm_aesenc_si128(d0,k10); - d0 = _mm_aesenc_si128(d0,k11); - d0 = _mm_aesenc_si128(d0,k12); - d0 = _mm_aesenc_si128(d0,k13); - d0 = _mm_aesenclast_si128(d0,k14); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in)))); - - in += 16; - len -= 16; - out += 16; - } while (len >= 16); - } - } else { // in the unlikely case c1 is near uint64_max, we must add with carry - while (len >= 64) { + if (len >= 16) { + const __m128i k7 = k[7]; + const __m128i k8 = k[8]; + const __m128i k9 = k[9]; + const __m128i k10 = k[10]; + const __m128i k11 = k[11]; + const __m128i k12 = k[12]; + const __m128i k13 = k[13]; + const __m128i k14 = k[14]; + do { __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0); - if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL); - __m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0); - if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL); - __m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0); - if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL); - __m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0); - if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL); - d0 = _mm_xor_si128(d0,k0); - d1 = _mm_xor_si128(d1,k0); - d2 = _mm_xor_si128(d2,k0); - d3 = _mm_xor_si128(d3,k0); d0 = _mm_aesenc_si128(d0,k1); - d1 = _mm_aesenc_si128(d1,k1); - d2 = _mm_aesenc_si128(d2,k1); - d3 = _mm_aesenc_si128(d3,k1); - __m128i ka = k[6]; d0 = _mm_aesenc_si128(d0,k2); - d1 = _mm_aesenc_si128(d1,k2); - d2 = _mm_aesenc_si128(d2,k2); - d3 = _mm_aesenc_si128(d3,k2); - __m128i kb = k[7]; d0 = _mm_aesenc_si128(d0,k3); - d1 = _mm_aesenc_si128(d1,k3); - d2 = _mm_aesenc_si128(d2,k3); - d3 = _mm_aesenc_si128(d3,k3); - __m128i kc = k[8]; d0 = _mm_aesenc_si128(d0,k4); - d1 = _mm_aesenc_si128(d1,k4); - d2 = _mm_aesenc_si128(d2,k4); - d3 = _mm_aesenc_si128(d3,k4); - __m128i kd = k[9]; d0 = _mm_aesenc_si128(d0,k5); - d1 = _mm_aesenc_si128(d1,k5); - d2 = _mm_aesenc_si128(d2,k5); - d3 = _mm_aesenc_si128(d3,k5); - __m128i ke = k[10]; - d0 = _mm_aesenc_si128(d0,ka); - d1 = _mm_aesenc_si128(d1,ka); - d2 = _mm_aesenc_si128(d2,ka); - d3 = _mm_aesenc_si128(d3,ka); - __m128i kf = k[11]; - d0 = _mm_aesenc_si128(d0,kb); - d1 = _mm_aesenc_si128(d1,kb); - d2 = _mm_aesenc_si128(d2,kb); - d3 = _mm_aesenc_si128(d3,kb); - ka = k[12]; - d0 = _mm_aesenc_si128(d0,kc); - d1 = _mm_aesenc_si128(d1,kc); - d2 = _mm_aesenc_si128(d2,kc); - d3 = _mm_aesenc_si128(d3,kc); - kb = k[13]; - d0 = _mm_aesenc_si128(d0,kd); - d1 = _mm_aesenc_si128(d1,kd); - d2 = _mm_aesenc_si128(d2,kd); - d3 = _mm_aesenc_si128(d3,kd); - kc = k[14]; - d0 = _mm_aesenc_si128(d0,ke); - d1 = _mm_aesenc_si128(d1,ke); - d2 = _mm_aesenc_si128(d2,ke); - d3 = _mm_aesenc_si128(d3,ke); - kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in)); - d0 = _mm_aesenc_si128(d0,kf); - d1 = _mm_aesenc_si128(d1,kf); - d2 = _mm_aesenc_si128(d2,kf); - d3 = _mm_aesenc_si128(d3,kf); - ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16)); - d0 = _mm_aesenc_si128(d0,ka); - d1 =
_mm_aesenc_si128(d1,ka); - d2 = _mm_aesenc_si128(d2,ka); - d3 = _mm_aesenc_si128(d3,ka); - kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32)); - d0 = _mm_aesenc_si128(d0,kb); - d1 = _mm_aesenc_si128(d1,kb); - d2 = _mm_aesenc_si128(d2,kb); - d3 = _mm_aesenc_si128(d3,kb); - ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48)); - d0 = _mm_aesenclast_si128(d0,kc); - d1 = _mm_aesenclast_si128(d1,kc); - d2 = _mm_aesenclast_si128(d2,kc); - d3 = _mm_aesenclast_si128(d3,kc); - kd = _mm_xor_si128(d0,kd); - ke = _mm_xor_si128(d1,ke); - kf = _mm_xor_si128(d2,kf); - ka = _mm_xor_si128(d3,ka); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka); + d0 = _mm_aesenc_si128(d0,k[6]); + d0 = _mm_aesenc_si128(d0,k7); + d0 = _mm_aesenc_si128(d0,k8); + d0 = _mm_aesenc_si128(d0,k9); + d0 = _mm_aesenc_si128(d0,k10); + d0 = _mm_aesenc_si128(d0,k11); + d0 = _mm_aesenc_si128(d0,k12); + d0 = _mm_aesenc_si128(d0,k13); + d0 = _mm_aesenclast_si128(d0,k14); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in)))); - in += 64; - len -= 64; - out += 64; - } - - if (len >= 16) { - const __m128i k7 = k[7]; - const __m128i k8 = k[8]; - const __m128i k9 = k[9]; - const __m128i k10 = k[10]; - const __m128i k11 = k[11]; - const __m128i k12 = k[12]; - const __m128i k13 = k[13]; - const __m128i k14 = k[14]; - do { - __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0); - if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL); - d0 = _mm_xor_si128(d0,k0); - d0 = _mm_aesenc_si128(d0,k1); - d0 = _mm_aesenc_si128(d0,k2); - d0 = _mm_aesenc_si128(d0,k3); - d0 = _mm_aesenc_si128(d0,k4); - d0 = _mm_aesenc_si128(d0,k5); - d0 = _mm_aesenc_si128(d0,k[6]); - d0 = _mm_aesenc_si128(d0,k7); - d0 = _mm_aesenc_si128(d0,k8); - d0 = _mm_aesenc_si128(d0,k9); - d0 = _mm_aesenc_si128(d0,k10); - d0 = _mm_aesenc_si128(d0,k11); - d0 = _mm_aesenc_si128(d0,k12); - d0 = _mm_aesenc_si128(d0,k13); - d0 = _mm_aesenclast_si128(d0,k14); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in)))); - - in += 16; - len -= 16; - out += 16; - } while (len >= 16); - } + in += 16; + len -= 16; + out += 16; + } while (len >= 16); } // Any remaining input is placed in _out.
This will be picked up and crypted @@ -811,7 +677,7 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept } #endif - uint8_t keyStream[16]; + uint64_t keyStream[2]; unsigned int totalLen = _len; if ((totalLen & 15U)) { @@ -823,11 +689,11 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept --len; out[totalLen++] = *(in++); if (!(totalLen & 15U)) { - _aes._encryptSW(reinterpret_cast<const uint8_t *>(_ctr),keyStream); + _aes._encryptSW(reinterpret_cast<const uint8_t *>(_ctr),reinterpret_cast<uint8_t *>(keyStream)); uint8_t *outblk = out + (totalLen - 16); for(int i=0;i<16;++i) - outblk[i] ^= keyStream[i]; + outblk[i] ^= reinterpret_cast<uint8_t *>(keyStream)[i]; - if (unlikely((_ctr[1] = Utils::hton(Utils::ntoh(_ctr[1]) + 1ULL)) == 0)) _ctr[0] = Utils::hton(Utils::ntoh(_ctr[0]) + 1ULL); + _ctr[1] = Utils::hton(Utils::ntoh(_ctr[1]) + 1ULL); break; } } @@ -836,15 +702,31 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept out += totalLen; _len = (totalLen + len); - while (len >= 16) { - _aes._encryptSW(reinterpret_cast<const uint8_t *>(_ctr),keyStream); - for(int i=0;i<16;++i) - out[i] = in[i] ^ keyStream[i]; - out += 16; - len -= 16; - in += 16; - if (unlikely((_ctr[1] = Utils::hton(Utils::ntoh(_ctr[1]) + 1ULL)) == 0)) _ctr[0] = Utils::hton(Utils::ntoh(_ctr[0]) + 1ULL); +#ifdef ZT_NO_UNALIGNED_ACCESS + if ((((uintptr_t)out | (uintptr_t)in) & 7U) == 0) { // if aligned we can do XORs in quadwords instead of bytes +#endif + while (len >= 16) { + _aes._encryptSW(reinterpret_cast<const uint8_t *>(_ctr),reinterpret_cast<uint8_t *>(keyStream)); + reinterpret_cast<uint64_t *>(out)[0] = reinterpret_cast<const uint64_t *>(in)[0] ^ keyStream[0]; + reinterpret_cast<uint64_t *>(out)[1] = reinterpret_cast<const uint64_t *>(in)[1] ^ keyStream[1]; + out += 16; + len -= 16; + in += 16; + _ctr[1] = Utils::hton(Utils::ntoh(_ctr[1]) + 1ULL); + } +#ifdef ZT_NO_UNALIGNED_ACCESS + } else { + while (len >= 16) { + _aes._encryptSW(reinterpret_cast<const uint8_t *>(_ctr),reinterpret_cast<uint8_t *>(keyStream)); + for (int i = 0;i < 16;++i) + out[i] = in[i] ^ reinterpret_cast<uint8_t *>(keyStream)[i]; + out += 16; + len -= 16; + in += 16; + _ctr[1] = Utils::hton(Utils::ntoh(_ctr[1]) + 1ULL); + } } +#endif // Any remaining input is placed in _out.
This will be picked up and crypted // on subsequent calls to crypt() or finish() as it'll mean _len will not be @@ -898,29 +780,8 @@ void AES::CTR::finish() noexcept // Software AES and AES key expansion --------------------------------------------------------------------------------- -#ifdef ZT_NO_UNALIGNED_ACCESS -static ZT_INLINE uint32_t readuint32_t(const void *in) -{ - uint32_t v = ((const uint8_t *)in)[0]; - v <<= 8; - v |= ((const uint8_t *)in)[1]; - v <<= 8; - v |= ((const uint8_t *)in)[2]; - v <<= 8; - v |= ((const uint8_t *)in)[3]; - return v; -} -static ZT_INLINE void writeuint32_t(void *out,const uint32_t v) -{ - ((uint8_t *)out)[0] = (uint8_t)(v >> 24); - ((uint8_t *)out)[1] = (uint8_t)(v >> 16); - ((uint8_t *)out)[2] = (uint8_t)(v >> 8); - ((uint8_t *)out)[3] = (uint8_t)v; -} -#else -#define readuint32_t(i) (Utils::ntoh(*((const uint32_t *)(i)))) -#define writeuint32_t(o,v) (*((uint32_t *)(o)) = Utils::hton(v)) -#endif +#define readuint32_t(i) Utils::loadBigEndian<uint32_t>(i) +#define writeuint32_t(o,v) Utils::storeBigEndian((o),(uint32_t)(v)) const uint32_t AES::Te0[256] = { 0xc66363a5,0xf87c7c84,0xee777799,0xf67b7b8d,0xfff2f20d,0xd66b6bbd,0xde6f6fb1,0x91c5c554,0x60303050,0x02010103,0xce6767a9,0x562b2b7d,0xe7fefe19,0xb5d7d762,0x4dababe6,0xec76769a,0x8fcaca45,0x1f82829d,0x89c9c940,0xfa7d7d87,0xeffafa15,0xb25959eb,0x8e4747c9,0xfbf0f00b,0x41adadec,0xb3d4d467,0x5fa2a2fd,0x45afafea,0x239c9cbf,0x53a4a4f7,0xe4727296,0x9bc0c05b,0x75b7b7c2,0xe1fdfd1c,0x3d9393ae,0x4c26266a,0x6c36365a,0x7e3f3f41,0xf5f7f702,0x83cccc4f,0x6834345c,0x51a5a5f4,0xd1e5e534,0xf9f1f108,0xe2717193,0xabd8d873,0x62313153,0x2a15153f,0x0804040c,0x95c7c752,0x46232365,0x9dc3c35e,0x30181828,0x379696a1,0x0a05050f,0x2f9a9ab5,0x0e070709,0x24121236,0x1b80809b,0xdfe2e23d,0xcdebeb26,0x4e272769,0x7fb2b2cd,0xea75759f,0x1209091b,0x1d83839e,0x582c2c74,0x341a1a2e,0x361b1b2d,0xdc6e6eb2,0xb45a5aee,0x5ba0a0fb,0xa45252f6,0x763b3b4d,0xb7d6d661,0x7db3b3ce,0x5229297b,0xdde3e33e,0x5e2f2f71,0x13848497,0xa65353f5,0xb9d1d168,0x00000000,0xc1eded2c,0x40202060,0xe3fcfc1f,0x79b1b1c8,0xb65b5bed,0xd46a6abe,0x8dcbcb46,0x67bebed9,0x7239394b,0x944a4ade,0x984c4cd4,0xb05858e8,0x85cfcf4a,0xbbd0d06b,0xc5efef2a,0x4faaaae5,0xedfbfb16,0x864343c5,0x9a4d4dd7,0x66333355,0x11858594,0x8a4545cf,0xe9f9f910,0x04020206,0xfe7f7f81,0xa05050f0,0x783c3c44,0x259f9fba,0x4ba8a8e3,0xa25151f3,0x5da3a3fe,0x804040c0,0x058f8f8a,0x3f9292ad,0x219d9dbc,0x70383848,0xf1f5f504,0x63bcbcdf,0x77b6b6c1,0xafdada75,0x42212163,0x20101030,0xe5ffff1a,0xfdf3f30e,0xbfd2d26d,0x81cdcd4c,0x180c0c14,0x26131335,0xc3ecec2f,0xbe5f5fe1,0x359797a2,0x884444cc,0x2e171739,0x93c4c457,0x55a7a7f2,0xfc7e7e82,0x7a3d3d47,0xc86464ac,0xba5d5de7,0x3219192b,0xe6737395,0xc06060a0,0x19818198,0x9e4f4fd1,0xa3dcdc7f,0x44222266,0x542a2a7e,0x3b9090ab,0x0b888883,0x8c4646ca,0xc7eeee29,0x6bb8b8d3,0x2814143c,0xa7dede79,0xbc5e5ee2,0x160b0b1d,0xaddbdb76,0xdbe0e03b,0x64323256,0x743a3a4e,0x140a0a1e,0x924949db,0x0c06060a,0x4824246c,0xb85c5ce4,0x9fc2c25d,0xbdd3d36e,0x43acacef,0xc46262a6,0x399191a8,0x319595a4,0xd3e4e437,0xf279798b,0xd5e7e732,0x8bc8c843,0x6e373759,0xda6d6db7,0x018d8d8c,0xb1d5d564,0x9c4e4ed2,0x49a9a9e0,0xd86c6cb4,0xac5656fa,0xf3f4f407,0xcfeaea25,0xca6565af,0xf47a7a8e,0x47aeaee9,0x10080818,0x6fbabad5,0xf0787888,0x4a25256f,0x5c2e2e72,0x381c1c24,0x57a6a6f1,0x73b4b4c7,0x97c6c651,0xcbe8e823,0xa1dddd7c,0xe874749c,0x3e1f1f21,0x964b4bdd,0x61bdbddc,0x0d8b8b86,0x0f8a8a85,0xe0707090,0x7c3e3e42,0x71b5b5c4,0xcc6666aa,0x904848d8,0x06030305,0xf7f6f601,0x1c0e0e12,0xc26161a3,0x6a35355f,0xae5757f9,0x69b9b9d0,0x17868691,0x99c1c158,0x3a1d1d27,0x2
79e9eb9,0xd9e1e138,0xebf8f813,0x2b9898b3,0x22111133,0xd26969bb,0xa9d9d970,0x078e8e89,0x339494a7,0x2d9b9bb6,0x3c1e1e22,0x15878792,0xc9e9e920,0x87cece49,0xaa5555ff,0x50282878,0xa5dfdf7a,0x038c8c8f,0x59a1a1f8,0x09898980,0x1a0d0d17,0x65bfbfda,0xd7e6e631,0x844242c6,0xd06868b8,0x824141c3,0x299999b0,0x5a2d2d77,0x1e0f0f11,0x7bb0b0cb,0xa85454fc,0x6dbbbbd6,0x2c16163a }; const uint32_t AES::Te1[256] = { 0xa5c66363,0x84f87c7c,0x99ee7777,0x8df67b7b,0x0dfff2f2,0xbdd66b6b,0xb1de6f6f,0x5491c5c5,0x50603030,0x03020101,0xa9ce6767,0x7d562b2b,0x19e7fefe,0x62b5d7d7,0xe64dabab,0x9aec7676,0x458fcaca,0x9d1f8282,0x4089c9c9,0x87fa7d7d,0x15effafa,0xebb25959,0xc98e4747,0x0bfbf0f0,0xec41adad,0x67b3d4d4,0xfd5fa2a2,0xea45afaf,0xbf239c9c,0xf753a4a4,0x96e47272,0x5b9bc0c0,0xc275b7b7,0x1ce1fdfd,0xae3d9393,0x6a4c2626,0x5a6c3636,0x417e3f3f,0x02f5f7f7,0x4f83cccc,0x5c683434,0xf451a5a5,0x34d1e5e5,0x08f9f1f1,0x93e27171,0x73abd8d8,0x53623131,0x3f2a1515,0x0c080404,0x5295c7c7,0x65462323,0x5e9dc3c3,0x28301818,0xa1379696,0x0f0a0505,0xb52f9a9a,0x090e0707,0x36241212,0x9b1b8080,0x3ddfe2e2,0x26cdebeb,0x694e2727,0xcd7fb2b2,0x9fea7575,0x1b120909,0x9e1d8383,0x74582c2c,0x2e341a1a,0x2d361b1b,0xb2dc6e6e,0xeeb45a5a,0xfb5ba0a0,0xf6a45252,0x4d763b3b,0x61b7d6d6,0xce7db3b3,0x7b522929,0x3edde3e3,0x715e2f2f,0x97138484,0xf5a65353,0x68b9d1d1,0x00000000,0x2cc1eded,0x60402020,0x1fe3fcfc,0xc879b1b1,0xedb65b5b,0xbed46a6a,0x468dcbcb,0xd967bebe,0x4b723939,0xde944a4a,0xd4984c4c,0xe8b05858,0x4a85cfcf,0x6bbbd0d0,0x2ac5efef,0xe54faaaa,0x16edfbfb,0xc5864343,0xd79a4d4d,0x55663333,0x94118585,0xcf8a4545,0x10e9f9f9,0x06040202,0x81fe7f7f,0xf0a05050,0x44783c3c,0xba259f9f,0xe34ba8a8,0xf3a25151,0xfe5da3a3,0xc0804040,0x8a058f8f,0xad3f9292,0xbc219d9d,0x48703838,0x04f1f5f5,0xdf63bcbc,0xc177b6b6,0x75afdada,0x63422121,0x30201010,0x1ae5ffff,0x0efdf3f3,0x6dbfd2d2,0x4c81cdcd,0x14180c0c,0x35261313,0x2fc3ecec,0xe1be5f5f,0xa2359797,0xcc884444,0x392e1717,0x5793c4c4,0xf255a7a7,0x82fc7e7e,0x477a3d3d,0xacc86464,0xe7ba5d5d,0x2b321919,0x95e67373,0xa0c06060,0x98198181,0xd19e4f4f,0x7fa3dcdc,0x66442222,0x7e542a2a,0xab3b9090,0x830b8888,0xca8c4646,0x29c7eeee,0xd36bb8b8,0x3c281414,0x79a7dede,0xe2bc5e5e,0x1d160b0b,0x76addbdb,0x3bdbe0e0,0x56643232,0x4e743a3a,0x1e140a0a,0xdb924949,0x0a0c0606,0x6c482424,0xe4b85c5c,0x5d9fc2c2,0x6ebdd3d3,0xef43acac,0xa6c46262,0xa8399191,0xa4319595,0x37d3e4e4,0x8bf27979,0x32d5e7e7,0x438bc8c8,0x596e3737,0xb7da6d6d,0x8c018d8d,0x64b1d5d5,0xd29c4e4e,0xe049a9a9,0xb4d86c6c,0xfaac5656,0x07f3f4f4,0x25cfeaea,0xafca6565,0x8ef47a7a,0xe947aeae,0x18100808,0xd56fbaba,0x88f07878,0x6f4a2525,0x725c2e2e,0x24381c1c,0xf157a6a6,0xc773b4b4,0x5197c6c6,0x23cbe8e8,0x7ca1dddd,0x9ce87474,0x213e1f1f,0xdd964b4b,0xdc61bdbd,0x860d8b8b,0x850f8a8a,0x90e07070,0x427c3e3e,0xc471b5b5,0xaacc6666,0xd8904848,0x05060303,0x01f7f6f6,0x121c0e0e,0xa3c26161,0x5f6a3535,0xf9ae5757,0xd069b9b9,0x91178686,0x5899c1c1,0x273a1d1d,0xb9279e9e,0x38d9e1e1,0x13ebf8f8,0xb32b9898,0x33221111,0xbbd26969,0x70a9d9d9,0x89078e8e,0xa7339494,0xb62d9b9b,0x223c1e1e,0x92158787,0x20c9e9e9,0x4987cece,0xffaa5555,0x78502828,0x7aa5dfdf,0x8f038c8c,0xf859a1a1,0x80098989,0x171a0d0d,0xda65bfbf,0x31d7e6e6,0xc6844242,0xb8d06868,0xc3824141,0xb0299999,0x775a2d2d,0x111e0f0f,0xcb7bb0b0,0xfca85454,0xd66dbbbb,0x3a2c1616 }; diff --git a/node/AES.hpp b/node/AES.hpp index f86db33d3..8f83ced5d 100644 --- a/node/AES.hpp +++ b/node/AES.hpp @@ -259,7 +259,10 @@ public: */ ZT_INLINE void init(const uint64_t iv,void *const output) noexcept { + // Output buffer to receive the result of AES-CTR encryption. 
_output = output; + + // Initialize GMAC with 64-bit IV (and remaining 32 bits padded to zero). _iv[0] = iv; _iv[1] = 0; _gmac.init(reinterpret_cast<const uint8_t *>(_iv)); @@ -276,7 +279,11 @@ public: */ ZT_INLINE void aad(const void *const aad,unsigned int len) noexcept { + // Feed AAD into GMAC first. _gmac.update(aad,len); + + // End of AAD is padded to a multiple of 16 bytes to ensure unique encoding vs. plaintext. + // AES-GCM-SIV does this as well for the same reason. len &= 0xfU; if (len != 0) _gmac.update(Utils::ZERO256,16 - len); @@ -298,11 +305,23 @@ public: */ ZT_INLINE void finish1() noexcept { + // Compute the GMAC tag, then encrypt the original 64-bit IV and the first 64 bits + // of the GMAC tag with AES (single block) and use this to initialize AES-CTR. uint64_t gmacTag[2]; _gmac.finish(reinterpret_cast<uint8_t *>(gmacTag)); _iv[1] = gmacTag[0]; _ctr._aes.encrypt(_iv,_iv); - _ctr.init(reinterpret_cast<const uint8_t *>(_iv),_output); + + // Bit 31 of the least significant 32 bits of the CTR IV is masked to zero to + // (1) allow us to optimize by ignoring integer overflow for messages under + // 2^31 bytes (far larger than this system's max message size), and (2) ensure + // interoperability with any future FIPS-compliant or other cryptographic + // libraries that may or may not carry overflow of the least significant 32 + // bits of the counter into the upper bits in the expected way. + uint64_t ctrIv[2]; + ctrIv[0] = _iv[0]; + ctrIv[1] = _iv[1] & ZT_CONST_TO_BE_UINT64(0xffffffff7fffffffULL); + _ctr.init(reinterpret_cast<const uint8_t *>(ctrIv),_output); } /**
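Note on change (1): with bit 31 of the low 32 counter bits forced to zero, fewer than 2^31 block increments can never carry out of the low 32 bits, so a 32-bit counter and a 64-bit counter walk through identical values and the carry-into-c0 handling removed above becomes dead code. A minimal standalone sketch of that argument (illustrative only, not part of the patch; it works on native-order integers, whereas the patched code converts with hton/ntoh first, but the carry argument is the same):

#include <cassert>
#include <cstdint>

int main()
{
    // Worst case permitted by the mask: low 32 bits at their maximum with bit 31 clear.
    const uint64_t start = 0xdeadbeef7fffffffULL;
    const uint64_t blocks = 0x7fffffffULL; // any block count < 2^31

    // 64-bit increment, as the simplified CTR code now does with c1.
    const uint64_t ctr64 = start + blocks;

    // 32-bit increment of only the low word, as some other CTR implementations do.
    const uint64_t ctr32 = (start & 0xffffffff00000000ULL) | (uint32_t)((uint32_t)start + (uint32_t)blocks);

    // With bit 31 clear there is never a carry out of the low 32 bits, so the two
    // counting styles agree and 64-bit overflow of the low half is impossible.
    assert(ctr64 == ctr32);
    return 0;
}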
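Note on change (3): the software fallback now applies the 16-byte keystream as two 64-bit XORs where pointers allow it. A condensed sketch of that fast path (xorKeystream is a hypothetical helper, not the patch's code; the patch compiles the runtime alignment test only when ZT_NO_UNALIGNED_ACCESS is defined and otherwise always takes the quadword path):

#include <cstdint>

static void xorKeystream(uint8_t *out, const uint8_t *in, const uint64_t ks[2])
{
    if ((((uintptr_t)out | (uintptr_t)in) & 7U) == 0) {
        // Both pointers 8-byte aligned: two quadword XORs instead of sixteen byte XORs.
        reinterpret_cast<uint64_t *>(out)[0] = reinterpret_cast<const uint64_t *>(in)[0] ^ ks[0];
        reinterpret_cast<uint64_t *>(out)[1] = reinterpret_cast<const uint64_t *>(in)[1] ^ ks[1];
    } else {
        // Unaligned: fall back to byte-at-a-time XOR.
        for (int i = 0; i < 16; ++i)
            out[i] = in[i] ^ reinterpret_cast<const uint8_t *>(ks)[i];
    }
}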
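Note on the aad() comment: zero-padding the AAD to the next 16-byte boundary keeps AAD bytes and plaintext-derived bytes from ever sharing a GHASH block, so the (AAD, message) pair has a unique encoding. The rule reduces to the following sketch (illustrative free function templated over any MAC with an update(ptr,len) interface, mirroring the patched aad() above):

#include <cstdint>

static const uint8_t ZEROES[16] = {0};

template<typename MAC>
void absorbAAD(MAC &mac, const void *aad, unsigned int len)
{
    mac.update(aad, len);              // feed the AAD itself
    const unsigned int rem = len & 0xfU;
    if (rem != 0)
        mac.update(ZEROES, 16 - rem);  // zero-pad to a 16-byte boundary
}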