Mirror of https://github.com/zerotier/ZeroTierOne.git (synced 2025-06-05 20:13:44 +02:00)
(1) Mask bit 31 of AES-CTR nonce to disallow uint32 overflow, (2) get rid of handling of uint32/uint64 overflow in our AES-CTR code to optimize, (3) optimize AES software a bit
parent 2d862f1d32
commit d151d731a6

2 changed files with 165 additions and 285 deletions
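Change (1) is what makes change (2) safe: once bit 31 of the counter's least significant 32 bits is cleared at initialization, at least 2^31 block increments fit before those 32 bits can wrap, so the overflow branches deleted below become unreachable for any message this system can send. A minimal standalone sketch of that argument (not ZeroTier code; GCC/Clang's __builtin_bswap64 stands in for Utils::hton/ntoh):

#include <cstdint>
#include <cstdio>

int main()
{
    // Worst-case low counter word, big-endian in memory, with the commit's
    // mask ZT_CONST_TO_BE_UINT64(0xffffffff7fffffffULL) applied to it.
    const uint64_t ivBE = __builtin_bswap64(0xffffffffffffffffULL);
    const uint64_t maskedBE = ivBE & __builtin_bswap64(0xffffffff7fffffffULL);
    // The counter's least significant 32 bits now start at or below
    // 0x7fffffff, leaving at least 2^31 increments before they can wrap.
    printf("low 32 bits after mask: 0x%08x\n",(uint32_t)__builtin_bswap64(maskedBE));
    return 0;
}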
node/AES.cpp (429 changed lines)
@@ -535,266 +535,132 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept

out += totalLen;
_len = (totalLen + len);

-if (likely((c1 + len) > c1)) { // if this is true we can just increment c1 and ignore c0
while (len >= 64) {
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
__m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
__m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
__m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
c1 += 4;
d0 = _mm_xor_si128(d0,k0);
d1 = _mm_xor_si128(d1,k0);
d2 = _mm_xor_si128(d2,k0);
d3 = _mm_xor_si128(d3,k0);
d0 = _mm_aesenc_si128(d0,k1);
d1 = _mm_aesenc_si128(d1,k1);
d2 = _mm_aesenc_si128(d2,k1);
d3 = _mm_aesenc_si128(d3,k1);
__m128i ka = k[6];
d0 = _mm_aesenc_si128(d0,k2);
d1 = _mm_aesenc_si128(d1,k2);
d2 = _mm_aesenc_si128(d2,k2);
d3 = _mm_aesenc_si128(d3,k2);
__m128i kb = k[7];
d0 = _mm_aesenc_si128(d0,k3);
d1 = _mm_aesenc_si128(d1,k3);
d2 = _mm_aesenc_si128(d2,k3);
d3 = _mm_aesenc_si128(d3,k3);
__m128i kc = k[8];
d0 = _mm_aesenc_si128(d0,k4);
d1 = _mm_aesenc_si128(d1,k4);
d2 = _mm_aesenc_si128(d2,k4);
d3 = _mm_aesenc_si128(d3,k4);
__m128i kd = k[9];
d0 = _mm_aesenc_si128(d0,k5);
d1 = _mm_aesenc_si128(d1,k5);
d2 = _mm_aesenc_si128(d2,k5);
d3 = _mm_aesenc_si128(d3,k5);
__m128i ke = k[10];
d0 = _mm_aesenc_si128(d0,ka);
d1 = _mm_aesenc_si128(d1,ka);
d2 = _mm_aesenc_si128(d2,ka);
d3 = _mm_aesenc_si128(d3,ka);
__m128i kf = k[11];
d0 = _mm_aesenc_si128(d0,kb);
d1 = _mm_aesenc_si128(d1,kb);
d2 = _mm_aesenc_si128(d2,kb);
d3 = _mm_aesenc_si128(d3,kb);
ka = k[12];
d0 = _mm_aesenc_si128(d0,kc);
d1 = _mm_aesenc_si128(d1,kc);
d2 = _mm_aesenc_si128(d2,kc);
d3 = _mm_aesenc_si128(d3,kc);
kb = k[13];
d0 = _mm_aesenc_si128(d0,kd);
d1 = _mm_aesenc_si128(d1,kd);
d2 = _mm_aesenc_si128(d2,kd);
d3 = _mm_aesenc_si128(d3,kd);
kc = k[14];
d0 = _mm_aesenc_si128(d0,ke);
d1 = _mm_aesenc_si128(d1,ke);
d2 = _mm_aesenc_si128(d2,ke);
d3 = _mm_aesenc_si128(d3,ke);
kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
d0 = _mm_aesenc_si128(d0,kf);
d1 = _mm_aesenc_si128(d1,kf);
d2 = _mm_aesenc_si128(d2,kf);
d3 = _mm_aesenc_si128(d3,kf);
ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
d0 = _mm_aesenc_si128(d0,ka);
d1 = _mm_aesenc_si128(d1,ka);
d2 = _mm_aesenc_si128(d2,ka);
d3 = _mm_aesenc_si128(d3,ka);
kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
d0 = _mm_aesenc_si128(d0,kb);
d1 = _mm_aesenc_si128(d1,kb);
d2 = _mm_aesenc_si128(d2,kb);
d3 = _mm_aesenc_si128(d3,kb);
ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
d0 = _mm_aesenclast_si128(d0,kc);
d1 = _mm_aesenclast_si128(d1,kc);
d2 = _mm_aesenclast_si128(d2,kc);
d3 = _mm_aesenclast_si128(d3,kc);
kd = _mm_xor_si128(d0,kd);
ke = _mm_xor_si128(d1,ke);
kf = _mm_xor_si128(d2,kf);
ka = _mm_xor_si128(d3,ka);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
in += 64;
len -= 64;
out += 64;
}
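The reason the loop above processes four blocks per iteration: _mm_aesenc_si128 has several cycles of latency but pipelined throughput on most cores, so four independent counter blocks keep the AES unit saturated. A reduced illustration of the same scheduling, assuming rk[] already holds the 15 expanded AES-256 round keys (compile with -maes; a sketch, not the production loop):

#include <immintrin.h>

static void aes256Ctr4(const __m128i ctr[4],const __m128i rk[15],__m128i keystream[4])
{
    // Whitening: XOR each counter block with round key 0.
    __m128i d0 = _mm_xor_si128(ctr[0],rk[0]);
    __m128i d1 = _mm_xor_si128(ctr[1],rk[0]);
    __m128i d2 = _mm_xor_si128(ctr[2],rk[0]);
    __m128i d3 = _mm_xor_si128(ctr[3],rk[0]);
    // The 13 middle rounds of AES-256, interleaved so the blocks pipeline.
    for(int r=1;r<14;++r) {
        d0 = _mm_aesenc_si128(d0,rk[r]);
        d1 = _mm_aesenc_si128(d1,rk[r]);
        d2 = _mm_aesenc_si128(d2,rk[r]);
        d3 = _mm_aesenc_si128(d3,rk[r]);
    }
    // Final round.
    keystream[0] = _mm_aesenclast_si128(d0,rk[14]);
    keystream[1] = _mm_aesenclast_si128(d1,rk[14]);
    keystream[2] = _mm_aesenclast_si128(d2,rk[14]);
    keystream[3] = _mm_aesenclast_si128(d3,rk[14]);
}

The production loop additionally rotates round keys through spare registers (ka..kf) and overlaps the input loads with the last rounds, scheduling the compiler cannot always be trusted to find on its own.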

if (len >= 16) {
const __m128i k7 = k[7];
const __m128i k8 = k[8];
const __m128i k9 = k[9];
const __m128i k10 = k[10];
const __m128i k11 = k[11];
const __m128i k12 = k[12];
const __m128i k13 = k[13];
const __m128i k14 = k[14];
do {
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
d0 = _mm_xor_si128(d0,k0);
d0 = _mm_aesenc_si128(d0,k1);
d0 = _mm_aesenc_si128(d0,k2);
d0 = _mm_aesenc_si128(d0,k3);
d0 = _mm_aesenc_si128(d0,k4);
d0 = _mm_aesenc_si128(d0,k5);
d0 = _mm_aesenc_si128(d0,k[6]);
d0 = _mm_aesenc_si128(d0,k7);
d0 = _mm_aesenc_si128(d0,k8);
d0 = _mm_aesenc_si128(d0,k9);
d0 = _mm_aesenc_si128(d0,k10);
d0 = _mm_aesenc_si128(d0,k11);
d0 = _mm_aesenc_si128(d0,k12);
d0 = _mm_aesenc_si128(d0,k13);
d0 = _mm_aesenclast_si128(d0,k14);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
in += 16;
len -= 16;
out += 16;
} while (len >= 16);
}

-} else { // in the unlikely case c1 is near uint64_max, we must add with carry
-while (len >= 64) {
-__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
-if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-__m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
-if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-__m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
-if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-__m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
-if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-// ... the same four-block AES-256 rounds and stores as in the loop above ...
-in += 64;
-len -= 64;
-out += 64;
-}
-if (len >= 16) {
-// ... the same k7..k14 locals as in the remainder loop above ...
-do {
-__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
-if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
-// ... the same single-block AES-256 rounds and store as above ...
-in += 16;
-len -= 16;
-out += 16;
-} while (len >= 16);
-}
-}
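For reference, what the deleted branch was doing is a 128-bit big-endian increment with carry propagation. A standalone sketch of that removed technique (again with __builtin_bswap64 standing in for Utils::hton/ntoh):

#include <cstdint>

// ctr[0] is the high 64 bits, ctr[1] the low 64 bits, both big-endian in memory.
static void ctr128Increment(uint64_t ctr[2])
{
    const uint64_t low = __builtin_bswap64(ctr[1]) + 1ULL;
    ctr[1] = __builtin_bswap64(low);
    if (low == 0) // carry out of the low 64-bit word
        ctr[0] = __builtin_bswap64(__builtin_bswap64(ctr[0]) + 1ULL);
}

With bit 31 masked at initialization this carry can never occur within a single message, which is why the whole branch could go.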

// Any remaining input is placed in _out. This will be picked up and crypted
@@ -811,7 +677,7 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
}
#endif

-uint8_t keyStream[16];
+uint64_t keyStream[2];

unsigned int totalLen = _len;
if ((totalLen & 15U)) {
@@ -823,11 +689,11 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
--len;
out[totalLen++] = *(in++);
if (!(totalLen & 15U)) {
-_aes._encryptSW(reinterpret_cast<const uint8_t *>(_ctr),keyStream);
+_aes._encryptSW(reinterpret_cast<const uint8_t *>(_ctr),reinterpret_cast<uint8_t *>(keyStream));
uint8_t *outblk = out + (totalLen - 16);
for(int i=0;i<16;++i)
-outblk[i] ^= keyStream[i];
+outblk[i] ^= reinterpret_cast<uint8_t *>(keyStream)[i];
-if (unlikely((_ctr[1] = Utils::hton(Utils::ntoh(_ctr[1]) + 1ULL)) == 0)) _ctr[0] = Utils::hton(Utils::ntoh(_ctr[0]) + 1ULL);
+_ctr[1] = Utils::hton(Utils::ntoh(_ctr[1]) + 1ULL);
break;
}
}
@@ -836,15 +702,31 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
out += totalLen;
_len = (totalLen + len);

-while (len >= 16) {
-_aes._encryptSW(reinterpret_cast<const uint8_t *>(_ctr),keyStream);
-for(int i=0;i<16;++i)
-out[i] = in[i] ^ keyStream[i];
-out += 16;
-len -= 16;
-in += 16;
-if (unlikely((_ctr[1] = Utils::hton(Utils::ntoh(_ctr[1]) + 1ULL)) == 0)) _ctr[0] = Utils::hton(Utils::ntoh(_ctr[0]) + 1ULL);
-}
+#ifdef ZT_NO_UNALIGNED_ACCESS
+if ((((uintptr_t)out | (uintptr_t)in) & 7U) == 0) { // if aligned we can do XORs in quadwords instead of bytes
+#endif
+while (len >= 16) {
+_aes._encryptSW(reinterpret_cast<const uint8_t *>(_ctr),reinterpret_cast<uint8_t *>(keyStream));
+reinterpret_cast<uint64_t *>(out)[0] = reinterpret_cast<const uint64_t *>(in)[0] ^ keyStream[0];
+reinterpret_cast<uint64_t *>(out)[1] = reinterpret_cast<const uint64_t *>(in)[1] ^ keyStream[1];
+out += 16;
+len -= 16;
+in += 16;
+_ctr[1] = Utils::hton(Utils::ntoh(_ctr[1]) + 1ULL);
+}
+#ifdef ZT_NO_UNALIGNED_ACCESS
+} else {
+while (len >= 16) {
+_aes._encryptSW(reinterpret_cast<const uint8_t *>(_ctr),reinterpret_cast<uint8_t *>(keyStream));
+for (int i = 0;i < 16;++i)
+out[i] = in[i] ^ reinterpret_cast<uint8_t *>(keyStream)[i];
+out += 16;
+len -= 16;
+in += 16;
+_ctr[1] = Utils::hton(Utils::ntoh(_ctr[1]) + 1ULL);
+}
+}
+#endif

// Any remaining input is placed in _out. This will be picked up and crypted
// on subsequent calls to crypt() or finish() as it'll mean _len will not be
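The point of the new branch above: when both pointers are 8-byte aligned (or the target tolerates unaligned word access), the 16-byte keystream XOR collapses from sixteen byte operations to two 64-bit ones. A portable sketch of the same idea using memcpy instead of the reinterpret_casts (memcpy compiles to plain loads and stores here and is legal at any alignment):

#include <cstdint>
#include <cstring>

static void xorBlock(uint8_t *out,const uint8_t *in,const uint64_t keyStream[2])
{
    uint64_t a,b;
    std::memcpy(&a,in,8);
    std::memcpy(&b,in + 8,8);
    a ^= keyStream[0];
    b ^= keyStream[1];
    std::memcpy(out,&a,8);
    std::memcpy(out + 8,&b,8);
}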
@@ -898,29 +780,8 @@ void AES::CTR::finish() noexcept

// Software AES and AES key expansion ---------------------------------------------------------------------------------

-#ifdef ZT_NO_UNALIGNED_ACCESS
-static ZT_INLINE uint32_t readuint32_t(const void *in)
-{
-uint32_t v = ((const uint8_t *)in)[0];
-v <<= 8;
-v |= ((const uint8_t *)in)[1];
-v <<= 8;
-v |= ((const uint8_t *)in)[2];
-v <<= 8;
-v |= ((const uint8_t *)in)[3];
-return v;
-}
-static ZT_INLINE void writeuint32_t(void *out,const uint32_t v)
-{
-((uint8_t *)out)[0] = (uint8_t)(v >> 24);
-((uint8_t *)out)[1] = (uint8_t)(v >> 16);
-((uint8_t *)out)[2] = (uint8_t)(v >> 8);
-((uint8_t *)out)[3] = (uint8_t)v;
-}
-#else
-#define readuint32_t(i) (Utils::ntoh(*((const uint32_t *)(i))))
-#define writeuint32_t(o,v) (*((uint32_t *)(o)) = Utils::hton(v))
-#endif
+#define readuint32_t(i) Utils::loadBigEndian<uint32_t>(i)
+#define writeuint32_t(o,v) Utils::storeBigEndian((o),(uint32_t)(v))
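The two new macros delegate to Utils helpers whose assumed contract matches the deleted functions exactly: a big-endian (most significant byte first) load and store that work at any alignment. A sketch of that contract (hypothetical stand-ins, not the actual Utils implementation):

#include <cstdint>

template<typename T>
static T loadBigEndian(const void *const p) noexcept
{
    const uint8_t *const b = reinterpret_cast<const uint8_t *>(p);
    T v = 0;
    for(unsigned int i=0;i<sizeof(T);++i)
        v = (T)((v << 8) | b[i]);
    return v;
}

static void storeBigEndian(void *const p,const uint32_t v) noexcept
{
    uint8_t *const b = reinterpret_cast<uint8_t *>(p);
    b[0] = (uint8_t)(v >> 24);
    b[1] = (uint8_t)(v >> 16);
    b[2] = (uint8_t)(v >> 8);
    b[3] = (uint8_t)v;
}

Centralizing this in Utils removes the ZT_NO_UNALIGNED_ACCESS special case from AES.cpp and lets one well-tested helper pick the fast path per platform.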
const uint32_t AES::Te0[256] = { 0xc66363a5,0xf87c7c84,0xee777799,0xf67b7b8d,0xfff2f20d,0xd66b6bbd,0xde6f6fb1,0x91c5c554,0x60303050,0x02010103,0xce6767a9,0x562b2b7d,0xe7fefe19,0xb5d7d762,0x4dababe6,0xec76769a,0x8fcaca45,0x1f82829d,0x89c9c940,0xfa7d7d87,0xeffafa15,0xb25959eb,0x8e4747c9,0xfbf0f00b,0x41adadec,0xb3d4d467,0x5fa2a2fd,0x45afafea,0x239c9cbf,0x53a4a4f7,0xe4727296,0x9bc0c05b,0x75b7b7c2,0xe1fdfd1c,0x3d9393ae,0x4c26266a,0x6c36365a,0x7e3f3f41,0xf5f7f702,0x83cccc4f,0x6834345c,0x51a5a5f4,0xd1e5e534,0xf9f1f108,0xe2717193,0xabd8d873,0x62313153,0x2a15153f,0x0804040c,0x95c7c752,0x46232365,0x9dc3c35e,0x30181828,0x379696a1,0x0a05050f,0x2f9a9ab5,0x0e070709,0x24121236,0x1b80809b,0xdfe2e23d,0xcdebeb26,0x4e272769,0x7fb2b2cd,0xea75759f,0x1209091b,0x1d83839e,0x582c2c74,0x341a1a2e,0x361b1b2d,0xdc6e6eb2,0xb45a5aee,0x5ba0a0fb,0xa45252f6,0x763b3b4d,0xb7d6d661,0x7db3b3ce,0x5229297b,0xdde3e33e,0x5e2f2f71,0x13848497,0xa65353f5,0xb9d1d168,0x00000000,0xc1eded2c,0x40202060,0xe3fcfc1f,0x79b1b1c8,0xb65b5bed,0xd46a6abe,0x8dcbcb46,0x67bebed9,0x7239394b,0x944a4ade,0x984c4cd4,0xb05858e8,0x85cfcf4a,0xbbd0d06b,0xc5efef2a,0x4faaaae5,0xedfbfb16,0x864343c5,0x9a4d4dd7,0x66333355,0x11858594,0x8a4545cf,0xe9f9f910,0x04020206,0xfe7f7f81,0xa05050f0,0x783c3c44,0x259f9fba,0x4ba8a8e3,0xa25151f3,0x5da3a3fe,0x804040c0,0x058f8f8a,0x3f9292ad,0x219d9dbc,0x70383848,0xf1f5f504,0x63bcbcdf,0x77b6b6c1,0xafdada75,0x42212163,0x20101030,0xe5ffff1a,0xfdf3f30e,0xbfd2d26d,0x81cdcd4c,0x180c0c14,0x26131335,0xc3ecec2f,0xbe5f5fe1,0x359797a2,0x884444cc,0x2e171739,0x93c4c457,0x55a7a7f2,0xfc7e7e82,0x7a3d3d47,0xc86464ac,0xba5d5de7,0x3219192b,0xe6737395,0xc06060a0,0x19818198,0x9e4f4fd1,0xa3dcdc7f,0x44222266,0x542a2a7e,0x3b9090ab,0x0b888883,0x8c4646ca,0xc7eeee29,0x6bb8b8d3,0x2814143c,0xa7dede79,0xbc5e5ee2,0x160b0b1d,0xaddbdb76,0xdbe0e03b,0x64323256,0x743a3a4e,0x140a0a1e,0x924949db,0x0c06060a,0x4824246c,0xb85c5ce4,0x9fc2c25d,0xbdd3d36e,0x43acacef,0xc46262a6,0x399191a8,0x319595a4,0xd3e4e437,0xf279798b,0xd5e7e732,0x8bc8c843,0x6e373759,0xda6d6db7,0x018d8d8c,0xb1d5d564,0x9c4e4ed2,0x49a9a9e0,0xd86c6cb4,0xac5656fa,0xf3f4f407,0xcfeaea25,0xca6565af,0xf47a7a8e,0x47aeaee9,0x10080818,0x6fbabad5,0xf0787888,0x4a25256f,0x5c2e2e72,0x381c1c24,0x57a6a6f1,0x73b4b4c7,0x97c6c651,0xcbe8e823,0xa1dddd7c,0xe874749c,0x3e1f1f21,0x964b4bdd,0x61bdbddc,0x0d8b8b86,0x0f8a8a85,0xe0707090,0x7c3e3e42,0x71b5b5c4,0xcc6666aa,0x904848d8,0x06030305,0xf7f6f601,0x1c0e0e12,0xc26161a3,0x6a35355f,0xae5757f9,0x69b9b9d0,0x17868691,0x99c1c158,0x3a1d1d27,0x279e9eb9,0xd9e1e138,0xebf8f813,0x2b9898b3,0x22111133,0xd26969bb,0xa9d9d970,0x078e8e89,0x339494a7,0x2d9b9bb6,0x3c1e1e22,0x15878792,0xc9e9e920,0x87cece49,0xaa5555ff,0x50282878,0xa5dfdf7a,0x038c8c8f,0x59a1a1f8,0x09898980,0x1a0d0d17,0x65bfbfda,0xd7e6e631,0x844242c6,0xd06868b8,0x824141c3,0x299999b0,0x5a2d2d77,0x1e0f0f11,0x7bb0b0cb,0xa85454fc,0x6dbbbbd6,0x2c16163a };
const uint32_t AES::Te1[256] = { 0xa5c66363,0x84f87c7c,0x99ee7777,0x8df67b7b,0x0dfff2f2,0xbdd66b6b,0xb1de6f6f,0x5491c5c5,0x50603030,0x03020101,0xa9ce6767,0x7d562b2b,0x19e7fefe,0x62b5d7d7,0xe64dabab,0x9aec7676,0x458fcaca,0x9d1f8282,0x4089c9c9,0x87fa7d7d,0x15effafa,0xebb25959,0xc98e4747,0x0bfbf0f0,0xec41adad,0x67b3d4d4,0xfd5fa2a2,0xea45afaf,0xbf239c9c,0xf753a4a4,0x96e47272,0x5b9bc0c0,0xc275b7b7,0x1ce1fdfd,0xae3d9393,0x6a4c2626,0x5a6c3636,0x417e3f3f,0x02f5f7f7,0x4f83cccc,0x5c683434,0xf451a5a5,0x34d1e5e5,0x08f9f1f1,0x93e27171,0x73abd8d8,0x53623131,0x3f2a1515,0x0c080404,0x5295c7c7,0x65462323,0x5e9dc3c3,0x28301818,0xa1379696,0x0f0a0505,0xb52f9a9a,0x090e0707,0x36241212,0x9b1b8080,0x3ddfe2e2,0x26cdebeb,0x694e2727,0xcd7fb2b2,0x9fea7575,0x1b120909,0x9e1d8383,0x74582c2c,0x2e341a1a,0x2d361b1b,0xb2dc6e6e,0xeeb45a5a,0xfb5ba0a0,0xf6a45252,0x4d763b3b,0x61b7d6d6,0xce7db3b3,0x7b522929,0x3edde3e3,0x715e2f2f,0x97138484,0xf5a65353,0x68b9d1d1,0x00000000,0x2cc1eded,0x60402020,0x1fe3fcfc,0xc879b1b1,0xedb65b5b,0xbed46a6a,0x468dcbcb,0xd967bebe,0x4b723939,0xde944a4a,0xd4984c4c,0xe8b05858,0x4a85cfcf,0x6bbbd0d0,0x2ac5efef,0xe54faaaa,0x16edfbfb,0xc5864343,0xd79a4d4d,0x55663333,0x94118585,0xcf8a4545,0x10e9f9f9,0x06040202,0x81fe7f7f,0xf0a05050,0x44783c3c,0xba259f9f,0xe34ba8a8,0xf3a25151,0xfe5da3a3,0xc0804040,0x8a058f8f,0xad3f9292,0xbc219d9d,0x48703838,0x04f1f5f5,0xdf63bcbc,0xc177b6b6,0x75afdada,0x63422121,0x30201010,0x1ae5ffff,0x0efdf3f3,0x6dbfd2d2,0x4c81cdcd,0x14180c0c,0x35261313,0x2fc3ecec,0xe1be5f5f,0xa2359797,0xcc884444,0x392e1717,0x5793c4c4,0xf255a7a7,0x82fc7e7e,0x477a3d3d,0xacc86464,0xe7ba5d5d,0x2b321919,0x95e67373,0xa0c06060,0x98198181,0xd19e4f4f,0x7fa3dcdc,0x66442222,0x7e542a2a,0xab3b9090,0x830b8888,0xca8c4646,0x29c7eeee,0xd36bb8b8,0x3c281414,0x79a7dede,0xe2bc5e5e,0x1d160b0b,0x76addbdb,0x3bdbe0e0,0x56643232,0x4e743a3a,0x1e140a0a,0xdb924949,0x0a0c0606,0x6c482424,0xe4b85c5c,0x5d9fc2c2,0x6ebdd3d3,0xef43acac,0xa6c46262,0xa8399191,0xa4319595,0x37d3e4e4,0x8bf27979,0x32d5e7e7,0x438bc8c8,0x596e3737,0xb7da6d6d,0x8c018d8d,0x64b1d5d5,0xd29c4e4e,0xe049a9a9,0xb4d86c6c,0xfaac5656,0x07f3f4f4,0x25cfeaea,0xafca6565,0x8ef47a7a,0xe947aeae,0x18100808,0xd56fbaba,0x88f07878,0x6f4a2525,0x725c2e2e,0x24381c1c,0xf157a6a6,0xc773b4b4,0x5197c6c6,0x23cbe8e8,0x7ca1dddd,0x9ce87474,0x213e1f1f,0xdd964b4b,0xdc61bdbd,0x860d8b8b,0x850f8a8a,0x90e07070,0x427c3e3e,0xc471b5b5,0xaacc6666,0xd8904848,0x05060303,0x01f7f6f6,0x121c0e0e,0xa3c26161,0x5f6a3535,0xf9ae5757,0xd069b9b9,0x91178686,0x5899c1c1,0x273a1d1d,0xb9279e9e,0x38d9e1e1,0x13ebf8f8,0xb32b9898,0x33221111,0xbbd26969,0x70a9d9d9,0x89078e8e,0xa7339494,0xb62d9b9b,0x223c1e1e,0x92158787,0x20c9e9e9,0x4987cece,0xffaa5555,0x78502828,0x7aa5dfdf,0x8f038c8c,0xf859a1a1,0x80098989,0x171a0d0d,0xda65bfbf,0x31d7e6e6,0xc6844242,0xb8d06868,0xc3824141,0xb0299999,0x775a2d2d,0x111e0f0f,0xcb7bb0b0,0xfca85454,0xd66dbbbb,0x3a2c1616 };
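Te0 and Te1 above are the standard AES encryption T-tables: each entry fuses the S-box lookup with the MixColumns coefficients {02,01,01,03}, and Te1 is Te0 rotated right by 8 bits. A sketch that reproduces the first entries (only the first 16 S-box values are included here; the construction itself is the standard one):

#include <cstdint>
#include <cstdio>

static const uint8_t sbox16[16] = {
    0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,
    0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
};

static uint8_t xtime(const uint8_t x) // multiply by 2 in GF(2^8)
{
    return (uint8_t)((x << 1) ^ ((x >> 7) * 0x1b));
}

int main()
{
    for(int i=0;i<16;++i) {
        const uint8_t s = sbox16[i];
        const uint32_t te0 = ((uint32_t)xtime(s) << 24) | ((uint32_t)s << 16) | ((uint32_t)s << 8) | (uint32_t)(xtime(s) ^ s);
        const uint32_t te1 = (te0 >> 8) | (te0 << 24);
        printf("0x%08x 0x%08x\n",te0,te1); // 0xc66363a5 0xa5c66363 for i == 0
    }
    return 0;
}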

node/AES.hpp (21 changed lines)

@@ -259,7 +259,10 @@ public:
*/
ZT_INLINE void init(const uint64_t iv,void *const output) noexcept
{
+// Output buffer to receive the result of AES-CTR encryption.
_output = output;

+// Initialize GMAC with 64-bit IV (and remaining 32 bits padded to zero).
_iv[0] = iv;
_iv[1] = 0;
_gmac.init(reinterpret_cast<const uint8_t *>(_iv));

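A sketch of the IV layout this sets up (hypothetical helper mirroring the _iv[0] = iv; _iv[1] = 0; assignments above):

#include <cstdint>
#include <cstring>

static void makeGmacIv(const uint64_t iv,uint8_t out[16])
{
    const uint64_t words[2] = { iv,0 }; // 64-bit IV followed by zero padding
    std::memcpy(out,words,16);
}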
@@ -276,7 +279,11 @@ public:
*/
ZT_INLINE void aad(const void *const aad,unsigned int len) noexcept
{
+// Feed AAD into GMAC first.
_gmac.update(aad,len);

+// The end of the AAD is padded to a multiple of 16 bytes to ensure a unique encoding vs. plaintext.
+// AES-GCM-SIV does this as well, for the same reason.
len &= 0xfU;
if (len != 0)
_gmac.update(Utils::ZERO256,16 - len);

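A sketch of the padding arithmetic in aad(): whatever the AAD length, the update ends on a 16-byte GHASH block boundary, so no plaintext bytes can share a block with AAD bytes.

static unsigned int aadPadLength(const unsigned int aadLen) noexcept
{
    const unsigned int r = aadLen & 0xfU; // bytes past the last full block
    return (r != 0) ? (16 - r) : 0;       // zero bytes fed from Utils::ZERO256
}

For example aadPadLength(13) is 3 and aadPadLength(32) is 0.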
@@ -298,11 +305,23 @@ public:
*/
ZT_INLINE void finish1() noexcept
{
// Compute GMAC tag, then encrypt the original 64-bit IV and the first 64 bits
// of the GMAC tag with AES (single block) and use this to initialize AES-CTR.
uint64_t gmacTag[2];
_gmac.finish(reinterpret_cast<uint8_t *>(gmacTag));
_iv[1] = gmacTag[0];
_ctr._aes.encrypt(_iv,_iv);
-_ctr.init(reinterpret_cast<const uint8_t *>(_iv),_output);

+// Bit 31 of the CTR IV is masked to (1) allow us to optimize by forgetting
+// about integer overflow for less than 2^31 bytes (this system's max message
+// size is far below that), and (2) ensure interoperability with any future
+// FIPS-compliant or other cryptographic libraries that may or may not handle
+// 32-bit integer overflow of the least significant 32 bits of the counter in
+// the expected way.
+uint64_t ctrIv[2];
+ctrIv[0] = _iv[0];
+ctrIv[1] = _iv[1] & ZT_CONST_TO_BE_UINT64(0xffffffff7fffffffULL);
+_ctr.init(reinterpret_cast<const uint8_t *>(ctrIv),_output);
}

/**