MOAR AES V-TEC

This commit is contained in:
Adam Ierymenko 2020-02-24 13:53:50 -08:00
parent 61b72d42b8
commit 15e88a8b7e
No known key found for this signature in database
GPG key ID: C8877CF2D7A5D7F3

View file

@ -508,6 +508,7 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
out[totalLen++] = *(in++); out[totalLen++] = *(in++);
if (!(totalLen & 15U)) { if (!(totalLen & 15U)) {
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0); __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
d0 = _mm_xor_si128(d0,k0); d0 = _mm_xor_si128(d0,k0);
d0 = _mm_aesenc_si128(d0,k1); d0 = _mm_aesenc_si128(d0,k1);
d0 = _mm_aesenc_si128(d0,k2); d0 = _mm_aesenc_si128(d0,k2);
@ -526,7 +527,6 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
d0 = _mm_aesenc_si128(d0,k13); d0 = _mm_aesenc_si128(d0,k13);
d0 = _mm_aesenclast_si128(d0,k14); d0 = _mm_aesenclast_si128(d0,k14);
_mm_storeu_si128(outblk,_mm_xor_si128(p0,d0)); _mm_storeu_si128(outblk,_mm_xor_si128(p0,d0));
if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
break; break;
} }
} }
@ -535,24 +535,13 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
out += totalLen; out += totalLen;
_len = (totalLen + len); _len = (totalLen + len);
if (likely((c1 + len) > c1)) { // it's incredibly likely that we can ignore carry in counter increment
while (len >= 64) { while (len >= 64) {
__m128i d0,d1,d2,d3; __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
if (likely(c1 < 0xfffffffffffffffcULL)) { __m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0); __m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0); __m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
c1 += 4; c1 += 4;
} else {
d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
d1 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
d2 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
d3 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
if (unlikely(++c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
}
d0 = _mm_xor_si128(d0,k0); d0 = _mm_xor_si128(d0,k0);
d1 = _mm_xor_si128(d1,k0); d1 = _mm_xor_si128(d1,k0);
@ -641,7 +630,7 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
out += 64; out += 64;
} }
{ if (len >= 16) {
const __m128i k7 = k[7]; const __m128i k7 = k[7];
const __m128i k8 = k[8]; const __m128i k8 = k[8];
const __m128i k9 = k[9]; const __m128i k9 = k[9];
@ -650,12 +639,8 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
const __m128i k12 = k[12]; const __m128i k12 = k[12];
const __m128i k13 = k[13]; const __m128i k13 = k[13];
const __m128i k14 = k[14]; const __m128i k14 = k[14];
while (len >= 16) { do {
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0); __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
if (unlikely(c1 == 0)) {
c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
}
d0 = _mm_xor_si128(d0,k0); d0 = _mm_xor_si128(d0,k0);
d0 = _mm_aesenc_si128(d0,k1); d0 = _mm_aesenc_si128(d0,k1);
d0 = _mm_aesenc_si128(d0,k2); d0 = _mm_aesenc_si128(d0,k2);
@ -676,6 +661,139 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
in += 16; in += 16;
len -= 16; len -= 16;
out += 16; out += 16;
} while (len >= 16);
}
} else {
// Carry-checked bulk loop: consumes 64 bytes (four 16-byte blocks) of `in`
// per iteration and writes 64 bytes to `out`. Unlike a plain `c1 += 4`, the
// counter is bumped one block at a time and tested for wrap-around after
// every increment.
while (len >= 64) {
// Build four consecutive counter blocks. _mm_set_epi64x(hi, lo) places c0 in
// the low 64 bits and the byte-swapped c1 in the high 64 bits. c1 is
// incremented as a native integer (post-increment, so each block uses the
// pre-increment value); when it wraps to zero the carry is propagated into
// c0, which is converted with ntoh/hton around the add.
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
__m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
__m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
__m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
// Initial key whitening followed by 13 aesenc rounds and one aesenclast,
// applied to all four blocks in lock-step.
// NOTE(review): k0..k5 are declared outside this span; presumably they hold
// k[0]..k[5] — confirm against the enclosing function.
d0 = _mm_xor_si128(d0,k0);
d1 = _mm_xor_si128(d1,k0);
d2 = _mm_xor_si128(d2,k0);
d3 = _mm_xor_si128(d3,k0);
d0 = _mm_aesenc_si128(d0,k1);
d1 = _mm_aesenc_si128(d1,k1);
d2 = _mm_aesenc_si128(d2,k1);
d3 = _mm_aesenc_si128(d3,k1);
// Round keys k[6]..k[14] are loaded into the temporaries ka..kf between
// round groups, several rounds before each one is consumed.
__m128i ka = k[6];
d0 = _mm_aesenc_si128(d0,k2);
d1 = _mm_aesenc_si128(d1,k2);
d2 = _mm_aesenc_si128(d2,k2);
d3 = _mm_aesenc_si128(d3,k2);
__m128i kb = k[7];
d0 = _mm_aesenc_si128(d0,k3);
d1 = _mm_aesenc_si128(d1,k3);
d2 = _mm_aesenc_si128(d2,k3);
d3 = _mm_aesenc_si128(d3,k3);
__m128i kc = k[8];
d0 = _mm_aesenc_si128(d0,k4);
d1 = _mm_aesenc_si128(d1,k4);
d2 = _mm_aesenc_si128(d2,k4);
d3 = _mm_aesenc_si128(d3,k4);
__m128i kd = k[9];
d0 = _mm_aesenc_si128(d0,k5);
d1 = _mm_aesenc_si128(d1,k5);
d2 = _mm_aesenc_si128(d2,k5);
d3 = _mm_aesenc_si128(d3,k5);
__m128i ke = k[10];
d0 = _mm_aesenc_si128(d0,ka);
d1 = _mm_aesenc_si128(d1,ka);
d2 = _mm_aesenc_si128(d2,ka);
d3 = _mm_aesenc_si128(d3,ka);
__m128i kf = k[11];
d0 = _mm_aesenc_si128(d0,kb);
d1 = _mm_aesenc_si128(d1,kb);
d2 = _mm_aesenc_si128(d2,kb);
d3 = _mm_aesenc_si128(d3,kb);
// ka (k[6]) and kb (k[7]) have been consumed; recycle them for k[12]/k[13].
ka = k[12];
d0 = _mm_aesenc_si128(d0,kc);
d1 = _mm_aesenc_si128(d1,kc);
d2 = _mm_aesenc_si128(d2,kc);
d3 = _mm_aesenc_si128(d3,kc);
kb = k[13];
d0 = _mm_aesenc_si128(d0,kd);
d1 = _mm_aesenc_si128(d1,kd);
d2 = _mm_aesenc_si128(d2,kd);
d3 = _mm_aesenc_si128(d3,kd);
// kc now holds the final round key k[14].
kc = k[14];
d0 = _mm_aesenc_si128(d0,ke);
d1 = _mm_aesenc_si128(d1,ke);
d2 = _mm_aesenc_si128(d2,ke);
d3 = _mm_aesenc_si128(d3,ke);
// From here on the freed key temporaries are reused to hold the four
// 16-byte input blocks (unaligned loads): kd, ke, kf, ka <- in[0..63].
kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
d0 = _mm_aesenc_si128(d0,kf);
d1 = _mm_aesenc_si128(d1,kf);
d2 = _mm_aesenc_si128(d2,kf);
d3 = _mm_aesenc_si128(d3,kf);
ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
d0 = _mm_aesenc_si128(d0,ka);
d1 = _mm_aesenc_si128(d1,ka);
d2 = _mm_aesenc_si128(d2,ka);
d3 = _mm_aesenc_si128(d3,ka);
kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
d0 = _mm_aesenc_si128(d0,kb);
d1 = _mm_aesenc_si128(d1,kb);
d2 = _mm_aesenc_si128(d2,kb);
d3 = _mm_aesenc_si128(d3,kb);
ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
// Final round with k[14] yields the four keystream blocks.
d0 = _mm_aesenclast_si128(d0,kc);
d1 = _mm_aesenclast_si128(d1,kc);
d2 = _mm_aesenclast_si128(d2,kc);
d3 = _mm_aesenclast_si128(d3,kc);
// XOR keystream with the input blocks and store the result (unaligned).
kd = _mm_xor_si128(d0,kd);
ke = _mm_xor_si128(d1,ke);
kf = _mm_xor_si128(d2,kf);
ka = _mm_xor_si128(d3,ka);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
// Advance past the 64 bytes just processed.
in += 64;
len -= 64;
out += 64;
}
if (len >= 16) {
const __m128i k7 = k[7];
const __m128i k8 = k[8];
const __m128i k9 = k[9];
const __m128i k10 = k[10];
const __m128i k11 = k[11];
const __m128i k12 = k[12];
const __m128i k13 = k[13];
const __m128i k14 = k[14];
// Carry-checked single-block loop: handles one 16-byte block per iteration.
// It is only entered when len >= 16 (enclosing guard), so a do/while is safe
// and avoids re-testing the condition on entry.
do {
// Counter block: c0 in the low 64 bits, byte-swapped c1 in the high 64 bits.
// The block uses c1's pre-increment value; if the increment wraps c1 to
// zero, the carry is added into c0 (ntoh -> +1 -> hton).
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
if (unlikely(c1 == 0ULL)) c0 = Utils::hton(Utils::ntoh(c0) + 1ULL);
// Key whitening, 13 aesenc rounds, then aesenclast with k14.
// NOTE(review): k0..k5 come from outside this span; k7..k14 are the locals
// declared immediately before this loop. k[6] is read from the key array
// on each iteration rather than from a hoisted local.
d0 = _mm_xor_si128(d0,k0);
d0 = _mm_aesenc_si128(d0,k1);
d0 = _mm_aesenc_si128(d0,k2);
d0 = _mm_aesenc_si128(d0,k3);
d0 = _mm_aesenc_si128(d0,k4);
d0 = _mm_aesenc_si128(d0,k5);
d0 = _mm_aesenc_si128(d0,k[6]);
d0 = _mm_aesenc_si128(d0,k7);
d0 = _mm_aesenc_si128(d0,k8);
d0 = _mm_aesenc_si128(d0,k9);
d0 = _mm_aesenc_si128(d0,k10);
d0 = _mm_aesenc_si128(d0,k11);
d0 = _mm_aesenc_si128(d0,k12);
d0 = _mm_aesenc_si128(d0,k13);
d0 = _mm_aesenclast_si128(d0,k14);
// XOR the keystream block with 16 input bytes and store to out (both
// accesses are unaligned).
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
// Advance past the 16 bytes just processed.
in += 16;
len -= 16;
out += 16;
} while (len >= 16);
} }
} }