From f1b6cb2ace144c71434b22898480a30896f4ba66 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Fri, 7 Aug 2020 17:33:27 +0000 Subject: [PATCH] GMAC is faster in software now. --- core/AES.cpp | 63 +++++++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/core/AES.cpp b/core/AES.cpp index 743274d6a..33157572f 100644 --- a/core/AES.cpp +++ b/core/AES.cpp @@ -14,6 +14,10 @@ #include "Constants.hpp" #include "AES.hpp" +#ifdef __GNUC__ +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif + #define Te1_r(x) ZT_ROR32(Te0[x], 8) #define Te2_r(x) ZT_ROR32(Te0[x], 16) #define Te3_r(x) ZT_ROR32(Te0[x], 24) @@ -295,12 +299,22 @@ void AES::GMAC::update(const void *const data, unsigned int len) noexcept } } - while (len >= 16) { - y0 ^= Utils::loadMachineEndian< uint64_t >(in); - y1 ^= Utils::loadMachineEndian< uint64_t >(in + 8); - s_gfmul(h0, h1, y0, y1); - in += 16; - len -= 16; + if (likely(((uintptr_t)in & 7U) == 0U)) { + while (len >= 16) { + y0 ^= *reinterpret_cast(in); + y1 ^= *reinterpret_cast(in + 8); + in += 16; + s_gfmul(h0, h1, y0, y1); + len -= 16; + } + } else { + while (len >= 16) { + y0 ^= Utils::loadMachineEndian< uint64_t >(in); + y1 ^= Utils::loadMachineEndian< uint64_t >(in + 8); + in += 16; + s_gfmul(h0, h1, y0, y1); + len -= 16; + } } _y[0] = y0; @@ -1001,23 +1015,27 @@ void AES::CTR::crypt(const void *const input, unsigned int len) noexcept out += totalLen; _len = (totalLen + len); - { + if (likely(len >= 16)) { const uint32_t *const restrict rk = _aes._k.sw.ek; - const uint32_t ctr0rk0 = Utils::ntoh(reinterpret_cast(_ctr)[0]) ^ rk[0]; - const uint32_t ctr1rk1 = Utils::ntoh(reinterpret_cast(_ctr)[1]) ^ rk[1]; - const uint32_t ctr2rk2 = Utils::ntoh(reinterpret_cast(_ctr)[2]) ^ rk[2]; + const uint32_t ctr0rk0 = Utils::ntoh(reinterpret_cast(_ctr)[0]) ^ rk[0]; + const uint32_t ctr1rk1 = Utils::ntoh(reinterpret_cast(_ctr)[1]) ^ rk[1]; + const uint32_t ctr2rk2 = Utils::ntoh(reinterpret_cast(_ctr)[2]) ^ rk[2]; const uint32_t m8 = 0x000000ff; const uint32_t m8_8 = 0x0000ff00; const uint32_t m8_16 = 0x00ff0000; const uint32_t m8_24 = 0xff000000; - if (likely((((uintptr_t)out & 3U) == 0U) && (((uintptr_t)in & 3U) == 0U))) { - while (len >= 16) { + if (likely((((uintptr_t)out & 7U) == 0U) && (((uintptr_t)in & 7U) == 0U))) { + do { uint32_t s0, s1, s2, s3, t0, t1, t2, t3; s0 = ctr0rk0; s1 = ctr1rk1; s2 = ctr2rk2; s3 = ctr++ ^ rk[3]; + const uint64_t in0 = *reinterpret_cast(in); + const uint64_t in1 = *reinterpret_cast(in + 8); + in += 16; + t0 = Te0[s0 >> 24U] ^ Te1_r((s1 >> 16U) & m8) ^ Te2_r((s2 >> 8U) & m8) ^ Te3_r(s3 & m8) ^ rk[4]; t1 = Te0[s1 >> 24U] ^ Te1_r((s2 >> 16U) & m8) ^ Te2_r((s3 >> 8U) & m8) ^ Te3_r(s0 & m8) ^ rk[5]; t2 = Te0[s2 >> 24U] ^ Te1_r((s3 >> 16U) & m8) ^ Te2_r((s0 >> 8U) & m8) ^ Te3_r(s1 & m8) ^ rk[6]; @@ -1075,21 +1093,12 @@ void AES::CTR::crypt(const void *const input, unsigned int len) noexcept s2 = (Te2_r(t2 >> 24U) & m8_24) ^ (Te3_r((t3 >> 16U) & m8) & m8_16) ^ (Te0[(t0 >> 8U) & m8] & m8_8) ^ (Te1_r(t1 & m8) & m8) ^ rk[58]; s3 = (Te2_r(t3 >> 24U) & m8_24) ^ (Te3_r((t0 >> 16U) & m8) & m8_16) ^ (Te0[(t1 >> 8U) & m8] & m8_8) ^ (Te1_r(t2 & m8) & m8) ^ rk[59]; - s0 = Utils::hton(s0) ^ *reinterpret_cast(in); - s1 = Utils::hton(s1) ^ *reinterpret_cast(in + 4); - s2 = Utils::hton(s2) ^ *reinterpret_cast(in + 8); - s3 = Utils::hton(s3) ^ *reinterpret_cast(in + 12); - *reinterpret_cast(out) = s0; - *reinterpret_cast(out + 4) = s1; - *reinterpret_cast(out + 8) = s2; - *reinterpret_cast(out + 12) = s3; - + *reinterpret_cast(out) = in0 ^ Utils::hton(((uint64_t)s0 << 32U) | (uint64_t)s1); + *reinterpret_cast(out + 8) = in1 ^ Utils::hton(((uint64_t)s2 << 32U) | (uint64_t)s3); out += 16; - len -= 16; - in += 16; - } + } while ((len -= 16) >= 16); } else { - while (len >= 16) { + do { uint32_t s0, s1, s2, s3, t0, t1, t2, t3; s0 = ctr0rk0; s1 = ctr1rk1; @@ -1169,11 +1178,9 @@ void AES::CTR::crypt(const void *const input, unsigned int len) noexcept out[13] = in[13] ^ (uint8_t)(s3 >> 16U); out[14] = in[14] ^ (uint8_t)(s3 >> 8U); out[15] = in[15] ^ (uint8_t)s3; - out += 16; - len -= 16; in += 16; - } + } while ((len -= 16) >= 16); } reinterpret_cast(_ctr)[3] = Utils::hton(ctr); }