From 43e6a9e9ee020a0765a39eb933dface8948d698a Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Thu, 5 Sep 2019 09:54:30 -0700 Subject: [PATCH] more fastitude: unroll AES-NI CTR loop to 8 blocks per iteration, benchmark with MTU-sized buffers --- node/AES.hpp | 95 +++++++++++++++++++++------------------------------- selftest.cpp | 21 ++++++------ 2 files changed, 48 insertions(+), 68 deletions(-) diff --git a/node/AES.hpp b/node/AES.hpp index 5bd6f64ae..33817018f 100644 --- a/node/AES.hpp +++ b/node/AES.hpp @@ -505,73 +505,54 @@ private: const __m128i k13 = _k.ni.k[13]; const __m128i k14 = _k.ni.k[14]; - while (len >= 64) { +#define ZT_AES_CTR_AESNI_ROUND(k) \ + c0 = _mm_aesenc_si128(c0,k); \ + c1 = _mm_aesenc_si128(c1,k); \ + c2 = _mm_aesenc_si128(c2,k); \ + c3 = _mm_aesenc_si128(c3,k); \ + c4 = _mm_aesenc_si128(c4,k); \ + c5 = _mm_aesenc_si128(c5,k); \ + c6 = _mm_aesenc_si128(c6,k); \ + c7 = _mm_aesenc_si128(c7,k) + + while (len >= 128) { __m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr),iv0),k0); __m128i c1 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+1ULL)),iv0),k0); __m128i c2 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+2ULL)),iv0),k0); __m128i c3 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+3ULL)),iv0),k0); - ctr += 4; - c0 = _mm_aesenc_si128(c0,k1); - c1 = _mm_aesenc_si128(c1,k1); - c2 = _mm_aesenc_si128(c2,k1); - c3 = _mm_aesenc_si128(c3,k1); - c0 = _mm_aesenc_si128(c0,k2); - c1 = _mm_aesenc_si128(c1,k2); - c2 = _mm_aesenc_si128(c2,k2); - c3 = _mm_aesenc_si128(c3,k2); - c0 = _mm_aesenc_si128(c0,k3); - c1 = _mm_aesenc_si128(c1,k3); - c2 = _mm_aesenc_si128(c2,k3); - c3 = _mm_aesenc_si128(c3,k3); - c0 = _mm_aesenc_si128(c0,k4); - c1 = _mm_aesenc_si128(c1,k4); - c2 = _mm_aesenc_si128(c2,k4); - c3 = _mm_aesenc_si128(c3,k4); - c0 = _mm_aesenc_si128(c0,k5); - c1 = _mm_aesenc_si128(c1,k5); - c2 = _mm_aesenc_si128(c2,k5); - c3 = _mm_aesenc_si128(c3,k5); - c0 = _mm_aesenc_si128(c0,k6); - c1 = _mm_aesenc_si128(c1,k6); - c2 = _mm_aesenc_si128(c2,k6); - c3 = 
_mm_aesenc_si128(c3,k6); - c0 = _mm_aesenc_si128(c0,k7); - c1 = _mm_aesenc_si128(c1,k7); - c2 = _mm_aesenc_si128(c2,k7); - c3 = _mm_aesenc_si128(c3,k7); - c0 = _mm_aesenc_si128(c0,k8); - c1 = _mm_aesenc_si128(c1,k8); - c2 = _mm_aesenc_si128(c2,k8); - c3 = _mm_aesenc_si128(c3,k8); - c0 = _mm_aesenc_si128(c0,k9); - c1 = _mm_aesenc_si128(c1,k9); - c2 = _mm_aesenc_si128(c2,k9); - c3 = _mm_aesenc_si128(c3,k9); - c0 = _mm_aesenc_si128(c0,k10); - c1 = _mm_aesenc_si128(c1,k10); - c2 = _mm_aesenc_si128(c2,k10); - c3 = _mm_aesenc_si128(c3,k10); - c0 = _mm_aesenc_si128(c0,k11); - c1 = _mm_aesenc_si128(c1,k11); - c2 = _mm_aesenc_si128(c2,k11); - c3 = _mm_aesenc_si128(c3,k11); - c0 = _mm_aesenc_si128(c0,k12); - c1 = _mm_aesenc_si128(c1,k12); - c2 = _mm_aesenc_si128(c2,k12); - c3 = _mm_aesenc_si128(c3,k12); - c0 = _mm_aesenc_si128(c0,k13); - c1 = _mm_aesenc_si128(c1,k13); - c2 = _mm_aesenc_si128(c2,k13); - c3 = _mm_aesenc_si128(c3,k13); + __m128i c4 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+4ULL)),iv0),k0); + __m128i c5 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+5ULL)),iv0),k0); + __m128i c6 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+6ULL)),iv0),k0); + __m128i c7 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+7ULL)),iv0),k0); + ctr += 8; + ZT_AES_CTR_AESNI_ROUND(k1); + ZT_AES_CTR_AESNI_ROUND(k2); + ZT_AES_CTR_AESNI_ROUND(k3); + ZT_AES_CTR_AESNI_ROUND(k4); + ZT_AES_CTR_AESNI_ROUND(k5); + ZT_AES_CTR_AESNI_ROUND(k6); + ZT_AES_CTR_AESNI_ROUND(k7); + ZT_AES_CTR_AESNI_ROUND(k8); + ZT_AES_CTR_AESNI_ROUND(k9); + ZT_AES_CTR_AESNI_ROUND(k10); + ZT_AES_CTR_AESNI_ROUND(k11); + ZT_AES_CTR_AESNI_ROUND(k12); + ZT_AES_CTR_AESNI_ROUND(k13); _mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,k14))); _mm_storeu_si128((__m128i *)(out + 16),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 16)),_mm_aesenclast_si128(c1,k14))); _mm_storeu_si128((__m128i *)(out + 
32),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 32)),_mm_aesenclast_si128(c2,k14))); _mm_storeu_si128((__m128i *)(out + 48),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 48)),_mm_aesenclast_si128(c3,k14))); - in += 64; - out += 64; - len -= 64; + _mm_storeu_si128((__m128i *)(out + 64),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 64)),_mm_aesenclast_si128(c4,k14))); + _mm_storeu_si128((__m128i *)(out + 80),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 80)),_mm_aesenclast_si128(c5,k14))); + _mm_storeu_si128((__m128i *)(out + 96),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 96)),_mm_aesenclast_si128(c6,k14))); + _mm_storeu_si128((__m128i *)(out + 112),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 112)),_mm_aesenclast_si128(c7,k14))); + in += 128; + out += 128; + len -= 128; } +#undef ZT_AES_CTR_AESNI_ROUND + while (len >= 16) { __m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr++),(__m64)iv0),k0); c0 = _mm_aesenc_si128(c0,k1); diff --git a/selftest.cpp b/selftest.cpp index fd103e714..484a1713e 100644 --- a/selftest.cpp +++ b/selftest.cpp @@ -207,33 +207,32 @@ static int testCrypto() } std::cout << "OK" ZT_EOL_S << " GMAC-AES-256 (benchmark): "; std::cout.flush(); int64_t start = OSUtils::now(); - for(unsigned long i=0;i<200000;++i) { - tv.gmac((const uint8_t *)buf1,buf1,sizeof(buf1),(uint8_t *)buf1); + for(unsigned long i=0;i<500000;++i) { + tv.gmac((const uint8_t *)buf1,buf1,ZT_DEFAULT_MTU,(uint8_t *)buf1); } int64_t end = OSUtils::now(); *dummy = hexbuf[0]; - std::cout << (((double)(200000 * sizeof(buf1)) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" ZT_EOL_S; + std::cout << (((double)(500000 * ZT_DEFAULT_MTU) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (dummy: " << (unsigned int)*dummy << ")" ZT_EOL_S; std::cout << " AES-256-CTR (benchmark): "; std::cout.flush(); start = OSUtils::now(); - for(unsigned long i=0;i<200000;++i) { - tv.ctr((const uint8_t 
*)hexbuf,buf1,sizeof(buf1),buf2); - hexbuf[0] = buf2[0]; + for(unsigned long i=0;i<500000;++i) { + tv.ctr((const uint8_t *)hexbuf,buf1,ZT_DEFAULT_MTU,buf1); + *dummy = buf1[0]; } end = OSUtils::now(); - *dummy = buf2[0]; - std::cout << (((double)(200000 * sizeof(buf1)) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" ZT_EOL_S; + std::cout << (((double)(500000 * ZT_DEFAULT_MTU) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (dummy: " << (unsigned int)*dummy << ")" ZT_EOL_S; } { std::cout << " AES-256-GMAC-CTR (benchmark): "; std::cout.flush(); AES k1,k2,k3,k4; AES::initGmacCtrKeys(AES_TEST_VECTOR_0_KEY,k1,k2,k3,k4); int64_t start = OSUtils::now(); - for(unsigned long i=0;i<200000;++i) { - AES::ztGmacCtrEncrypt(k1,k2,k3,k4,(const uint8_t *)hexbuf,buf1,sizeof(buf1),buf1,(uint8_t *)(hexbuf + 8)); + for(unsigned long i=0;i<500000;++i) { + AES::ztGmacCtrEncrypt(k1,k2,k3,k4,(const uint8_t *)hexbuf,buf1,ZT_DEFAULT_MTU,buf1,(uint8_t *)(hexbuf + 8)); *dummy = buf1[0]; } int64_t end = OSUtils::now(); - std::cout << (((double)(200000 * sizeof(buf1)) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" ZT_EOL_S; + std::cout << (((double)(500000 * ZT_DEFAULT_MTU) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (dummy: " << (unsigned int)*dummy << ")" ZT_EOL_S; } }