AES optimization rabbit hole
parent 2d1eeda188
commit 87fe69c27c
5 changed files with 215 additions and 134 deletions

node/AES-aesni.c (new file, +190)
@@ -0,0 +1,190 @@
/*
 * Copyright (c)2019 ZeroTier, Inc.
 *
 * Use of this software is governed by the Business Source License included
 * in the LICENSE.TXT file in the project's root directory.
 *
 * Change Date: 2023-01-01
 *
 * On the date above, in accordance with the Business Source License, use
 * of this software will be governed by version 2.0 of the Apache License.
 */
/****/

#if (defined(__amd64) || defined(__amd64__) || defined(__x86_64) || defined(__x86_64__) || defined(__AMD64) || defined(__AMD64__) || defined(_M_X64))

#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>

/* #define register */

void zt_crypt_ctr_aesni(const __m128i key[14],const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out)
{
	_mm_prefetch(in,_MM_HINT_NTA);

	/* Because our CTR supports full 128-bit nonces, we must do a full 128-bit (big-endian)
	 * increment to be compatible with canonical NIST-certified CTR implementations. That's
	 * because it's possible to have a lot of bit saturation in the least significant 64
	 * bits, which could on rare occasions actually cause a 64-bit wrap. If this happened
	 * without carry it would result in incompatibility and quietly dropped packets. The
	 * probability is low, so this would be a one in billions packet loss bug that would
	 * probably never be found.
	 *
	 * This crazy code does a branch-free 128-bit increment by adding a one or a zero to
	 * the most significant 64 bits of the 128-bit vector based on whether the add we want
	 * to do to the least significant 64 bits would overflow. This can be computed by
	 * NOTing those bits and comparing with what we want to add, since NOT is the same
	 * as subtracting from uint64_max. This generates branch-free ASM on x64 with most
	 * good compilers. */
	register __m128i swap128 = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
	register __m128i ctr0 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)iv),swap128);
	register uint64_t notctr0msq = ~((uint64_t)_mm_extract_epi64(ctr0,0));
	register __m128i ctr1 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 1ULL),1LL)),swap128);
	register __m128i ctr2 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 2ULL),2LL)),swap128);
	register __m128i ctr3 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 3ULL),3LL)),swap128);
	ctr0 = _mm_shuffle_epi8(ctr0,swap128);

	while (len >= 64) {
		_mm_prefetch(in + 64,_MM_HINT_NTA);
		register __m128i ka = key[0];
		register __m128i c0 = _mm_xor_si128(ctr0,ka);
		ctr0 = _mm_shuffle_epi8(ctr0,swap128);
		notctr0msq = ~((uint64_t)_mm_extract_epi64(ctr0,0));
		register __m128i c1 = _mm_xor_si128(ctr1,ka);
		register __m128i c2 = _mm_xor_si128(ctr2,ka);
		register __m128i c3 = _mm_xor_si128(ctr3,ka);
		register __m128i kb = key[1];
		ctr1 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 5ULL),5LL)),swap128);
		ctr2 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 6ULL),6LL)),swap128);
		register __m128i kc = key[2];
		ctr3 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 7ULL),7LL)),swap128);
		ctr0 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 4ULL),4LL)),swap128);
		register __m128i kd = key[3];
#define ZT_AES_CTR_AESNI_ROUND(kk) \
		c0 = _mm_aesenc_si128(c0,kk); \
		c1 = _mm_aesenc_si128(c1,kk); \
		c2 = _mm_aesenc_si128(c2,kk); \
		c3 = _mm_aesenc_si128(c3,kk);
		ka = key[4];
		ZT_AES_CTR_AESNI_ROUND(kb);
		kb = key[5];
		ZT_AES_CTR_AESNI_ROUND(kc);
		kc = key[6];
		ZT_AES_CTR_AESNI_ROUND(kd);
		kd = key[7];
		ZT_AES_CTR_AESNI_ROUND(ka);
		ka = key[8];
		ZT_AES_CTR_AESNI_ROUND(kb);
		kb = key[9];
		ZT_AES_CTR_AESNI_ROUND(kc);
		kc = key[10];
		ZT_AES_CTR_AESNI_ROUND(kd);
		kd = key[11];
		ZT_AES_CTR_AESNI_ROUND(ka);
		ka = key[12];
		ZT_AES_CTR_AESNI_ROUND(kb);
		kb = key[13];
		ZT_AES_CTR_AESNI_ROUND(kc);
		kc = key[14];
		ZT_AES_CTR_AESNI_ROUND(kd);
		ZT_AES_CTR_AESNI_ROUND(ka);
		ZT_AES_CTR_AESNI_ROUND(kb);
#undef ZT_AES_CTR_AESNI_ROUND
		register __m128i d0 = _mm_loadu_si128((const __m128i *)in);
		register __m128i d1 = _mm_loadu_si128((const __m128i *)(in + 16));
		register __m128i d2 = _mm_loadu_si128((const __m128i *)(in + 32));
		register __m128i d3 = _mm_loadu_si128((const __m128i *)(in + 48));
		c0 = _mm_aesenclast_si128(c0,kc);
		c1 = _mm_aesenclast_si128(c1,kc);
		c2 = _mm_aesenclast_si128(c2,kc);
		c3 = _mm_aesenclast_si128(c3,kc);
		d0 = _mm_xor_si128(d0,c0);
		d1 = _mm_xor_si128(d1,c1);
		d2 = _mm_xor_si128(d2,c2);
		d3 = _mm_xor_si128(d3,c3);
		_mm_storeu_si128((__m128i *)out,d0);
		_mm_storeu_si128((__m128i *)(out + 16),d1);
		_mm_storeu_si128((__m128i *)(out + 32),d2);
		_mm_storeu_si128((__m128i *)(out + 48),d3);
		in += 64;
		out += 64;
		len -= 64;
	}

	register __m128i k0 = key[0];
	register __m128i k1 = key[1];
	register __m128i k2 = key[2];
	register __m128i k3 = key[3];
	register __m128i k4 = key[4];
	register __m128i k5 = key[5];
	register __m128i k6 = key[6];
	register __m128i k7 = key[7];
	/* not enough XMM registers for all of them, but it helps slightly... */

	while (len >= 16) {
		register __m128i c0 = _mm_xor_si128(ctr0,k0);
		ctr0 = _mm_shuffle_epi8(ctr0,swap128);
		ctr0 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < 1ULL),1LL)),swap128);
		c0 = _mm_aesenc_si128(c0,k1);
		c0 = _mm_aesenc_si128(c0,k2);
		c0 = _mm_aesenc_si128(c0,k3);
		c0 = _mm_aesenc_si128(c0,k4);
		c0 = _mm_aesenc_si128(c0,k5);
		c0 = _mm_aesenc_si128(c0,k6);
		register __m128i ka = key[8];
		c0 = _mm_aesenc_si128(c0,k7);
		register __m128i kb = key[9];
		c0 = _mm_aesenc_si128(c0,ka);
		ka = key[10];
		c0 = _mm_aesenc_si128(c0,kb);
		kb = key[11];
		c0 = _mm_aesenc_si128(c0,ka);
		ka = key[12];
		c0 = _mm_aesenc_si128(c0,kb);
		kb = key[13];
		c0 = _mm_aesenc_si128(c0,ka);
		ka = key[14];
		c0 = _mm_aesenc_si128(c0,kb);
		_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,ka)));
		in += 16;
		out += 16;
		len -= 16;
	}

	if (len) {
		register __m128i c0 = _mm_xor_si128(ctr0,k0);
		k0 = key[8];
		c0 = _mm_aesenc_si128(c0,k1);
		c0 = _mm_aesenc_si128(c0,k2);
		k1 = key[9];
		c0 = _mm_aesenc_si128(c0,k3);
		c0 = _mm_aesenc_si128(c0,k4);
		k2 = key[10];
		c0 = _mm_aesenc_si128(c0,k5);
		c0 = _mm_aesenc_si128(c0,k6);
		k3 = key[11];
		c0 = _mm_aesenc_si128(c0,k7);
		c0 = _mm_aesenc_si128(c0,k0);
		k0 = key[12];
		c0 = _mm_aesenc_si128(c0,k1);
		c0 = _mm_aesenc_si128(c0,k2);
		k1 = key[13];
		c0 = _mm_aesenc_si128(c0,k3);
		c0 = _mm_aesenc_si128(c0,k0);
		k2 = key[14];
		c0 = _mm_aesenc_si128(c0,k1);
		c0 = _mm_aesenclast_si128(c0,k2);
		uint8_t tmp[16];
		_mm_storeu_si128((__m128i *)tmp,c0);
		for(unsigned int i=0;i<len;++i)
			out[i] = in[i] ^ tmp[i];
	}
}

#endif
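Note: the branch-free carry trick described in the comment in this file is easier to see in scalar form. The low 64 bits of the big-endian counter overflow on "+ n" exactly when ~lo < n, since ~lo equals UINT64_MAX - lo. A minimal scalar sketch (the helper name is illustrative, not part of the commit):

/* Minimal scalar sketch of the same carry computation; bigendian128_add()
 * is an illustrative helper, not part of the commit. */
#include <stdint.h>

static void bigendian128_add(uint64_t *hi,uint64_t *lo,uint64_t n)
{
	/* ~(*lo) == UINT64_MAX - *lo, so ~(*lo) < n exactly when *lo + n wraps. */
	*hi += (uint64_t)(~(*lo) < n); /* branch-free carry into the high 64 bits */
	*lo += n;
}

With *lo == UINT64_MAX and n == 1, ~(*lo) == 0 < 1, so the carry is taken as the low half wraps to zero, which is exactly the case the comment warns would otherwise silently drop packets.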
node/AES.cpp (117 lines removed)
@@ -372,121 +372,4 @@ void AES::_gmacSW(const uint8_t iv[12],const uint8_t *in,unsigned int len,uint8_
#endif
}

-#ifdef ZT_AES_AESNI
-
-void AES::_crypt_ctr_aesni(const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out) const
-{
-	__m128i ctr0,ctr1,ctr2,ctr3,ctr4,ctr5,ctr6,ctr7;
-	__m128i swap128 = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
-	ctr0 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)iv),swap128);
-	ctr1 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < 1ULL),1LL)),swap128);
-	ctr2 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < 2ULL),2LL)),swap128);
-	ctr3 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < 3ULL),3LL)),swap128);
-	ctr4 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < 4ULL),4LL)),swap128);
-	ctr5 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < 5ULL),5LL)),swap128);
-	ctr6 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < 6ULL),6LL)),swap128);
-	ctr7 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < 7ULL),7LL)),swap128);
-	ctr0 = _mm_shuffle_epi8(ctr0,swap128);
-	uint64_t ctr = 8;
-
-#define ZT_AES_CTR_AESNI_ROUND(k) \
-	c0 = _mm_aesenc_si128(c0,k); \
-	c1 = _mm_aesenc_si128(c1,k); \
-	c2 = _mm_aesenc_si128(c2,k); \
-	c3 = _mm_aesenc_si128(c3,k); \
-	c4 = _mm_aesenc_si128(c4,k); \
-	c5 = _mm_aesenc_si128(c5,k); \
-	c6 = _mm_aesenc_si128(c6,k); \
-	c7 = _mm_aesenc_si128(c7,k)
-	while (len >= 128) {
-		__m128i c0 = _mm_xor_si128(ctr0,_k.ni.k[0]);
-		ctr0 = _mm_shuffle_epi8(ctr0,swap128);
-		__m128i c1 = _mm_xor_si128(ctr1,_k.ni.k[0]);
-		ctr1 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 1ULL)),(long long)(ctr + 1ULL))),swap128);
-		__m128i c2 = _mm_xor_si128(ctr2,_k.ni.k[0]);
-		ctr2 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 2ULL)),(long long)(ctr + 2ULL))),swap128);
-		__m128i c3 = _mm_xor_si128(ctr3,_k.ni.k[0]);
-		ctr3 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 3ULL)),(long long)(ctr + 3ULL))),swap128);
-		__m128i c4 = _mm_xor_si128(ctr4,_k.ni.k[0]);
-		ctr4 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 4ULL)),(long long)(ctr + 4ULL))),swap128);
-		__m128i c5 = _mm_xor_si128(ctr5,_k.ni.k[0]);
-		ctr5 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 5ULL)),(long long)(ctr + 5ULL))),swap128);
-		__m128i c6 = _mm_xor_si128(ctr6,_k.ni.k[0]);
-		ctr6 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 6ULL)),(long long)(ctr + 6ULL))),swap128);
-		__m128i c7 = _mm_xor_si128(ctr7,_k.ni.k[0]);
-		ctr7 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 7ULL)),(long long)(ctr + 7ULL))),swap128);
-		ctr0 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr)),(long long)(ctr))),swap128);
-		ctr += 8;
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[3]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[4]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[5]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[6]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[7]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[8]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[9]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[10]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[11]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[12]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[13]);
-		_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 16),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 16)),_mm_aesenclast_si128(c1,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 32),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 32)),_mm_aesenclast_si128(c2,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 48),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 48)),_mm_aesenclast_si128(c3,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 64),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 64)),_mm_aesenclast_si128(c4,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 80),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 80)),_mm_aesenclast_si128(c5,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 96),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 96)),_mm_aesenclast_si128(c6,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 112),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 112)),_mm_aesenclast_si128(c7,_k.ni.k[14])));
-		in += 128;
-		out += 128;
-		len -= 128;
-	}
-#undef ZT_AES_CTR_AESNI_ROUND
-
-	while (len >= 16) {
-		__m128i c0 = _mm_xor_si128(ctr0,_k.ni.k[0]);
-		ctr0 = _mm_shuffle_epi8(ctr0,swap128);
-		ctr0 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr)),(long long)(ctr))),swap128);
-		++ctr;
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[1]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[2]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[3]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[4]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[5]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[6]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[7]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[8]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[9]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[10]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[11]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[12]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[13]);
-		_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,_k.ni.k[14])));
-		in += 16;
-		out += 16;
-		len -= 16;
-	}
-
-	if (len) {
-		__m128i c0 = _mm_xor_si128(ctr0,_k.ni.k[0]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[1]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[2]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[3]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[4]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[5]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[6]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[7]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[8]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[9]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[10]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[11]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[12]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[13]);
-		c0 = _mm_aesenclast_si128(c0,_k.ni.k[14]);
-		for(unsigned int i=0;i<len;++i)
-			out[i] = in[i] ^ ((const uint8_t *)&c0)[i];
-	}
-}
-
-#endif // ZT_AES_AESNI

} // namespace ZeroTier
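Note: the removed implementation above pipelined eight counter blocks per iteration; the new AES-aesni.c settles on four, its own comment observing that x64 runs short of XMM registers beyond that. The payoff of interleaving at all is latency hiding: _mm_aesenc_si128 takes several cycles to produce a result, but a new one can be issued every cycle. An illustrative sketch (not from the commit) of the structure that makes this work:

/* Illustrative sketch, not from the commit: one AES round applied to four
 * independent blocks. The four aesenc instructions carry no data
 * dependence on one another, so they can issue back to back while each
 * individual result is still in flight. */
#include <wmmintrin.h>

static void aes_round_x4(__m128i c[4],const __m128i rk)
{
	c[0] = _mm_aesenc_si128(c[0],rk);
	c[1] = _mm_aesenc_si128(c[1],rk);
	c[2] = _mm_aesenc_si128(c[2],rk);
	c[3] = _mm_aesenc_si128(c[3],rk);
}

A single stream must wait for each round's result before starting the next round; several independent streams keep the AES unit busy during those waits, which is where the bulk-throughput gain in the 64-byte loop comes from.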
node/AES.hpp (12 lines changed)
@@ -19,12 +19,18 @@
#include "SHA512.hpp"

#if (defined(__amd64) || defined(__amd64__) || defined(__x86_64) || defined(__x86_64__) || defined(__AMD64) || defined(__AMD64__) || defined(_M_X64))

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>

#define ZT_AES_AESNI 1
#endif

// AES-aesni.c
extern "C" void zt_crypt_ctr_aesni(const __m128i key[14],const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out);

#endif // x64

#define ZT_AES_KEY_SIZE 32
#define ZT_AES_BLOCK_SIZE 16

@@ -115,7 +121,7 @@ public:
	{
#ifdef ZT_AES_AESNI
		if (likely(HW_ACCEL)) {
-			_crypt_ctr_aesni(iv,(const uint8_t *)in,len,(uint8_t *)out);
+			zt_crypt_ctr_aesni(_k.ni.k,iv,(const uint8_t *)in,len,(uint8_t *)out);
			return;
		}
#endif

@@ -524,8 +530,6 @@ private:
		_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp,_k.ni.k[14]));
	}

-	void _crypt_ctr_aesni(const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out) const;
-
	static ZT_ALWAYS_INLINE __m128i _mult_block_aesni(__m128i shuf,__m128i h,__m128i y)
	{
		y = _mm_shuffle_epi8(y,shuf);
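Note: the fast path above is gated on HW_ACCEL, whose initialization is outside this diff. As a hedged sketch (illustrative name, GCC/Clang only), such a flag could be derived at startup with the compiler's CPU-feature builtin:

// Illustrative sketch, not part of this commit: deriving an AES-NI
// capability flag with the GCC/Clang builtin. The real HW_ACCEL
// initialization lives elsewhere in the tree.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__amd64__))
static const bool hwAesNiSupported = (__builtin_cpu_supports("aes") != 0);
#else
static const bool hwAesNiSupported = false;
#endif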
node/CMakeLists.txt
@@ -48,8 +48,10 @@ set(core_headers
	Trace.hpp
	Utils.hpp
)

set(core_src
	AES.cpp
+	AES-aesni.c
	C25519.cpp
	Credential.cpp
	ECC384.cpp

@@ -72,18 +74,20 @@ set(core_src
	Trace.cpp
	Utils.cpp
)

add_library(${PROJECT_NAME} STATIC ${core_src} ${core_headers})
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)
target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_BINARY_DIR})

if(UNIX)
	set_source_files_properties(
		AES.cpp
		ECC384.cpp
		Salsa20.cpp
		C25519.cpp
		Poly1305.cpp
		PROPERTIES
		COMPILE_FLAGS "-Wall -O3"
	)
endif(UNIX)
#if(UNIX)
#	set_source_files_properties(
#		AES.cpp
#		AES-aesni.c
#		ECC384.cpp
#		Salsa20.cpp
#		C25519.cpp
#		Poly1305.cpp
#		PROPERTIES
#		COMPILE_FLAGS "-Wall -O3"
#	)
#endif(UNIX)
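Note: the commented-out block above suggests AES-aesni.c was meant to receive per-source flags as well. A sketch of what that could look like, assuming GCC/Clang instruction-set flags for the AES-NI and SSE4.1 intrinsics the file uses (the flag set is an assumption, not from the commit):

# Sketch only, assumption rather than commit content: -maes enables the
# _mm_aesenc_si128 family and -msse4.1 covers _mm_extract_epi64 (and the
# SSSE3 shuffle it implies) on GCC/Clang.
if(UNIX)
	set_source_files_properties(
		AES-aesni.c
		PROPERTIES
		COMPILE_FLAGS "-Wall -O3 -maes -msse4.1"
	)
endif(UNIX)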
selftest.cpp
@@ -214,12 +214,12 @@ static int testCrypto()
		std::cout << (((double)(500000 * ZT_DEFAULT_MTU) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (dummy: " << (unsigned int)*dummy << ")" ZT_EOL_S;
		std::cout << "  AES-256-CTR (benchmark): "; std::cout.flush();
		start = OSUtils::now();
-		for(unsigned long i=0;i<500000;++i) {
+		for(unsigned long i=0;i<1000000;++i) {
			tv.ctr((const uint8_t *)hexbuf,buf1,ZT_DEFAULT_MTU,buf1);
			*dummy = buf1[0];
		}
		end = OSUtils::now();
-		std::cout << (((double)(500000 * ZT_DEFAULT_MTU) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (dummy: " << (unsigned int)*dummy << ")" ZT_EOL_S;
+		std::cout << (((1000000.0 * (double)ZT_DEFAULT_MTU) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (dummy: " << (unsigned int)*dummy << ")" ZT_EOL_S;
	}
	{
		std::cout << "  AES-256-GMAC-SIV (benchmark): "; std::cout.flush();
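Note: the change above doubles the iteration count and updates the printed throughput to match, since the formula is (iterations x bytes per iteration / 2^20) divided by (elapsed milliseconds / 1000). A standalone sketch of the same arithmetic (the MTU and timing values are placeholders, not measurements from the commit):

/* Standalone sketch of the throughput arithmetic used above; the byte
 * count and elapsed time are placeholder values, not from the commit. */
#include <stdio.h>

int main(void)
{
	const double iterations = 1000000.0;
	const double bytesPerIteration = 2800.0; /* assumed stand-in for ZT_DEFAULT_MTU */
	const double elapsedMs = 10000.0;        /* assumed stand-in for end - start */
	const double mib = (iterations * bytesPerIteration) / 1048576.0;
	printf("%f MiB/second\n",mib / (elapsedMs / 1000.0));
	return 0;
}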