Mirror of https://github.com/zerotier/ZeroTierOne.git (synced 2025-06-05 03:53:44 +02:00)

Commit b99dd7d916 (parent aad21cf395)

    Fix code generation for AES stuff so it runs on chips that do not have AVX or VAES.

2 changed files with 238 additions and 225 deletions
CMakeLists.txt

@@ -96,7 +96,7 @@ if (
     CMAKE_SYSTEM_PROCESSOR MATCHES "amd64"
 )
     message("++ Adding SSE and AES-NI flags for processor ${CMAKE_SYSTEM_PROCESSOR}")
-    add_compile_options(-maes -mrdrnd -mpclmul -msse -msse2 -mssse3 -msse4 -mavx -mavx2 -mavx512f -mvaes)
+    add_compile_options(-maes -mrdrnd -mpclmul -msse -msse2 -mssse3)
 endif()

 add_subdirectory(node)
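This is the build half of the fix. With -mavx, -mavx2, -mavx512f and -mvaes applied globally, the compiler was free to emit AVX/VAES instructions anywhere in the tree, including auto-vectorized or inlined generic code, which can fault on CPUs that lack those extensions even when the intended fast paths are guarded at runtime. Dropping the global flags to the SSE/AES-NI baseline and re-enabling the wide ISA per function, as the AES.cpp changes below do with __attribute__((__target__(...))), confines the new instructions to functions that are only reached after a CPU feature check. A minimal sketch of that pattern, with hypothetical names and GCC/Clang's __builtin_cpu_supports standing in for ZeroTier's Utils::CPUID:

#include <cstddef>
#include <cstdint>

static __attribute__((__target__("avx2"))) void xorBufAVX2(uint8_t *d, const uint8_t *s, std::size_t n) noexcept
{
    // The compiler may vectorize this loop with AVX2, but only inside this
    // function, so no AVX instructions leak into code built for the baseline.
    for (std::size_t i = 0; i < n; ++i)
        d[i] ^= s[i];
}

static void xorBufBaseline(uint8_t *d, const uint8_t *s, std::size_t n) noexcept
{
    for (std::size_t i = 0; i < n; ++i)
        d[i] ^= s[i];
}

void xorBuf(uint8_t *d, const uint8_t *s, std::size_t n) noexcept
{
    if (__builtin_cpu_supports("avx2")) // runtime CPU feature test (GCC/Clang builtin)
        xorBufAVX2(d, s, n);
    else
        xorBufBaseline(d, s, n);
}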
node/AES.cpp (215 changed lines)
@@ -468,60 +468,55 @@ void AES::GMAC::finish(uint8_t tag[16]) noexcept

 // AES-CTR ------------------------------------------------------------------------------------------------------------

-void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
+static __attribute__((__target__("sse4,avx,avx2,vaes,avx512f"))) void p_aesCtrInnerVAES512(unsigned int &len,uint64_t &c0,uint64_t &c1,const uint8_t *&in,uint8_t *&out,const __m128i *const k) noexcept
 {
-    const uint8_t *in = reinterpret_cast<const uint8_t *>(input);
-    uint8_t *out = _out;
+    const __m512i kk0 = _mm512_broadcast_i32x4(k[0]);
+    const __m512i kk1 = _mm512_broadcast_i32x4(k[1]);
+    const __m512i kk2 = _mm512_broadcast_i32x4(k[2]);
+    const __m512i kk3 = _mm512_broadcast_i32x4(k[3]);
+    const __m512i kk4 = _mm512_broadcast_i32x4(k[4]);
+    const __m512i kk5 = _mm512_broadcast_i32x4(k[5]);
+    const __m512i kk6 = _mm512_broadcast_i32x4(k[6]);
+    const __m512i kk7 = _mm512_broadcast_i32x4(k[7]);
+    const __m512i kk8 = _mm512_broadcast_i32x4(k[8]);
+    const __m512i kk9 = _mm512_broadcast_i32x4(k[9]);
+    const __m512i kk10 = _mm512_broadcast_i32x4(k[10]);
+    const __m512i kk11 = _mm512_broadcast_i32x4(k[11]);
+    const __m512i kk12 = _mm512_broadcast_i32x4(k[12]);
+    const __m512i kk13 = _mm512_broadcast_i32x4(k[13]);
+    const __m512i kk14 = _mm512_broadcast_i32x4(k[14]);
+    do {
+        __m512i d0 = _mm512_set_epi64(
+            (long long)Utils::hton(c1 + 3ULL),(long long)c0,
+            (long long)Utils::hton(c1 + 2ULL),(long long)c0,
+            (long long)Utils::hton(c1 + 1ULL),(long long)c0,
+            (long long)Utils::hton(c1),(long long)c0);
+        c1 += 4;
+        __m512i p0 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(in));
+        in += 64;
+        d0 = _mm512_xor_si512(d0,kk0);
+        d0 = _mm512_aesenc_epi128(d0,kk1);
+        d0 = _mm512_aesenc_epi128(d0,kk2);
+        d0 = _mm512_aesenc_epi128(d0,kk3);
+        d0 = _mm512_aesenc_epi128(d0,kk4);
+        d0 = _mm512_aesenc_epi128(d0,kk5);
+        d0 = _mm512_aesenc_epi128(d0,kk6);
+        d0 = _mm512_aesenc_epi128(d0,kk7);
+        d0 = _mm512_aesenc_epi128(d0,kk8);
+        d0 = _mm512_aesenc_epi128(d0,kk9);
+        d0 = _mm512_aesenc_epi128(d0,kk10);
+        d0 = _mm512_aesenc_epi128(d0,kk11);
+        d0 = _mm512_aesenc_epi128(d0,kk12);
+        d0 = _mm512_aesenc_epi128(d0,kk13);
+        d0 = _mm512_aesenclast_epi128(d0,kk14);
+        _mm512_storeu_si512(reinterpret_cast<__m512i *>(out),_mm512_xor_si512(p0,d0));
+        out += 64;
+        len -= 64;
+    } while (len >= 64);
+}

-#ifdef ZT_AES_AESNI
-    if (likely(Utils::CPUID.aes)) {
-        uint64_t c0 = _ctr[0];
-        uint64_t c1 = Utils::ntoh(_ctr[1]);
-        const __m128i *const k = _aes._k.ni.k;
-
-        // Complete any unfinished blocks from previous calls to crypt().
-        unsigned int totalLen = _len;
-        if ((totalLen & 15U)) {
-            for (;;) {
-                if (unlikely(!len)) {
-                    _ctr[0] = c0;
-                    _ctr[1] = Utils::hton(c1);
-                    _len = totalLen;
-                    return;
-                }
-                --len;
-                out[totalLen++] = *(in++);
-                if (!(totalLen & 15U)) {
-                    __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
-                    d0 = _mm_xor_si128(d0,k[0]);
-                    d0 = _mm_aesenc_si128(d0,k[1]);
-                    d0 = _mm_aesenc_si128(d0,k[2]);
-                    d0 = _mm_aesenc_si128(d0,k[3]);
-                    d0 = _mm_aesenc_si128(d0,k[4]);
-                    d0 = _mm_aesenc_si128(d0,k[5]);
-                    d0 = _mm_aesenc_si128(d0,k[6]);
-                    d0 = _mm_aesenc_si128(d0,k[7]);
-                    d0 = _mm_aesenc_si128(d0,k[8]);
-                    d0 = _mm_aesenc_si128(d0,k[9]);
-                    d0 = _mm_aesenc_si128(d0,k[10]);
-                    __m128i *const outblk = reinterpret_cast<__m128i *>(out + (totalLen - 16));
-                    d0 = _mm_aesenc_si128(d0,k[11]);
-                    const __m128i p0 = _mm_loadu_si128(outblk);
-                    d0 = _mm_aesenc_si128(d0,k[12]);
-                    d0 = _mm_aesenc_si128(d0,k[13]);
-                    d0 = _mm_aesenclast_si128(d0,k[14]);
-                    _mm_storeu_si128(outblk,_mm_xor_si128(p0,d0));
-                    break;
-                }
-            }
-        }
-
-        out += totalLen;
-        _len = totalLen + len;
-
-        if (likely(len >= 64)) {
-            if (Utils::CPUID.vaes) { // is only true if AVX is also present
-                if ((!Utils::CPUID.avx512f)||((len < 1024))) {
+static __attribute__((__target__("sse4,avx,avx2,vaes"))) void p_aesCtrInnerVAES256(unsigned int &len,uint64_t &c0,uint64_t &c1,const uint8_t *&in,uint8_t *&out,const __m128i *const k) noexcept
+{
+    const __m256i kk0 = _mm256_broadcastsi128_si256(k[0]);
+    const __m256i kk1 = _mm256_broadcastsi128_si256(k[1]);
+    const __m256i kk2 = _mm256_broadcastsi128_si256(k[2]);
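The new p_aesCtrInnerVAES512 above encrypts four consecutive counter blocks per pass: _mm512_broadcast_i32x4 replicates each 128-bit round key into all four lanes of a 512-bit register, and _mm512_set_epi64 assembles four counter blocks whose first eight bytes carry c0 unchanged while the last eight hold the block counter in big-endian order (hence Utils::hton). A plain C++ model of what one iteration computes, with hton64 and blockEncrypt as illustrative stand-ins rather than ZeroTier code:

#include <cstdint>
#include <cstring>

// Stand-in for Utils::hton on a little-endian host (assumption).
static uint64_t hton64(const uint64_t v) noexcept { return __builtin_bswap64(v); }

// Scalar model of one 4-block iteration; blockEncrypt stands in for the
// xor + 13x aesenc + aesenclast sequence run with the AES-256 round keys.
static void ctr4(uint8_t out[64], const uint8_t in[64], const uint64_t c0, uint64_t &c1,
                 void (*blockEncrypt)(uint8_t block[16]))
{
    for (int i = 0; i < 4; ++i) {
        uint8_t ks[16];
        std::memcpy(ks, &c0, 8);            // bytes 0-7: IV half, copied verbatim
        const uint64_t ctr = hton64(c1++);  // bytes 8-15: big-endian block counter
        std::memcpy(ks + 8, &ctr, 8);
        blockEncrypt(ks);                   // ks becomes this block's keystream
        for (int j = 0; j < 16; ++j)
            out[(i * 16) + j] = in[(i * 16) + j] ^ ks[j]; // CTR mode: XOR keystream
    }
}

The dispatch added to crypt() further down only takes this 512-bit path for runs of 1024 bytes or more; shorter runs use the 256-bit variant, so brief bursts never touch AVX-512.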
@@ -583,52 +578,10 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
         out += 64;
         len -= 64;
     } while (len >= 64);
-                } else {
-                    const __m512i kk0 = _mm512_broadcast_i32x4(k[0]);
-                    const __m512i kk1 = _mm512_broadcast_i32x4(k[1]);
-                    const __m512i kk2 = _mm512_broadcast_i32x4(k[2]);
-                    const __m512i kk3 = _mm512_broadcast_i32x4(k[3]);
-                    const __m512i kk4 = _mm512_broadcast_i32x4(k[4]);
-                    const __m512i kk5 = _mm512_broadcast_i32x4(k[5]);
-                    const __m512i kk6 = _mm512_broadcast_i32x4(k[6]);
-                    const __m512i kk7 = _mm512_broadcast_i32x4(k[7]);
-                    const __m512i kk8 = _mm512_broadcast_i32x4(k[8]);
-                    const __m512i kk9 = _mm512_broadcast_i32x4(k[9]);
-                    const __m512i kk10 = _mm512_broadcast_i32x4(k[10]);
-                    const __m512i kk11 = _mm512_broadcast_i32x4(k[11]);
-                    const __m512i kk12 = _mm512_broadcast_i32x4(k[12]);
-                    const __m512i kk13 = _mm512_broadcast_i32x4(k[13]);
-                    const __m512i kk14 = _mm512_broadcast_i32x4(k[14]);
-                    do {
-                        __m512i d0 = _mm512_set_epi64(
-                            (long long)Utils::hton(c1 + 3ULL),(long long)c0,
-                            (long long)Utils::hton(c1 + 2ULL),(long long)c0,
-                            (long long)Utils::hton(c1 + 1ULL),(long long)c0,
-                            (long long)Utils::hton(c1),(long long)c0);
-                        c1 += 4;
-                        __m512i p0 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(in));
-                        in += 64;
-                        d0 = _mm512_xor_si512(d0,kk0);
-                        d0 = _mm512_aesenc_epi128(d0,kk1);
-                        d0 = _mm512_aesenc_epi128(d0,kk2);
-                        d0 = _mm512_aesenc_epi128(d0,kk3);
-                        d0 = _mm512_aesenc_epi128(d0,kk4);
-                        d0 = _mm512_aesenc_epi128(d0,kk5);
-                        d0 = _mm512_aesenc_epi128(d0,kk6);
-                        d0 = _mm512_aesenc_epi128(d0,kk7);
-                        d0 = _mm512_aesenc_epi128(d0,kk8);
-                        d0 = _mm512_aesenc_epi128(d0,kk9);
-                        d0 = _mm512_aesenc_epi128(d0,kk10);
-                        d0 = _mm512_aesenc_epi128(d0,kk11);
-                        d0 = _mm512_aesenc_epi128(d0,kk12);
-                        d0 = _mm512_aesenc_epi128(d0,kk13);
-                        d0 = _mm512_aesenclast_epi128(d0,kk14);
-                        _mm512_storeu_si512(reinterpret_cast<__m512i *>(out),_mm512_xor_si512(p0,d0));
-                        out += 64;
-                        len -= 64;
-                    } while (len >= 64);
-                }
-            } else {
+}
+
+static void p_aesCtrInner128(unsigned int &len,uint64_t &c0,uint64_t &c1,const uint8_t *&in,uint8_t *&out,const __m128i *const k) noexcept
+{
+    const __m128i k0 = k[0];
+    const __m128i k1 = k[1];
+    const __m128i k2 = k[2];
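p_aesCtrInner128 is the fallback for CPUs with plain AES-NI and no VAES. Loading k[0], k[1], k[2] (and, in the elided remainder, the other round keys) into locals encourages the compiler to keep them resident in xmm registers across the loop. Its surrounding do-while consumes 64 bytes per pass, and the usual way to do that with 128-bit instructions is to keep four independent blocks in flight, since _mm_aesenc_si128 has several cycles of latency but pipelines well. A hedged sketch of that four-block round pattern, an illustration rather than the elided function body:

#include <immintrin.h>

// Runs the AES-256 round sequence over four independent blocks at once so
// consecutive aesenc instructions overlap in the pipeline. Assumes k holds
// the 15 expanded round keys, as in the diff above; compile with -maes.
static void fourBlockRounds(__m128i &d0, __m128i &d1, __m128i &d2, __m128i &d3,
                            const __m128i k[15]) noexcept
{
    d0 = _mm_xor_si128(d0, k[0]);
    d1 = _mm_xor_si128(d1, k[0]);
    d2 = _mm_xor_si128(d2, k[0]);
    d3 = _mm_xor_si128(d3, k[0]);
    for (int r = 1; r <= 13; ++r) { // AES-256: 13 full rounds...
        d0 = _mm_aesenc_si128(d0, k[r]);
        d1 = _mm_aesenc_si128(d1, k[r]);
        d2 = _mm_aesenc_si128(d2, k[r]);
        d3 = _mm_aesenc_si128(d3, k[r]);
    }
    d0 = _mm_aesenclast_si128(d0, k[14]); // ...plus a final short round
    d1 = _mm_aesenclast_si128(d1, k[14]);
    d2 = _mm_aesenclast_si128(d2, k[14]);
    d3 = _mm_aesenclast_si128(d3, k[14]);
}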
@@ -726,11 +679,72 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
         out += 64;
         len -= 64;
     } while (len >= 64);
 }
+
+void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
+{
+    const uint8_t *in = reinterpret_cast<const uint8_t *>(input);
+    uint8_t *out = _out;
+
+#ifdef ZT_AES_AESNI
+    if (likely(Utils::CPUID.aes)) {
+        uint64_t c0 = _ctr[0];
+        uint64_t c1 = Utils::ntoh(_ctr[1]);
+        const __m128i *const k = _aes._k.ni.k;
+
+        // Complete any unfinished blocks from previous calls to crypt().
+        unsigned int totalLen = _len;
+        if ((totalLen & 15U)) {
+            for (;;) {
+                if (unlikely(!len)) {
+                    _ctr[0] = c0;
+                    _ctr[1] = Utils::hton(c1);
+                    _len = totalLen;
+                    return;
+                }
+                --len;
+                out[totalLen++] = *(in++);
+                if (!(totalLen & 15U)) {
+                    __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+                    d0 = _mm_xor_si128(d0,k[0]);
+                    d0 = _mm_aesenc_si128(d0,k[1]);
+                    d0 = _mm_aesenc_si128(d0,k[2]);
+                    d0 = _mm_aesenc_si128(d0,k[3]);
+                    d0 = _mm_aesenc_si128(d0,k[4]);
+                    d0 = _mm_aesenc_si128(d0,k[5]);
+                    d0 = _mm_aesenc_si128(d0,k[6]);
+                    d0 = _mm_aesenc_si128(d0,k[7]);
+                    d0 = _mm_aesenc_si128(d0,k[8]);
+                    d0 = _mm_aesenc_si128(d0,k[9]);
+                    d0 = _mm_aesenc_si128(d0,k[10]);
+                    __m128i *const outblk = reinterpret_cast<__m128i *>(out + (totalLen - 16));
+                    d0 = _mm_aesenc_si128(d0,k[11]);
+                    const __m128i p0 = _mm_loadu_si128(outblk);
+                    d0 = _mm_aesenc_si128(d0,k[12]);
+                    d0 = _mm_aesenc_si128(d0,k[13]);
+                    d0 = _mm_aesenclast_si128(d0,k[14]);
+                    _mm_storeu_si128(outblk,_mm_xor_si128(p0,d0));
+                    break;
+                }
+            }
+        }

-        if (len >= 16) {
-            do {
+        out += totalLen;
+        _len = totalLen + len;
+
+        if (likely(len >= 64)) {
+            if (Utils::CPUID.vaes) { // is only true if AVX is also present
+                if ((!Utils::CPUID.avx512f)||((len < 1024))) {
+                    p_aesCtrInnerVAES256(len,c0,c1,in,out,k);
+                } else {
+                    p_aesCtrInnerVAES512(len,c0,c1,in,out,k);
+                }
+            } else {
+                p_aesCtrInner128(len,c0,c1,in,out,k);
+            }
+        }
+
+        while (len >= 16) {
+            __m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
+            d0 = _mm_xor_si128(d0,k[0]);
+            d0 = _mm_aesenc_si128(d0,k[1]);

@@ -751,7 +765,6 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
             in += 16;
             len -= 16;
             out += 16;
-            } while (len >= 16);
         }

         // Any remaining input is placed in _out. This will be picked up and crypted
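The trailing comment describes the streaming contract: crypt() accepts arbitrary lengths, and a sub-block tail is staged in the output buffer to be completed by a later call. That is what the totalLen & 15U loop at the top of the new crypt() implements. A simplified scalar model of that buffering, with hypothetical names standing in for the class members:

#include <cstdint>

// Hypothetical stand-ins: CtrStream models AES::CTR's members, and
// cryptBlock models one in-place AES-CTR keystream application.
struct CtrStream {
    uint64_t ctr[2];   // IV / counter pair, low half big-endian (like _ctr)
    uint8_t *out;      // caller-supplied output buffer (like _out)
    unsigned int len;  // bytes written so far (like _len)
};

static void ctrAppend(CtrStream &s, const uint8_t *in, unsigned int n,
                      void (*cryptBlock)(CtrStream &, uint8_t *))
{
    while (n--) {
        s.out[s.len++] = *(in++);    // stage plaintext bytes in the output buffer
        if (!(s.len & 15U))          // a 16-byte block just completed...
            cryptBlock(s, s.out + (s.len - 16)); // ...XOR it with keystream in place
    }
}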