Blazing fast VAES (256-bit and 512-bit) AES-CTR, Identity fixes, test fixes.

Adam Ierymenko 2020-05-18 16:29:41 -07:00
parent d537428421
commit aad21cf395
No known key found for this signature in database
GPG key ID: C8877CF2D7A5D7F3
7 changed files with 419 additions and 311 deletions
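The AES-CTR fast paths added in AES.cpp below broadcast each 128-bit AES round key across a wider vector register so that a single VAES instruction advances two counter blocks (256-bit) or four (512-bit) at once, with plain AES-NI kept as the fallback. A minimal self-contained sketch of the 256-bit idea, assuming GCC/Clang with -mavx2 -mvaes and a hypothetical rk[15] array of expanded round keys (not ZeroTier's actual AES class or counter object):

#include <immintrin.h>
#include <cstdint>
#include <cstddef>

// Sketch only: AES-256-CTR over whole 64-byte chunks using 256-bit VAES.
// rk[0..14] are the 15 expanded 128-bit round keys; c0 is the static half of the
// 128-bit counter block and c1 the incrementing half, stored big-endian within the
// block (mirroring Utils::hton in the diff; __builtin_bswap64 stands in here).
// Counter write-back and the <64 byte tail are omitted for brevity.
static void vaes256CtrSketch(const __m128i rk[15], uint64_t c0, uint64_t c1,
                             const uint8_t *in, uint8_t *out, size_t len)
{
	__m256i k[15];
	for (int i = 0; i < 15; ++i)
		k[i] = _mm256_broadcastsi128_si256(rk[i]); // same round key in both 128-bit lanes
	while (len >= 64) {
		// Two keystream registers hold counter blocks n,n+1 and n+2,n+3.
		__m256i d0 = _mm256_set_epi64x((long long)__builtin_bswap64(c1 + 1), (long long)c0,
		                               (long long)__builtin_bswap64(c1), (long long)c0);
		__m256i d1 = _mm256_set_epi64x((long long)__builtin_bswap64(c1 + 3), (long long)c0,
		                               (long long)__builtin_bswap64(c1 + 2), (long long)c0);
		c1 += 4;
		d0 = _mm256_xor_si256(d0, k[0]);
		d1 = _mm256_xor_si256(d1, k[0]);
		for (int r = 1; r < 14; ++r) {           // 13 middle rounds of AES-256
			d0 = _mm256_aesenc_epi128(d0, k[r]); // one VAES op = two AES blocks
			d1 = _mm256_aesenc_epi128(d1, k[r]);
		}
		d0 = _mm256_aesenclast_epi128(d0, k[14]);
		d1 = _mm256_aesenclast_epi128(d1, k[14]);
		// XOR keystream with plaintext and store.
		_mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
			_mm256_xor_si256(d0, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(in))));
		_mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),
			_mm256_xor_si256(d1, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(in + 32))));
		in += 64; out += 64; len -= 64;
	}
}

The 512-bit path in the diff is the same idea using _mm512_broadcast_i32x4 and _mm512_aesenc_epi128; it is only taken when AVX-512F is present and the buffer is at least 1024 bytes, presumably to avoid AVX-512 frequency penalties on short inputs.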


@@ -96,7 +96,7 @@ if (
CMAKE_SYSTEM_PROCESSOR MATCHES "amd64"
)
message("++ Adding SSE and AES-NI flags for processor ${CMAKE_SYSTEM_PROCESSOR}")
add_compile_options(-maes -mrdrnd -mpclmul -msse -msse2 -mssse3)
add_compile_options(-maes -mrdrnd -mpclmul -msse -msse2 -mssse3 -msse4 -mavx -mavx2 -mavx512f -mvaes)
endif()
add_subdirectory(node)


@@ -14,8 +14,6 @@
#include "Constants.hpp"
#include "AES.hpp"
#include <cstdio>
namespace ZeroTier {
// GMAC ---------------------------------------------------------------------------------------------------------------
@@ -191,90 +189,93 @@ void AES::GMAC::update(const void *const data,unsigned int len) noexcept
}
}
while (len >= 64) {
__m128i d1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
__m128i d2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
__m128i d3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
__m128i d4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
in += 64;
len -= 64;
// This does 4X parallel mult_block via instruction level parallelism.
d1 = _mm_shuffle_epi8(_mm_xor_si128(y,d1),shuf);
d2 = _mm_shuffle_epi8(d2,shuf);
d3 = _mm_shuffle_epi8(d3,shuf);
d4 = _mm_shuffle_epi8(d4,shuf);
__m128i t0 = _mm_clmulepi64_si128(_aes._k.ni.hhhh,d1,0x00);
__m128i t1 = _mm_clmulepi64_si128(_aes._k.ni.hhh,d2,0x00);
__m128i t2 = _mm_clmulepi64_si128(_aes._k.ni.hh,d3,0x00);
__m128i t3 = _mm_clmulepi64_si128(_aes._k.ni.h,d4,0x00);
__m128i t8 = _mm_xor_si128(t0,t1);
t8 = _mm_xor_si128(t8,t2);
t8 = _mm_xor_si128(t8,t3);
__m128i t4 = _mm_clmulepi64_si128(_aes._k.ni.hhhh,d1,0x11);
__m128i t5 = _mm_clmulepi64_si128(_aes._k.ni.hhh,d2,0x11);
__m128i t6 = _mm_clmulepi64_si128(_aes._k.ni.hh,d3,0x11);
__m128i t7 = _mm_clmulepi64_si128(_aes._k.ni.h,d4,0x11);
__m128i t9 = _mm_xor_si128(t4,t5);
t9 = _mm_xor_si128(t9,t6);
t9 = _mm_xor_si128(t9,t7);
t0 = _mm_shuffle_epi32(_aes._k.ni.hhhh,78);
t4 = _mm_shuffle_epi32(d1,78);
t0 = _mm_xor_si128(t0,_aes._k.ni.hhhh);
t4 = _mm_xor_si128(t4,d1);
t1 = _mm_shuffle_epi32(_aes._k.ni.hhh,78);
t5 = _mm_shuffle_epi32(d2,78);
t1 = _mm_xor_si128(t1,_aes._k.ni.hhh);
t5 = _mm_xor_si128(t5,d2);
t2 = _mm_shuffle_epi32(_aes._k.ni.hh,78);
t6 = _mm_shuffle_epi32(d3,78);
t2 = _mm_xor_si128(t2,_aes._k.ni.hh);
t6 = _mm_xor_si128(t6,d3);
t3 = _mm_shuffle_epi32(_aes._k.ni.h,78);
t7 = _mm_shuffle_epi32(d4,78);
t3 = _mm_xor_si128(t3,_aes._k.ni.h);
t7 = _mm_xor_si128(t7,d4);
t0 = _mm_clmulepi64_si128(t0,t4,0x00);
t1 = _mm_clmulepi64_si128(t1,t5,0x00);
t2 = _mm_clmulepi64_si128(t2,t6,0x00);
t3 = _mm_clmulepi64_si128(t3,t7,0x00);
t0 = _mm_xor_si128(t0,t8);
t0 = _mm_xor_si128(t0,t9);
t0 = _mm_xor_si128(t1,t0);
t0 = _mm_xor_si128(t2,t0);
t0 = _mm_xor_si128(t3,t0);
t4 = _mm_slli_si128(t0,8);
t0 = _mm_srli_si128(t0,8);
t3 = _mm_xor_si128(t4,t8);
t6 = _mm_xor_si128(t0,t9);
t7 = _mm_srli_epi32(t3,31);
t8 = _mm_srli_epi32(t6,31);
t3 = _mm_slli_epi32(t3,1);
t6 = _mm_slli_epi32(t6,1);
t9 = _mm_srli_si128(t7,12);
t8 = _mm_slli_si128(t8,4);
t7 = _mm_slli_si128(t7,4);
t3 = _mm_or_si128(t3,t7);
t6 = _mm_or_si128(t6,t8);
t6 = _mm_or_si128(t6,t9);
t7 = _mm_slli_epi32(t3,31);
t8 = _mm_slli_epi32(t3,30);
t9 = _mm_slli_epi32(t3,25);
t7 = _mm_xor_si128(t7,t8);
t7 = _mm_xor_si128(t7,t9);
t8 = _mm_srli_si128(t7,4);
t7 = _mm_slli_si128(t7,12);
t3 = _mm_xor_si128(t3,t7);
t2 = _mm_srli_epi32(t3,1);
t4 = _mm_srli_epi32(t3,2);
t5 = _mm_srli_epi32(t3,7);
t2 = _mm_xor_si128(t2,t4);
t2 = _mm_xor_si128(t2,t5);
t2 = _mm_xor_si128(t2,t8);
t3 = _mm_xor_si128(t3,t2);
t6 = _mm_xor_si128(t6,t3);
y = _mm_shuffle_epi8(t6,shuf);
if (likely(len >= 64)) {
const __m128i h = _aes._k.ni.h;
const __m128i hh = _aes._k.ni.hh;
const __m128i hhh = _aes._k.ni.hhh;
const __m128i hhhh = _aes._k.ni.hhhh;
do {
__m128i d1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
__m128i d2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
__m128i d3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
__m128i d4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
in += 64;
len -= 64;
d1 = _mm_shuffle_epi8(_mm_xor_si128(y,d1),shuf);
d2 = _mm_shuffle_epi8(d2,shuf);
d3 = _mm_shuffle_epi8(d3,shuf);
d4 = _mm_shuffle_epi8(d4,shuf);
__m128i t0 = _mm_clmulepi64_si128(hhhh,d1,0x00);
__m128i t1 = _mm_clmulepi64_si128(hhh,d2,0x00);
__m128i t2 = _mm_clmulepi64_si128(hh,d3,0x00);
__m128i t8 = _mm_xor_si128(t0,t1);
t8 = _mm_xor_si128(t8,t2);
__m128i t3 = _mm_clmulepi64_si128(h,d4,0x00);
__m128i t4 = _mm_clmulepi64_si128(hhhh,d1,0x11);
__m128i t5 = _mm_clmulepi64_si128(hhh,d2,0x11);
t8 = _mm_xor_si128(t8,t3);
__m128i t6 = _mm_clmulepi64_si128(hh,d3,0x11);
__m128i t7 = _mm_clmulepi64_si128(h,d4,0x11);
__m128i t9 = _mm_xor_si128(t4,t5);
t9 = _mm_xor_si128(t9,t6);
t9 = _mm_xor_si128(t9,t7);
t0 = _mm_shuffle_epi32(hhhh,78);
t4 = _mm_shuffle_epi32(d1,78);
t0 = _mm_xor_si128(t0,hhhh);
t4 = _mm_xor_si128(t4,d1);
t1 = _mm_shuffle_epi32(hhh,78);
t5 = _mm_shuffle_epi32(d2,78);
t1 = _mm_xor_si128(t1,hhh);
t5 = _mm_xor_si128(t5,d2);
t2 = _mm_shuffle_epi32(hh,78);
t6 = _mm_shuffle_epi32(d3,78);
t2 = _mm_xor_si128(t2,hh);
t6 = _mm_xor_si128(t6,d3);
t3 = _mm_shuffle_epi32(h,78);
t7 = _mm_shuffle_epi32(d4,78);
t3 = _mm_xor_si128(t3,h);
t7 = _mm_xor_si128(t7,d4);
t0 = _mm_clmulepi64_si128(t0,t4,0x00);
t1 = _mm_clmulepi64_si128(t1,t5,0x00);
t2 = _mm_clmulepi64_si128(t2,t6,0x00);
t3 = _mm_clmulepi64_si128(t3,t7,0x00);
t0 = _mm_xor_si128(t0,t8);
t0 = _mm_xor_si128(t0,t9);
t0 = _mm_xor_si128(t1,t0);
t0 = _mm_xor_si128(t2,t0);
t0 = _mm_xor_si128(t3,t0);
t4 = _mm_slli_si128(t0,8);
t0 = _mm_srli_si128(t0,8);
t3 = _mm_xor_si128(t4,t8);
t6 = _mm_xor_si128(t0,t9);
t7 = _mm_srli_epi32(t3,31);
t8 = _mm_srli_epi32(t6,31);
t3 = _mm_slli_epi32(t3,1);
t6 = _mm_slli_epi32(t6,1);
t9 = _mm_srli_si128(t7,12);
t8 = _mm_slli_si128(t8,4);
t7 = _mm_slli_si128(t7,4);
t3 = _mm_or_si128(t3,t7);
t6 = _mm_or_si128(t6,t8);
t6 = _mm_or_si128(t6,t9);
t7 = _mm_slli_epi32(t3,31);
t8 = _mm_slli_epi32(t3,30);
t9 = _mm_slli_epi32(t3,25);
t7 = _mm_xor_si128(t7,t8);
t7 = _mm_xor_si128(t7,t9);
t8 = _mm_srli_si128(t7,4);
t7 = _mm_slli_si128(t7,12);
t3 = _mm_xor_si128(t3,t7);
t2 = _mm_srli_epi32(t3,1);
t4 = _mm_srli_epi32(t3,2);
t5 = _mm_srli_epi32(t3,7);
t2 = _mm_xor_si128(t2,t4);
t2 = _mm_xor_si128(t2,t5);
t2 = _mm_xor_si128(t2,t8);
t3 = _mm_xor_si128(t3,t2);
t6 = _mm_xor_si128(t6,t3);
y = _mm_shuffle_epi8(t6,shuf);
} while (len >= 64);
}
while (len >= 16) {
@@ -476,29 +477,13 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
if (likely(Utils::CPUID.aes)) {
uint64_t c0 = _ctr[0];
uint64_t c1 = Utils::ntoh(_ctr[1]);
// This uses some spare XMM registers to hold some of the key.
const __m128i *const k = _aes._k.ni.k;
const __m128i k0 = k[0];
const __m128i k1 = k[1];
const __m128i k2 = k[2];
const __m128i k3 = k[3];
const __m128i k4 = k[4];
const __m128i k5 = k[5];
// Complete any unfinished blocks from previous calls to crypt().
unsigned int totalLen = _len;
if ((totalLen & 15U)) {
const __m128i k7 = k[7];
const __m128i k8 = k[8];
const __m128i k9 = k[9];
const __m128i k10 = k[10];
const __m128i k11 = k[11];
const __m128i k12 = k[12];
const __m128i k13 = k[13];
const __m128i k14 = k[14];
for (;;) {
if (!len) {
if (unlikely(!len)) {
_ctr[0] = c0;
_ctr[1] = Utils::hton(c1);
_len = totalLen;
@@ -508,23 +493,23 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
out[totalLen++] = *(in++);
if (!(totalLen & 15U)) {
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
d0 = _mm_xor_si128(d0,k0);
d0 = _mm_aesenc_si128(d0,k1);
d0 = _mm_aesenc_si128(d0,k2);
d0 = _mm_aesenc_si128(d0,k3);
d0 = _mm_aesenc_si128(d0,k4);
d0 = _mm_aesenc_si128(d0,k5);
d0 = _mm_xor_si128(d0,k[0]);
d0 = _mm_aesenc_si128(d0,k[1]);
d0 = _mm_aesenc_si128(d0,k[2]);
d0 = _mm_aesenc_si128(d0,k[3]);
d0 = _mm_aesenc_si128(d0,k[4]);
d0 = _mm_aesenc_si128(d0,k[5]);
d0 = _mm_aesenc_si128(d0,k[6]);
d0 = _mm_aesenc_si128(d0,k7);
d0 = _mm_aesenc_si128(d0,k8);
d0 = _mm_aesenc_si128(d0,k9);
d0 = _mm_aesenc_si128(d0,k10);
d0 = _mm_aesenc_si128(d0,k[7]);
d0 = _mm_aesenc_si128(d0,k[8]);
d0 = _mm_aesenc_si128(d0,k[9]);
d0 = _mm_aesenc_si128(d0,k[10]);
__m128i *const outblk = reinterpret_cast<__m128i *>(out + (totalLen - 16));
d0 = _mm_aesenc_si128(d0,k11);
d0 = _mm_aesenc_si128(d0,k[11]);
const __m128i p0 = _mm_loadu_si128(outblk);
d0 = _mm_aesenc_si128(d0,k12);
d0 = _mm_aesenc_si128(d0,k13);
d0 = _mm_aesenclast_si128(d0,k14);
d0 = _mm_aesenc_si128(d0,k[12]);
d0 = _mm_aesenc_si128(d0,k[13]);
d0 = _mm_aesenclast_si128(d0,k[14]);
_mm_storeu_si128(outblk,_mm_xor_si128(p0,d0));
break;
}
@@ -532,128 +517,236 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
}
out += totalLen;
_len = (totalLen + len);
_len = totalLen + len;
while (len >= 64) {
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
__m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
__m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
__m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
c1 += 4;
d0 = _mm_xor_si128(d0,k0);
d1 = _mm_xor_si128(d1,k0);
d2 = _mm_xor_si128(d2,k0);
d3 = _mm_xor_si128(d3,k0);
d0 = _mm_aesenc_si128(d0,k1);
d1 = _mm_aesenc_si128(d1,k1);
d2 = _mm_aesenc_si128(d2,k1);
d3 = _mm_aesenc_si128(d3,k1);
__m128i ka = k[6];
d0 = _mm_aesenc_si128(d0,k2);
d1 = _mm_aesenc_si128(d1,k2);
d2 = _mm_aesenc_si128(d2,k2);
d3 = _mm_aesenc_si128(d3,k2);
__m128i kb = k[7];
d0 = _mm_aesenc_si128(d0,k3);
d1 = _mm_aesenc_si128(d1,k3);
d2 = _mm_aesenc_si128(d2,k3);
d3 = _mm_aesenc_si128(d3,k3);
__m128i kc = k[8];
d0 = _mm_aesenc_si128(d0,k4);
d1 = _mm_aesenc_si128(d1,k4);
d2 = _mm_aesenc_si128(d2,k4);
d3 = _mm_aesenc_si128(d3,k4);
__m128i kd = k[9];
d0 = _mm_aesenc_si128(d0,k5);
d1 = _mm_aesenc_si128(d1,k5);
d2 = _mm_aesenc_si128(d2,k5);
d3 = _mm_aesenc_si128(d3,k5);
__m128i ke = k[10];
d0 = _mm_aesenc_si128(d0,ka);
d1 = _mm_aesenc_si128(d1,ka);
d2 = _mm_aesenc_si128(d2,ka);
d3 = _mm_aesenc_si128(d3,ka);
__m128i kf = k[11];
d0 = _mm_aesenc_si128(d0,kb);
d1 = _mm_aesenc_si128(d1,kb);
d2 = _mm_aesenc_si128(d2,kb);
d3 = _mm_aesenc_si128(d3,kb);
ka = k[12];
d0 = _mm_aesenc_si128(d0,kc);
d1 = _mm_aesenc_si128(d1,kc);
d2 = _mm_aesenc_si128(d2,kc);
d3 = _mm_aesenc_si128(d3,kc);
kb = k[13];
d0 = _mm_aesenc_si128(d0,kd);
d1 = _mm_aesenc_si128(d1,kd);
d2 = _mm_aesenc_si128(d2,kd);
d3 = _mm_aesenc_si128(d3,kd);
kc = k[14];
d0 = _mm_aesenc_si128(d0,ke);
d1 = _mm_aesenc_si128(d1,ke);
d2 = _mm_aesenc_si128(d2,ke);
d3 = _mm_aesenc_si128(d3,ke);
kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
d0 = _mm_aesenc_si128(d0,kf);
d1 = _mm_aesenc_si128(d1,kf);
d2 = _mm_aesenc_si128(d2,kf);
d3 = _mm_aesenc_si128(d3,kf);
ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
d0 = _mm_aesenc_si128(d0,ka);
d1 = _mm_aesenc_si128(d1,ka);
d2 = _mm_aesenc_si128(d2,ka);
d3 = _mm_aesenc_si128(d3,ka);
kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
d0 = _mm_aesenc_si128(d0,kb);
d1 = _mm_aesenc_si128(d1,kb);
d2 = _mm_aesenc_si128(d2,kb);
d3 = _mm_aesenc_si128(d3,kb);
ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
d0 = _mm_aesenclast_si128(d0,kc);
d1 = _mm_aesenclast_si128(d1,kc);
d2 = _mm_aesenclast_si128(d2,kc);
d3 = _mm_aesenclast_si128(d3,kc);
kd = _mm_xor_si128(d0,kd);
ke = _mm_xor_si128(d1,ke);
kf = _mm_xor_si128(d2,kf);
ka = _mm_xor_si128(d3,ka);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
in += 64;
len -= 64;
out += 64;
if (likely(len >= 64)) {
if (Utils::CPUID.vaes) { // is only true if AVX is also present
if ((!Utils::CPUID.avx512f)||((len < 1024))) {
const __m256i kk0 = _mm256_broadcastsi128_si256(k[0]);
const __m256i kk1 = _mm256_broadcastsi128_si256(k[1]);
const __m256i kk2 = _mm256_broadcastsi128_si256(k[2]);
const __m256i kk3 = _mm256_broadcastsi128_si256(k[3]);
const __m256i kk4 = _mm256_broadcastsi128_si256(k[4]);
const __m256i kk5 = _mm256_broadcastsi128_si256(k[5]);
const __m256i kk6 = _mm256_broadcastsi128_si256(k[6]);
const __m256i kk7 = _mm256_broadcastsi128_si256(k[7]);
const __m256i kk8 = _mm256_broadcastsi128_si256(k[8]);
const __m256i kk9 = _mm256_broadcastsi128_si256(k[9]);
const __m256i kk10 = _mm256_broadcastsi128_si256(k[10]);
const __m256i kk11 = _mm256_broadcastsi128_si256(k[11]);
const __m256i kk12 = _mm256_broadcastsi128_si256(k[12]);
const __m256i kk13 = _mm256_broadcastsi128_si256(k[13]);
const __m256i kk14 = _mm256_broadcastsi128_si256(k[14]);
do {
__m256i d0 = _mm256_set_epi64x(
(long long)Utils::hton(c1 + 1ULL),(long long)c0,
(long long)Utils::hton(c1),(long long)c0);
__m256i d1 = _mm256_set_epi64x(
(long long)Utils::hton(c1 + 3ULL),(long long)c0,
(long long)Utils::hton(c1 + 2ULL),(long long)c0);
c1 += 4;
__m256i p0 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(in));
__m256i p1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(in + 32));
in += 64;
d0 = _mm256_xor_si256(d0,kk0);
d1 = _mm256_xor_si256(d1,kk0);
d0 = _mm256_aesenc_epi128(d0,kk1);
d1 = _mm256_aesenc_epi128(d1,kk1);
d0 = _mm256_aesenc_epi128(d0,kk2);
d1 = _mm256_aesenc_epi128(d1,kk2);
d0 = _mm256_aesenc_epi128(d0,kk3);
d1 = _mm256_aesenc_epi128(d1,kk3);
d0 = _mm256_aesenc_epi128(d0,kk4);
d1 = _mm256_aesenc_epi128(d1,kk4);
d0 = _mm256_aesenc_epi128(d0,kk5);
d1 = _mm256_aesenc_epi128(d1,kk5);
d0 = _mm256_aesenc_epi128(d0,kk6);
d1 = _mm256_aesenc_epi128(d1,kk6);
d0 = _mm256_aesenc_epi128(d0,kk7);
d1 = _mm256_aesenc_epi128(d1,kk7);
d0 = _mm256_aesenc_epi128(d0,kk8);
d1 = _mm256_aesenc_epi128(d1,kk8);
d0 = _mm256_aesenc_epi128(d0,kk9);
d1 = _mm256_aesenc_epi128(d1,kk9);
d0 = _mm256_aesenc_epi128(d0,kk10);
d1 = _mm256_aesenc_epi128(d1,kk10);
d0 = _mm256_aesenc_epi128(d0,kk11);
d1 = _mm256_aesenc_epi128(d1,kk11);
d0 = _mm256_aesenc_epi128(d0,kk12);
d1 = _mm256_aesenc_epi128(d1,kk12);
d0 = _mm256_aesenc_epi128(d0,kk13);
d1 = _mm256_aesenc_epi128(d1,kk13);
d0 = _mm256_aesenclast_epi128(d0,kk14);
d1 = _mm256_aesenclast_epi128(d1,kk14);
_mm256_storeu_si256(reinterpret_cast<__m256i *>(out),_mm256_xor_si256(d0,p0));
_mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),_mm256_xor_si256(d1,p1));
out += 64;
len -= 64;
} while (len >= 64);
} else {
const __m512i kk0 = _mm512_broadcast_i32x4(k[0]);
const __m512i kk1 = _mm512_broadcast_i32x4(k[1]);
const __m512i kk2 = _mm512_broadcast_i32x4(k[2]);
const __m512i kk3 = _mm512_broadcast_i32x4(k[3]);
const __m512i kk4 = _mm512_broadcast_i32x4(k[4]);
const __m512i kk5 = _mm512_broadcast_i32x4(k[5]);
const __m512i kk6 = _mm512_broadcast_i32x4(k[6]);
const __m512i kk7 = _mm512_broadcast_i32x4(k[7]);
const __m512i kk8 = _mm512_broadcast_i32x4(k[8]);
const __m512i kk9 = _mm512_broadcast_i32x4(k[9]);
const __m512i kk10 = _mm512_broadcast_i32x4(k[10]);
const __m512i kk11 = _mm512_broadcast_i32x4(k[11]);
const __m512i kk12 = _mm512_broadcast_i32x4(k[12]);
const __m512i kk13 = _mm512_broadcast_i32x4(k[13]);
const __m512i kk14 = _mm512_broadcast_i32x4(k[14]);
do {
__m512i d0 = _mm512_set_epi64(
(long long)Utils::hton(c1 + 3ULL),(long long)c0,
(long long)Utils::hton(c1 + 2ULL),(long long)c0,
(long long)Utils::hton(c1 + 1ULL),(long long)c0,
(long long)Utils::hton(c1),(long long)c0);
c1 += 4;
__m512i p0 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(in));
in += 64;
d0 = _mm512_xor_si512(d0,kk0);
d0 = _mm512_aesenc_epi128(d0,kk1);
d0 = _mm512_aesenc_epi128(d0,kk2);
d0 = _mm512_aesenc_epi128(d0,kk3);
d0 = _mm512_aesenc_epi128(d0,kk4);
d0 = _mm512_aesenc_epi128(d0,kk5);
d0 = _mm512_aesenc_epi128(d0,kk6);
d0 = _mm512_aesenc_epi128(d0,kk7);
d0 = _mm512_aesenc_epi128(d0,kk8);
d0 = _mm512_aesenc_epi128(d0,kk9);
d0 = _mm512_aesenc_epi128(d0,kk10);
d0 = _mm512_aesenc_epi128(d0,kk11);
d0 = _mm512_aesenc_epi128(d0,kk12);
d0 = _mm512_aesenc_epi128(d0,kk13);
d0 = _mm512_aesenclast_epi128(d0,kk14);
_mm512_storeu_si512(reinterpret_cast<__m512i *>(out),_mm512_xor_si512(p0,d0));
out += 64;
len -= 64;
} while (len >= 64);
}
} else {
const __m128i k0 = k[0];
const __m128i k1 = k[1];
const __m128i k2 = k[2];
const __m128i k3 = k[3];
const __m128i k4 = k[4];
const __m128i k5 = k[5];
const __m128i k6 = k[6];
const __m128i k7 = k[7];
const __m128i k8 = k[8];
const __m128i k9 = k[9];
const __m128i k10 = k[10];
const __m128i k11 = k[11];
const __m128i k12 = k[12];
const __m128i k13 = k[13];
const __m128i k14 = k[14];
do {
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
__m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
__m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
__m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
c1 += 4;
d0 = _mm_xor_si128(d0,k0);
d1 = _mm_xor_si128(d1,k0);
d2 = _mm_xor_si128(d2,k0);
d3 = _mm_xor_si128(d3,k0);
d0 = _mm_aesenc_si128(d0,k1);
d1 = _mm_aesenc_si128(d1,k1);
d2 = _mm_aesenc_si128(d2,k1);
d3 = _mm_aesenc_si128(d3,k1);
d0 = _mm_aesenc_si128(d0,k2);
d1 = _mm_aesenc_si128(d1,k2);
d2 = _mm_aesenc_si128(d2,k2);
d3 = _mm_aesenc_si128(d3,k2);
d0 = _mm_aesenc_si128(d0,k3);
d1 = _mm_aesenc_si128(d1,k3);
d2 = _mm_aesenc_si128(d2,k3);
d3 = _mm_aesenc_si128(d3,k3);
d0 = _mm_aesenc_si128(d0,k4);
d1 = _mm_aesenc_si128(d1,k4);
d2 = _mm_aesenc_si128(d2,k4);
d3 = _mm_aesenc_si128(d3,k4);
d0 = _mm_aesenc_si128(d0,k5);
d1 = _mm_aesenc_si128(d1,k5);
d2 = _mm_aesenc_si128(d2,k5);
d3 = _mm_aesenc_si128(d3,k5);
d0 = _mm_aesenc_si128(d0,k6);
d1 = _mm_aesenc_si128(d1,k6);
d2 = _mm_aesenc_si128(d2,k6);
d3 = _mm_aesenc_si128(d3,k6);
d0 = _mm_aesenc_si128(d0,k7);
d1 = _mm_aesenc_si128(d1,k7);
d2 = _mm_aesenc_si128(d2,k7);
d3 = _mm_aesenc_si128(d3,k7);
d0 = _mm_aesenc_si128(d0,k8);
d1 = _mm_aesenc_si128(d1,k8);
d2 = _mm_aesenc_si128(d2,k8);
d3 = _mm_aesenc_si128(d3,k8);
d0 = _mm_aesenc_si128(d0,k9);
d1 = _mm_aesenc_si128(d1,k9);
d2 = _mm_aesenc_si128(d2,k9);
d3 = _mm_aesenc_si128(d3,k9);
d0 = _mm_aesenc_si128(d0,k10);
d1 = _mm_aesenc_si128(d1,k10);
d2 = _mm_aesenc_si128(d2,k10);
d3 = _mm_aesenc_si128(d3,k10);
__m128i p0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
d0 = _mm_aesenc_si128(d0,k11);
d1 = _mm_aesenc_si128(d1,k11);
d2 = _mm_aesenc_si128(d2,k11);
d3 = _mm_aesenc_si128(d3,k11);
__m128i p1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
d0 = _mm_aesenc_si128(d0,k12);
d1 = _mm_aesenc_si128(d1,k12);
d2 = _mm_aesenc_si128(d2,k12);
d3 = _mm_aesenc_si128(d3,k12);
__m128i p2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
d0 = _mm_aesenc_si128(d0,k13);
d1 = _mm_aesenc_si128(d1,k13);
d2 = _mm_aesenc_si128(d2,k13);
d3 = _mm_aesenc_si128(d3,k13);
__m128i p3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
in += 64;
d0 = _mm_aesenclast_si128(d0,k14);
d1 = _mm_aesenclast_si128(d1,k14);
d2 = _mm_aesenclast_si128(d2,k14);
d3 = _mm_aesenclast_si128(d3,k14);
p0 = _mm_xor_si128(d0,p0);
p1 = _mm_xor_si128(d1,p1);
p2 = _mm_xor_si128(d2,p2);
p3 = _mm_xor_si128(d3,p3);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),p0);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),p1);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),p2);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),p3);
out += 64;
len -= 64;
} while (len >= 64);
}
}
if (len >= 16) {
const __m128i k7 = k[7];
const __m128i k8 = k[8];
const __m128i k9 = k[9];
const __m128i k10 = k[10];
const __m128i k11 = k[11];
const __m128i k12 = k[12];
const __m128i k13 = k[13];
const __m128i k14 = k[14];
do {
__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
d0 = _mm_xor_si128(d0,k0);
d0 = _mm_aesenc_si128(d0,k1);
d0 = _mm_aesenc_si128(d0,k2);
d0 = _mm_aesenc_si128(d0,k3);
d0 = _mm_aesenc_si128(d0,k4);
d0 = _mm_aesenc_si128(d0,k5);
d0 = _mm_xor_si128(d0,k[0]);
d0 = _mm_aesenc_si128(d0,k[1]);
d0 = _mm_aesenc_si128(d0,k[2]);
d0 = _mm_aesenc_si128(d0,k[3]);
d0 = _mm_aesenc_si128(d0,k[4]);
d0 = _mm_aesenc_si128(d0,k[5]);
d0 = _mm_aesenc_si128(d0,k[6]);
d0 = _mm_aesenc_si128(d0,k7);
d0 = _mm_aesenc_si128(d0,k8);
d0 = _mm_aesenc_si128(d0,k9);
d0 = _mm_aesenc_si128(d0,k10);
d0 = _mm_aesenc_si128(d0,k11);
d0 = _mm_aesenc_si128(d0,k12);
d0 = _mm_aesenc_si128(d0,k13);
d0 = _mm_aesenclast_si128(d0,k14);
d0 = _mm_aesenc_si128(d0,k[7]);
d0 = _mm_aesenc_si128(d0,k[8]);
d0 = _mm_aesenc_si128(d0,k[9]);
d0 = _mm_aesenc_si128(d0,k[10]);
d0 = _mm_aesenc_si128(d0,k[11]);
d0 = _mm_aesenc_si128(d0,k[12]);
d0 = _mm_aesenc_si128(d0,k[13]);
d0 = _mm_aesenclast_si128(d0,k[14]);
_mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
in += 16;
len -= 16;


@@ -15,6 +15,7 @@
#include "Identity.hpp"
#include "SHA512.hpp"
#include "Salsa20.hpp"
#include "Poly1305.hpp"
#include "Utils.hpp"
#include <algorithm>
@@ -39,14 +40,14 @@ void identityV0ProofOfWorkFrankenhash(const void *const publicKey, unsigned int
s20.crypt20((char *) genmem, (char *) genmem, 64);
for (unsigned long i = 64;i < ZT_V0_IDENTITY_GEN_MEMORY;i += 64) {
unsigned long k = i - 64;
*((uint64_t *) ((char *) genmem + i)) = *((uint64_t *) ((char *) genmem + k));
*((uint64_t *) ((char *) genmem + i + 8)) = *((uint64_t *) ((char *) genmem + k + 8));
*((uint64_t *) ((char *) genmem + i + 16)) = *((uint64_t *) ((char *) genmem + k + 16));
*((uint64_t *) ((char *) genmem + i + 24)) = *((uint64_t *) ((char *) genmem + k + 24));
*((uint64_t *) ((char *) genmem + i + 32)) = *((uint64_t *) ((char *) genmem + k + 32));
*((uint64_t *) ((char *) genmem + i + 40)) = *((uint64_t *) ((char *) genmem + k + 40));
*((uint64_t *) ((char *) genmem + i + 48)) = *((uint64_t *) ((char *) genmem + k + 48));
*((uint64_t *) ((char *) genmem + i + 56)) = *((uint64_t *) ((char *) genmem + k + 56));
*((uint64_t * )((char *) genmem + i)) = *((uint64_t * )((char *) genmem + k));
*((uint64_t * )((char *) genmem + i + 8)) = *((uint64_t * )((char *) genmem + k + 8));
*((uint64_t * )((char *) genmem + i + 16)) = *((uint64_t * )((char *) genmem + k + 16));
*((uint64_t * )((char *) genmem + i + 24)) = *((uint64_t * )((char *) genmem + k + 24));
*((uint64_t * )((char *) genmem + i + 32)) = *((uint64_t * )((char *) genmem + k + 32));
*((uint64_t * )((char *) genmem + i + 40)) = *((uint64_t * )((char *) genmem + k + 40));
*((uint64_t * )((char *) genmem + i + 48)) = *((uint64_t * )((char *) genmem + k + 48));
*((uint64_t * )((char *) genmem + i + 56)) = *((uint64_t * )((char *) genmem + k + 56));
s20.crypt20((char *) genmem + i, (char *) genmem + i, 64);
}
@@ -78,49 +79,58 @@ struct identityV0ProofOfWorkCriteria
#define ZT_IDENTITY_V1_POW_MEMORY_SIZE 131072
// This is a simpler memory-intensive hash function for V1 identity generation.
// It's not quite as heavy as the V0 frankenhash, is a little more orderly in
// its design, but remains relatively resistant to GPU acceleration due to memory
// requirements for efficient computation.
struct p_CompareLittleEndian
{
#if __BYTE_ORDER == __BIG_ENDIAN
ZT_INLINE bool operator()(const uint64_t a,const uint64_t b) const noexcept { return Utils::swapBytes(a) < Utils::swapBytes(b); }
#else
ZT_INLINE bool operator()(const uint64_t a,const uint64_t b) const noexcept { return a < b; }
#endif
};
// This is a simpler memory-intensive frankenhash for V1 identity generation.
bool identityV1ProofOfWorkCriteria(const void *in, const unsigned int len)
{
uint64_t b[ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8];
uint64_t w[ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8];
SHA384(b, in, len);
Utils::zero<ZT_IDENTITY_V1_POW_MEMORY_SIZE - 48>(b + 6);
Salsa20(b,b + 4).crypt12(b,b,ZT_IDENTITY_V1_POW_MEMORY_SIZE);
#if __BYTE_ORDER == __BIG_ENDIAN
for (unsigned int i=0;i<(ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8);) {
const unsigned int i1 = i + 1;
const unsigned int i2 = i + 2;
const unsigned int i3 = i + 3;
b[i] = Utils::swapBytes(b[i]);
i += 4;
b[i1] = Utils::swapBytes(b[i1]);
b[i2] = Utils::swapBytes(b[i2]);
b[i3] = Utils::swapBytes(b[i3]);
// Fill work buffer with pseudorandom bytes using a construction that should be
// relatively hostile to GPU acceleration. GPUs usually implement branching by
// executing all branches and then selecting the answer, which means this
// construction should require a GPU to do ~3X the work of a CPU per iteration.
SHA512(w, in, len);
for (unsigned int i = 8, j = 0;i < (ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8);) {
uint64_t *const ww = w + i;
const uint64_t *const wp = w + j;
i += 8;
j += 8;
if ((wp[0] & 7U) == 0) {
SHA512(ww, wp, 64);
} else if ((wp[1] & 15U) == 0) {
ww[0] = Utils::hton(Utils::ntoh(wp[0]) % 4503599627370101ULL);
ww[1] = Utils::hton(Utils::ntoh(wp[1]) % 4503599627370161ULL);
ww[2] = Utils::hton(Utils::ntoh(wp[2]) % 4503599627370227ULL);
ww[3] = Utils::hton(Utils::ntoh(wp[3]) % 4503599627370287ULL);
ww[4] = Utils::hton(Utils::ntoh(wp[4]) % 4503599627370299ULL);
ww[5] = Utils::hton(Utils::ntoh(wp[5]) % 4503599627370323ULL);
ww[6] = Utils::hton(Utils::ntoh(wp[6]) % 4503599627370353ULL);
ww[7] = Utils::hton(Utils::ntoh(wp[7]) % 4503599627370449ULL);
SHA384(ww, wp, 128);
} else {
Salsa20(wp, wp + 4).crypt12(wp, ww, 64);
}
}
#endif
std::sort(b,b + (ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8));
// Sort the 64-bit integers (interpreted as little-endian) into ascending order and
// compute a cryptographic checksum. Sorting makes each value's final position depend
// on all the other values, which makes it extremely hard to build a speed-competitive
// implementation that skimps on the memory requirement.
std::sort(w, w + (ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8), p_CompareLittleEndian());
Poly1305::compute(w, w, ZT_IDENTITY_V1_POW_MEMORY_SIZE, w);
#if __BYTE_ORDER == __BIG_ENDIAN
for (unsigned int i=0;i<(ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8);) {
const unsigned int i1 = i + 1;
const unsigned int i2 = i + 2;
const unsigned int i3 = i + 3;
b[i] = Utils::swapBytes(b[i]);
i += 4;
b[i1] = Utils::swapBytes(b[i1]);
b[i2] = Utils::swapBytes(b[i2]);
b[i3] = Utils::swapBytes(b[i3]);
}
#endif
SHA384(b, b, ZT_IDENTITY_V1_POW_MEMORY_SIZE, in, len);
return (b[0] % 1093U) == 0;
// The PoW criterion is met if this is true. The value 593 was chosen experimentally
// to give good average identity generation time while still making intentionally
// engineered identity collisions expensive.
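// With w[0] effectively uniform, the check below passes with probability roughly 1/593,
// so generating an identity takes on the order of 593 runs of the memory-hard loop above.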
return (Utils::ntoh(w[0]) % 593U) == 0;
}
} // anonymous namespace
@@ -145,7 +155,7 @@ bool Identity::generate(const Type t)
address.setTo(digest + 59);
} while (address.isReserved());
delete[] genmem;
m_fp.m_cfp.address = address.toInt();
m_fp.m_cfp.address = address.toInt(); // address comes from PoW hash for type 0 identities
m_computeHash();
} break;
@@ -167,6 +177,7 @@ bool Identity::generate(const Type t)
// If we passed PoW then check that the address is valid, otherwise loop
// back around and run the whole process again.
m_computeHash();
m_fp.m_cfp.address = Address(m_fp.m_cfp.hash).toInt();
if (!m_fp.address().isReserved())
break;
}
@@ -351,10 +362,8 @@ bool Identity::fromString(const char *str)
case 0:
m_fp.m_cfp.address = Utils::hexStrToU64(f) & ZT_ADDRESS_MASK;
if (m_fp.address().isReserved()) {
memoryZero(this);
if (m_fp.address().isReserved())
return false;
}
break;
case 1:
@@ -363,7 +372,6 @@ bool Identity::fromString(const char *str)
} else if ((f[0] == '1') && (!f[1])) {
m_type = P384;
} else {
memoryZero(this);
return false;
}
break;
@@ -372,17 +380,13 @@ bool Identity::fromString(const char *str)
switch (m_type) {
case C25519:
if (Utils::unhex(f, strlen(f), m_pub, ZT_C25519_COMBINED_PUBLIC_KEY_SIZE) != ZT_C25519_COMBINED_PUBLIC_KEY_SIZE) {
memoryZero(this);
if (Utils::unhex(f, strlen(f), m_pub, ZT_C25519_COMBINED_PUBLIC_KEY_SIZE) != ZT_C25519_COMBINED_PUBLIC_KEY_SIZE)
return false;
}
break;
case P384:
if (Utils::b32d(f, m_pub, sizeof(m_pub)) != sizeof(m_pub)) {
memoryZero(this);
if (Utils::b32d(f, m_pub, sizeof(m_pub)) != sizeof(m_pub))
return false;
}
break;
}
@@ -394,7 +398,6 @@ bool Identity::fromString(const char *str)
case C25519:
if (Utils::unhex(f, strlen(f), m_priv, ZT_C25519_COMBINED_PRIVATE_KEY_SIZE) != ZT_C25519_COMBINED_PRIVATE_KEY_SIZE) {
memoryZero(this);
return false;
} else {
m_hasPrivate = true;
@@ -403,7 +406,6 @@ bool Identity::fromString(const char *str)
case P384:
if (Utils::b32d(f, m_priv, sizeof(m_priv)) != sizeof(m_priv)) {
memoryZero(this);
return false;
} else {
m_hasPrivate = true;
@@ -417,16 +419,12 @@
}
}
if (fno < 3) {
memoryZero(this);
if (fno < 3)
return false;
}
m_computeHash();
if ((m_type == P384) && (m_fp.address() != Address(m_fp.hash()))) {
memoryZero(this);
if ((m_type == P384) && (m_fp.address() != Address(m_fp.hash())))
return false;
}
return true;
}
@@ -533,7 +531,6 @@ void Identity::m_computeHash()
break;
case P384:
SHA384(m_fp.m_cfp.hash, m_pub, sizeof(m_pub));
m_fp.m_cfp.address = Address(m_fp.m_cfp.hash).toInt();
break;
}
}


@@ -100,7 +100,11 @@
#if (defined(__amd64) || defined(__amd64__) || defined(__x86_64) || defined(__x86_64__) || defined(__AMD64) || defined(__AMD64__) || defined(_M_X64))
#define ZT_ARCH_X64 1
#include <xmmintrin.h>
#include <emmintrin.h>
#include <immintrin.h>
#endif
#if defined(ZT_ARCH_X64) || defined(i386) || defined(__i386) || defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(_M_IX86) || defined(__X86__) || defined(_X86_) || defined(__I86__) || defined(__INTEL__) || defined(__386)
#define ZT_ARCH_X86 1
#endif


@@ -176,7 +176,7 @@ static const C25519TestVector C25519_TEST_VECTORS[ZT_NUM_C25519_TEST_VECTORS] =
};
#define IDENTITY_V0_KNOWN_GOOD_0 "8e4df28b72:0:ac3d46abe0c21f3cfe7a6c8d6a85cfcffcb82fbd55af6a4d6350657c68200843fa2e16f9418bbd9702cae365f2af5fb4c420908b803a681d4daef6114d78a2d7:bd8dd6e4ce7022d2f812797a80c6ee8ad180dc4ebf301dec8b06d1be08832bddd63a2f1cfa7b2c504474c75bdc8898ba476ef92e8e2d0509f8441985171ff16e"
#define IDENTITY_V1_KNOWN_GOOD_0 "2d48f7a238:1:gltupn4yrt226o3vebl7m7m5hpndhvfz66nzx6gwgtgbsgs5xr7dpz5aiv636zijrxayuu2ydpff4zgho7o6gpvx62njwkavqordxcceajs2fif4y2ytofpyr25mmxmanbf4fmdiitiq2b53nmx4ckjcmtyqrkqye2jkdainmkqbtil3dhyuiwa:xg73bkrxptymo7kyyd6efu2o7ziemyu3lpgtip53ejsqukt6l2gebq5uofzt6cd2455st5iwrdgc2ft3twkdzrkunu6x5imdz6jt27qopsvqpdijx5cqgukpjxrtyx73j42socym5pi5hy2ir5yma7by4gmtjgvvu3sxbb3qv2yuicykyz2q"
#define IDENTITY_V1_KNOWN_GOOD_0 "b0c2badfeb:1:sueysfvujydbkwykbdfemkm5cjgpezjdrzvfczmmfwd2i2ffrrasybhqkz5xegfrrumoidwqyuovprplysmbhtmkim2whjvivub5tcubakzzkhejhqsaiajcu3eooywx3r7sxyflok7b4lgwjv4qqeahkhh4uwog6ke3yqaie2jp3b4wf2pvo2y:xwfmcy2ptfocxnldnkdhzgo4xj73peve3c4ijnlnr442boef7xin34huerixeoes6jsq5g26rvtngjmhqopim7jxssfkw57z2vxidxkutcr4jzu7mmjpnvixwvmbo26nfbd3albf3fyfzi3py6o4bzcnh7thskzvuks5adscqjnseoajjdka"
// --------------------------------------------------------------------------------------------------------------------
@@ -695,10 +695,7 @@ extern "C" const char *ZTT_general()
Utils::scopy(tmp,sizeof(tmp),IDENTITY_V1_KNOWN_GOOD_0);
tmp[0] = '0';
if (id.fromString(tmp)) {
ZT_T_PRINTF("FAILED (parse of known-bad identity returned ok)" ZT_EOL_S);
return "Identity test failed: parse of known-bad identity";
}
id.fromString(tmp);
if (id.locallyValidate()) {
ZT_T_PRINTF("FAILED (validation of known-bad identity returned ok)" ZT_EOL_S);
return "Identity test failed: validation of known-bad identity";


@@ -35,6 +35,7 @@ namespace Utils {
#ifdef ZT_ARCH_X64
CPUIDRegisters::CPUIDRegisters() noexcept
{
uint32_t eax,ebx,ecx,edx;
#ifdef __WINDOWS__
int regs[4];
__cpuid(regs,1);
@@ -50,7 +51,23 @@ CPUIDRegisters::CPUIDRegisters() noexcept
);
#endif
rdrand = ((ecx & (1U << 30U)) != 0);
aes = ( ((ecx & (1U << 25U)) != 0) && ((ecx & (1U << 19U)) != 0) && ((ecx & (1U << 1U)) != 0) ); // AES, PCLMUL, SSE4.1
aes = ( ((ecx & (1U << 25U)) != 0) && ((ecx & (1U << 19U)) != 0) && ((ecx & (1U << 1U)) != 0) );
avx = ((ecx & (1U << 28U)) != 0); // AVX is ECX bit 28 (bit 25 is AES-NI)
#ifdef __WINDOWS__
TODO
#else
__asm__ __volatile__ (
"cpuid"
: "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx)
: "a"(7),"c"(0)
);
#endif
vaes = aes && avx && ((ecx & (1U << 9U)) != 0);
vpclmulqdq = aes && avx && ((ecx & (1U << 10U)) != 0);
avx2 = avx && ((ebx & (1U << 5U)) != 0);
avx512f = avx && ((ebx & (1U << 16U)) != 0);
sha = ((ebx & (1U << 29U)) != 0);
fsrm = ((edx & (1U << 4U)) != 0);
}
const CPUIDRegisters CPUID;
#endif
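The __WINDOWS__ branch above still reads TODO; one hypothetical way it could query CPUID leaf 7, assuming MSVC's __cpuidex intrinsic (a sketch, not part of this commit):

#ifdef _MSC_VER
#include <intrin.h>
#include <cstdint>
// Sketch: EAX=7, ECX=0 selects the structured extended feature flags leaf,
// matching the inline-asm path above.
static void cpuidLeaf7(uint32_t &ebx, uint32_t &ecx, uint32_t &edx) noexcept
{
	int regs[4];
	__cpuidex(regs, 7, 0);
	ebx = (uint32_t)regs[1];
	ecx = (uint32_t)regs[2];
	edx = (uint32_t)regs[3];
}
#endif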


@@ -16,12 +16,6 @@
#include "Constants.hpp"
#ifdef ZT_ARCH_X64
#include <xmmintrin.h>
#include <emmintrin.h>
#include <immintrin.h>
#endif
#include <utility>
#include <algorithm>
#include <memory>
@@ -60,9 +54,15 @@ namespace Utils {
struct CPUIDRegisters
{
CPUIDRegisters() noexcept;
uint32_t eax,ebx,ecx,edx;
bool rdrand;
bool aes;
bool avx;
bool vaes; // implies AVX
bool vpclmulqdq; // implies AVX
bool avx2;
bool avx512f;
bool sha;
bool fsrm;
};
extern const CPUIDRegisters CPUID;
#endif