Mirror of https://github.com/zerotier/ZeroTierOne.git (synced 2025-06-05 20:13:44 +02:00)

commit aad21cf395 (parent d537428421)

    Blazing fast VAES (256 and 512) AES-CTR, Identity fixes, test fixes.

7 changed files with 419 additions and 311 deletions
CMakeLists.txt

@@ -96,7 +96,7 @@ if (
 	CMAKE_SYSTEM_PROCESSOR MATCHES "amd64"
 )
 	message("++ Adding SSE and AES-NI flags for processor ${CMAKE_SYSTEM_PROCESSOR}")
-	add_compile_options(-maes -mrdrnd -mpclmul -msse -msse2 -mssse3)
+	add_compile_options(-maes -mrdrnd -mpclmul -msse -msse2 -mssse3 -msse4 -mavx -mavx2 -mavx512f -mvaes)
 endif()

 add_subdirectory(node)
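A note on the hunk above: these flags let the compiler emit AVX2/AVX-512/VAES instructions anywhere in the node target, so every such code path must also be fenced behind a runtime CPU check; the checks themselves are added in node/Utils.cpp further down. A minimal sketch of that compile-wide-flags-plus-runtime-dispatch pattern (names and stub bodies are illustrative, not code from this commit):

#include <cstdint>
#include <cstdio>

// Stand-in for the runtime probe; in ZeroTier this is the Utils::CPUID global.
static bool cpuHasVAES() { return false; }

static void ctrVAES(uint8_t *, unsigned int) { std::puts("VAES 256/512 path"); }
static void ctrAESNI(uint8_t *, unsigned int) { std::puts("AES-NI path"); }

static void ctrCrypt(uint8_t *buf, unsigned int len)
{
	// The binary may contain VAES instructions, but they execute only
	// when the running CPU actually advertises the feature.
	if (cpuHasVAES())
		ctrVAES(buf, len);
	else
		ctrAESNI(buf, len);
}

int main() { uint8_t b[16] = {0}; ctrCrypt(b, sizeof(b)); return 0; }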
node/AES.cpp (557 changes)
@@ -14,8 +14,6 @@
 #include "Constants.hpp"
 #include "AES.hpp"
-
-#include <cstdio>

 namespace ZeroTier {

 // GMAC ---------------------------------------------------------------------------------------------------------------
@@ -191,90 +189,93 @@ void AES::GMAC::update(const void *const data,unsigned int len) noexcept
 		}
 	}

-	while (len >= 64) {
-		__m128i d1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
-		__m128i d2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
-		__m128i d3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
-		__m128i d4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
-
-		in += 64;
-		len -= 64;
-
-		// This does 4X parallel mult_block via instruction level parallelism.
-		d1 = _mm_shuffle_epi8(_mm_xor_si128(y,d1),shuf);
-		d2 = _mm_shuffle_epi8(d2,shuf);
-		d3 = _mm_shuffle_epi8(d3,shuf);
-		d4 = _mm_shuffle_epi8(d4,shuf);
-		__m128i t0 = _mm_clmulepi64_si128(_aes._k.ni.hhhh,d1,0x00);
-		__m128i t1 = _mm_clmulepi64_si128(_aes._k.ni.hhh,d2,0x00);
-		__m128i t2 = _mm_clmulepi64_si128(_aes._k.ni.hh,d3,0x00);
-		__m128i t3 = _mm_clmulepi64_si128(_aes._k.ni.h,d4,0x00);
-		__m128i t8 = _mm_xor_si128(t0,t1);
-		t8 = _mm_xor_si128(t8,t2);
-		t8 = _mm_xor_si128(t8,t3);
-		__m128i t4 = _mm_clmulepi64_si128(_aes._k.ni.hhhh,d1,0x11);
-		__m128i t5 = _mm_clmulepi64_si128(_aes._k.ni.hhh,d2,0x11);
-		__m128i t6 = _mm_clmulepi64_si128(_aes._k.ni.hh,d3,0x11);
-		__m128i t7 = _mm_clmulepi64_si128(_aes._k.ni.h,d4,0x11);
-		__m128i t9 = _mm_xor_si128(t4,t5);
-		t9 = _mm_xor_si128(t9,t6);
-		t9 = _mm_xor_si128(t9,t7);
-		t0 = _mm_shuffle_epi32(_aes._k.ni.hhhh,78);
-		t4 = _mm_shuffle_epi32(d1,78);
-		t0 = _mm_xor_si128(t0,_aes._k.ni.hhhh);
-		t4 = _mm_xor_si128(t4,d1);
-		t1 = _mm_shuffle_epi32(_aes._k.ni.hhh,78);
-		t5 = _mm_shuffle_epi32(d2,78);
-		t1 = _mm_xor_si128(t1,_aes._k.ni.hhh);
-		t5 = _mm_xor_si128(t5,d2);
-		t2 = _mm_shuffle_epi32(_aes._k.ni.hh,78);
-		t6 = _mm_shuffle_epi32(d3,78);
-		t2 = _mm_xor_si128(t2,_aes._k.ni.hh);
-		t6 = _mm_xor_si128(t6,d3);
-		t3 = _mm_shuffle_epi32(_aes._k.ni.h,78);
-		t7 = _mm_shuffle_epi32(d4,78);
-		t3 = _mm_xor_si128(t3,_aes._k.ni.h);
-		t7 = _mm_xor_si128(t7,d4);
-		t0 = _mm_clmulepi64_si128(t0,t4,0x00);
-		t1 = _mm_clmulepi64_si128(t1,t5,0x00);
-		t2 = _mm_clmulepi64_si128(t2,t6,0x00);
-		t3 = _mm_clmulepi64_si128(t3,t7,0x00);
-		t0 = _mm_xor_si128(t0,t8);
-		t0 = _mm_xor_si128(t0,t9);
-		t0 = _mm_xor_si128(t1,t0);
-		t0 = _mm_xor_si128(t2,t0);
-		t0 = _mm_xor_si128(t3,t0);
-		t4 = _mm_slli_si128(t0,8);
-		t0 = _mm_srli_si128(t0,8);
-		t3 = _mm_xor_si128(t4,t8);
-		t6 = _mm_xor_si128(t0,t9);
-		t7 = _mm_srli_epi32(t3,31);
-		t8 = _mm_srli_epi32(t6,31);
-		t3 = _mm_slli_epi32(t3,1);
-		t6 = _mm_slli_epi32(t6,1);
-		t9 = _mm_srli_si128(t7,12);
-		t8 = _mm_slli_si128(t8,4);
-		t7 = _mm_slli_si128(t7,4);
-		t3 = _mm_or_si128(t3,t7);
-		t6 = _mm_or_si128(t6,t8);
-		t6 = _mm_or_si128(t6,t9);
-		t7 = _mm_slli_epi32(t3,31);
-		t8 = _mm_slli_epi32(t3,30);
-		t9 = _mm_slli_epi32(t3,25);
-		t7 = _mm_xor_si128(t7,t8);
-		t7 = _mm_xor_si128(t7,t9);
-		t8 = _mm_srli_si128(t7,4);
-		t7 = _mm_slli_si128(t7,12);
-		t3 = _mm_xor_si128(t3,t7);
-		t2 = _mm_srli_epi32(t3,1);
-		t4 = _mm_srli_epi32(t3,2);
-		t5 = _mm_srli_epi32(t3,7);
-		t2 = _mm_xor_si128(t2,t4);
-		t2 = _mm_xor_si128(t2,t5);
-		t2 = _mm_xor_si128(t2,t8);
-		t3 = _mm_xor_si128(t3,t2);
-		t6 = _mm_xor_si128(t6,t3);
-		y = _mm_shuffle_epi8(t6,shuf);
+	if (likely(len >= 64)) {
+		const __m128i h = _aes._k.ni.h;
+		const __m128i hh = _aes._k.ni.hh;
+		const __m128i hhh = _aes._k.ni.hhh;
+		const __m128i hhhh = _aes._k.ni.hhhh;
+		do {
+			__m128i d1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
+			__m128i d2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
+			__m128i d3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
+			__m128i d4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
+			in += 64;
+			len -= 64;
+			d1 = _mm_shuffle_epi8(_mm_xor_si128(y,d1),shuf);
+			d2 = _mm_shuffle_epi8(d2,shuf);
+			d3 = _mm_shuffle_epi8(d3,shuf);
+			d4 = _mm_shuffle_epi8(d4,shuf);
+			__m128i t0 = _mm_clmulepi64_si128(hhhh,d1,0x00);
+			__m128i t1 = _mm_clmulepi64_si128(hhh,d2,0x00);
+			__m128i t2 = _mm_clmulepi64_si128(hh,d3,0x00);
+			__m128i t8 = _mm_xor_si128(t0,t1);
+			t8 = _mm_xor_si128(t8,t2);
+			__m128i t3 = _mm_clmulepi64_si128(h,d4,0x00);
+			__m128i t4 = _mm_clmulepi64_si128(hhhh,d1,0x11);
+			__m128i t5 = _mm_clmulepi64_si128(hhh,d2,0x11);
+			t8 = _mm_xor_si128(t8,t3);
+			__m128i t6 = _mm_clmulepi64_si128(hh,d3,0x11);
+			__m128i t7 = _mm_clmulepi64_si128(h,d4,0x11);
+			__m128i t9 = _mm_xor_si128(t4,t5);
+			t9 = _mm_xor_si128(t9,t6);
+			t9 = _mm_xor_si128(t9,t7);
+			t0 = _mm_shuffle_epi32(hhhh,78);
+			t4 = _mm_shuffle_epi32(d1,78);
+			t0 = _mm_xor_si128(t0,hhhh);
+			t4 = _mm_xor_si128(t4,d1);
+			t1 = _mm_shuffle_epi32(hhh,78);
+			t5 = _mm_shuffle_epi32(d2,78);
+			t1 = _mm_xor_si128(t1,hhh);
+			t5 = _mm_xor_si128(t5,d2);
+			t2 = _mm_shuffle_epi32(hh,78);
+			t6 = _mm_shuffle_epi32(d3,78);
+			t2 = _mm_xor_si128(t2,hh);
+			t6 = _mm_xor_si128(t6,d3);
+			t3 = _mm_shuffle_epi32(h,78);
+			t7 = _mm_shuffle_epi32(d4,78);
+			t3 = _mm_xor_si128(t3,h);
+			t7 = _mm_xor_si128(t7,d4);
+			t0 = _mm_clmulepi64_si128(t0,t4,0x00);
+			t1 = _mm_clmulepi64_si128(t1,t5,0x00);
+			t2 = _mm_clmulepi64_si128(t2,t6,0x00);
+			t3 = _mm_clmulepi64_si128(t3,t7,0x00);
+			t0 = _mm_xor_si128(t0,t8);
+			t0 = _mm_xor_si128(t0,t9);
+			t0 = _mm_xor_si128(t1,t0);
+			t0 = _mm_xor_si128(t2,t0);
+			t0 = _mm_xor_si128(t3,t0);
+			t4 = _mm_slli_si128(t0,8);
+			t0 = _mm_srli_si128(t0,8);
+			t3 = _mm_xor_si128(t4,t8);
+			t6 = _mm_xor_si128(t0,t9);
+			t7 = _mm_srli_epi32(t3,31);
+			t8 = _mm_srli_epi32(t6,31);
+			t3 = _mm_slli_epi32(t3,1);
+			t6 = _mm_slli_epi32(t6,1);
+			t9 = _mm_srli_si128(t7,12);
+			t8 = _mm_slli_si128(t8,4);
+			t7 = _mm_slli_si128(t7,4);
+			t3 = _mm_or_si128(t3,t7);
+			t6 = _mm_or_si128(t6,t8);
+			t6 = _mm_or_si128(t6,t9);
+			t7 = _mm_slli_epi32(t3,31);
+			t8 = _mm_slli_epi32(t3,30);
+			t9 = _mm_slli_epi32(t3,25);
+			t7 = _mm_xor_si128(t7,t8);
+			t7 = _mm_xor_si128(t7,t9);
+			t8 = _mm_srli_si128(t7,4);
+			t7 = _mm_slli_si128(t7,12);
+			t3 = _mm_xor_si128(t3,t7);
+			t2 = _mm_srli_epi32(t3,1);
+			t4 = _mm_srli_epi32(t3,2);
+			t5 = _mm_srli_epi32(t3,7);
+			t2 = _mm_xor_si128(t2,t4);
+			t2 = _mm_xor_si128(t2,t5);
+			t2 = _mm_xor_si128(t2,t8);
+			t3 = _mm_xor_si128(t3,t2);
+			t6 = _mm_xor_si128(t6,t3);
+			y = _mm_shuffle_epi8(t6,shuf);
+		} while (len >= 64);
 	}

 	while (len >= 16) {
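Why caching h, hh, hhh, hhhh (the key powers H, H^2, H^3, H^4) makes the 4-way unroll above legal: GHASH absorbs one block at a time as y <- (y XOR d_i) * H in GF(2^128). Expanding four steps of that recurrence gives, in LaTeX notation,

    y' = ((y \oplus d_1) \cdot H^4) \oplus (d_2 \cdot H^3) \oplus (d_3 \cdot H^2) \oplus (d_4 \cdot H)

which is exactly what the code computes (hhhh pairs with d1, h with d4). The four carry-less multiplies are mutually independent, so the CPU can overlap them; only the final reduction folds the results back into y. That overlap is the "4X parallel mult_block via instruction level parallelism" the old loop's comment refers to.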
@@ -476,29 +477,13 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
 	if (likely(Utils::CPUID.aes)) {
 		uint64_t c0 = _ctr[0];
 		uint64_t c1 = Utils::ntoh(_ctr[1]);

-		// This uses some spare XMM registers to hold some of the key.
 		const __m128i *const k = _aes._k.ni.k;
-		const __m128i k0 = k[0];
-		const __m128i k1 = k[1];
-		const __m128i k2 = k[2];
-		const __m128i k3 = k[3];
-		const __m128i k4 = k[4];
-		const __m128i k5 = k[5];

 		// Complete any unfinished blocks from previous calls to crypt().
 		unsigned int totalLen = _len;
 		if ((totalLen & 15U)) {
-			const __m128i k7 = k[7];
-			const __m128i k8 = k[8];
-			const __m128i k9 = k[9];
-			const __m128i k10 = k[10];
-			const __m128i k11 = k[11];
-			const __m128i k12 = k[12];
-			const __m128i k13 = k[13];
-			const __m128i k14 = k[14];
 			for (;;) {
-				if (!len) {
+				if (unlikely(!len)) {
 					_ctr[0] = c0;
 					_ctr[1] = Utils::hton(c1);
 					_len = totalLen;
@@ -508,23 +493,23 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
 				out[totalLen++] = *(in++);
 				if (!(totalLen & 15U)) {
 					__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
-					d0 = _mm_xor_si128(d0,k0);
-					d0 = _mm_aesenc_si128(d0,k1);
-					d0 = _mm_aesenc_si128(d0,k2);
-					d0 = _mm_aesenc_si128(d0,k3);
-					d0 = _mm_aesenc_si128(d0,k4);
-					d0 = _mm_aesenc_si128(d0,k5);
+					d0 = _mm_xor_si128(d0,k[0]);
+					d0 = _mm_aesenc_si128(d0,k[1]);
+					d0 = _mm_aesenc_si128(d0,k[2]);
+					d0 = _mm_aesenc_si128(d0,k[3]);
+					d0 = _mm_aesenc_si128(d0,k[4]);
+					d0 = _mm_aesenc_si128(d0,k[5]);
 					d0 = _mm_aesenc_si128(d0,k[6]);
-					d0 = _mm_aesenc_si128(d0,k7);
-					d0 = _mm_aesenc_si128(d0,k8);
-					d0 = _mm_aesenc_si128(d0,k9);
-					d0 = _mm_aesenc_si128(d0,k10);
+					d0 = _mm_aesenc_si128(d0,k[7]);
+					d0 = _mm_aesenc_si128(d0,k[8]);
+					d0 = _mm_aesenc_si128(d0,k[9]);
+					d0 = _mm_aesenc_si128(d0,k[10]);
 					__m128i *const outblk = reinterpret_cast<__m128i *>(out + (totalLen - 16));
-					d0 = _mm_aesenc_si128(d0,k11);
+					d0 = _mm_aesenc_si128(d0,k[11]);
 					const __m128i p0 = _mm_loadu_si128(outblk);
-					d0 = _mm_aesenc_si128(d0,k12);
-					d0 = _mm_aesenc_si128(d0,k13);
-					d0 = _mm_aesenclast_si128(d0,k14);
+					d0 = _mm_aesenc_si128(d0,k[12]);
+					d0 = _mm_aesenc_si128(d0,k[13]);
+					d0 = _mm_aesenclast_si128(d0,k[14]);
 					_mm_storeu_si128(outblk,_mm_xor_si128(p0,d0));
 					break;
 				}
@@ -532,128 +517,236 @@ void AES::CTR::crypt(const void *const input,unsigned int len) noexcept
 			}
 		}

 		out += totalLen;
-		_len = (totalLen + len);
+		_len = totalLen + len;

-		while (len >= 64) {
-			__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
-			__m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
-			__m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
-			__m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
-			c1 += 4;
-
-			d0 = _mm_xor_si128(d0,k0);
-			d1 = _mm_xor_si128(d1,k0);
-			d2 = _mm_xor_si128(d2,k0);
-			d3 = _mm_xor_si128(d3,k0);
-			d0 = _mm_aesenc_si128(d0,k1);
-			d1 = _mm_aesenc_si128(d1,k1);
-			d2 = _mm_aesenc_si128(d2,k1);
-			d3 = _mm_aesenc_si128(d3,k1);
-			__m128i ka = k[6];
-			d0 = _mm_aesenc_si128(d0,k2);
-			d1 = _mm_aesenc_si128(d1,k2);
-			d2 = _mm_aesenc_si128(d2,k2);
-			d3 = _mm_aesenc_si128(d3,k2);
-			__m128i kb = k[7];
-			d0 = _mm_aesenc_si128(d0,k3);
-			d1 = _mm_aesenc_si128(d1,k3);
-			d2 = _mm_aesenc_si128(d2,k3);
-			d3 = _mm_aesenc_si128(d3,k3);
-			__m128i kc = k[8];
-			d0 = _mm_aesenc_si128(d0,k4);
-			d1 = _mm_aesenc_si128(d1,k4);
-			d2 = _mm_aesenc_si128(d2,k4);
-			d3 = _mm_aesenc_si128(d3,k4);
-			__m128i kd = k[9];
-			d0 = _mm_aesenc_si128(d0,k5);
-			d1 = _mm_aesenc_si128(d1,k5);
-			d2 = _mm_aesenc_si128(d2,k5);
-			d3 = _mm_aesenc_si128(d3,k5);
-			__m128i ke = k[10];
-			d0 = _mm_aesenc_si128(d0,ka);
-			d1 = _mm_aesenc_si128(d1,ka);
-			d2 = _mm_aesenc_si128(d2,ka);
-			d3 = _mm_aesenc_si128(d3,ka);
-			__m128i kf = k[11];
-			d0 = _mm_aesenc_si128(d0,kb);
-			d1 = _mm_aesenc_si128(d1,kb);
-			d2 = _mm_aesenc_si128(d2,kb);
-			d3 = _mm_aesenc_si128(d3,kb);
-			ka = k[12];
-			d0 = _mm_aesenc_si128(d0,kc);
-			d1 = _mm_aesenc_si128(d1,kc);
-			d2 = _mm_aesenc_si128(d2,kc);
-			d3 = _mm_aesenc_si128(d3,kc);
-			kb = k[13];
-			d0 = _mm_aesenc_si128(d0,kd);
-			d1 = _mm_aesenc_si128(d1,kd);
-			d2 = _mm_aesenc_si128(d2,kd);
-			d3 = _mm_aesenc_si128(d3,kd);
-			kc = k[14];
-			d0 = _mm_aesenc_si128(d0,ke);
-			d1 = _mm_aesenc_si128(d1,ke);
-			d2 = _mm_aesenc_si128(d2,ke);
-			d3 = _mm_aesenc_si128(d3,ke);
-			kd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
-			d0 = _mm_aesenc_si128(d0,kf);
-			d1 = _mm_aesenc_si128(d1,kf);
-			d2 = _mm_aesenc_si128(d2,kf);
-			d3 = _mm_aesenc_si128(d3,kf);
-			ke = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
-			d0 = _mm_aesenc_si128(d0,ka);
-			d1 = _mm_aesenc_si128(d1,ka);
-			d2 = _mm_aesenc_si128(d2,ka);
-			d3 = _mm_aesenc_si128(d3,ka);
-			kf = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
-			d0 = _mm_aesenc_si128(d0,kb);
-			d1 = _mm_aesenc_si128(d1,kb);
-			d2 = _mm_aesenc_si128(d2,kb);
-			d3 = _mm_aesenc_si128(d3,kb);
-			ka = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
-			d0 = _mm_aesenclast_si128(d0,kc);
-			d1 = _mm_aesenclast_si128(d1,kc);
-			d2 = _mm_aesenclast_si128(d2,kc);
-			d3 = _mm_aesenclast_si128(d3,kc);
-			kd = _mm_xor_si128(d0,kd);
-			ke = _mm_xor_si128(d1,ke);
-			kf = _mm_xor_si128(d2,kf);
-			ka = _mm_xor_si128(d3,ka);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(out),kd);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),ke);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),kf);
-			_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),ka);
-
-			in += 64;
-			len -= 64;
-			out += 64;
+		if (likely(len >= 64)) {
+			if (Utils::CPUID.vaes) { // is only true if AVX is also present
+				if ((!Utils::CPUID.avx512f)||((len < 1024))) {
+					const __m256i kk0 = _mm256_broadcastsi128_si256(k[0]);
+					const __m256i kk1 = _mm256_broadcastsi128_si256(k[1]);
+					const __m256i kk2 = _mm256_broadcastsi128_si256(k[2]);
+					const __m256i kk3 = _mm256_broadcastsi128_si256(k[3]);
+					const __m256i kk4 = _mm256_broadcastsi128_si256(k[4]);
+					const __m256i kk5 = _mm256_broadcastsi128_si256(k[5]);
+					const __m256i kk6 = _mm256_broadcastsi128_si256(k[6]);
+					const __m256i kk7 = _mm256_broadcastsi128_si256(k[7]);
+					const __m256i kk8 = _mm256_broadcastsi128_si256(k[8]);
+					const __m256i kk9 = _mm256_broadcastsi128_si256(k[9]);
+					const __m256i kk10 = _mm256_broadcastsi128_si256(k[10]);
+					const __m256i kk11 = _mm256_broadcastsi128_si256(k[11]);
+					const __m256i kk12 = _mm256_broadcastsi128_si256(k[12]);
+					const __m256i kk13 = _mm256_broadcastsi128_si256(k[13]);
+					const __m256i kk14 = _mm256_broadcastsi128_si256(k[14]);
+					do {
+						__m256i d0 = _mm256_set_epi64x(
+							(long long)Utils::hton(c1 + 1ULL),(long long)c0,
+							(long long)Utils::hton(c1),(long long)c0);
+						__m256i d1 = _mm256_set_epi64x(
+							(long long)Utils::hton(c1 + 3ULL),(long long)c0,
+							(long long)Utils::hton(c1 + 2ULL),(long long)c0);
+						c1 += 4;
+						__m256i p0 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(in));
+						__m256i p1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(in + 32));
+						in += 64;
+						d0 = _mm256_xor_si256(d0,kk0);
+						d1 = _mm256_xor_si256(d1,kk0);
+						d0 = _mm256_aesenc_epi128(d0,kk1);
+						d1 = _mm256_aesenc_epi128(d1,kk1);
+						d0 = _mm256_aesenc_epi128(d0,kk2);
+						d1 = _mm256_aesenc_epi128(d1,kk2);
+						d0 = _mm256_aesenc_epi128(d0,kk3);
+						d1 = _mm256_aesenc_epi128(d1,kk3);
+						d0 = _mm256_aesenc_epi128(d0,kk4);
+						d1 = _mm256_aesenc_epi128(d1,kk4);
+						d0 = _mm256_aesenc_epi128(d0,kk5);
+						d1 = _mm256_aesenc_epi128(d1,kk5);
+						d0 = _mm256_aesenc_epi128(d0,kk6);
+						d1 = _mm256_aesenc_epi128(d1,kk6);
+						d0 = _mm256_aesenc_epi128(d0,kk7);
+						d1 = _mm256_aesenc_epi128(d1,kk7);
+						d0 = _mm256_aesenc_epi128(d0,kk8);
+						d1 = _mm256_aesenc_epi128(d1,kk8);
+						d0 = _mm256_aesenc_epi128(d0,kk9);
+						d1 = _mm256_aesenc_epi128(d1,kk9);
+						d0 = _mm256_aesenc_epi128(d0,kk10);
+						d1 = _mm256_aesenc_epi128(d1,kk10);
+						d0 = _mm256_aesenc_epi128(d0,kk11);
+						d1 = _mm256_aesenc_epi128(d1,kk11);
+						d0 = _mm256_aesenc_epi128(d0,kk12);
+						d1 = _mm256_aesenc_epi128(d1,kk12);
+						d0 = _mm256_aesenc_epi128(d0,kk13);
+						d1 = _mm256_aesenc_epi128(d1,kk13);
+						d0 = _mm256_aesenclast_epi128(d0,kk14);
+						d1 = _mm256_aesenclast_epi128(d1,kk14);
+						_mm256_storeu_si256(reinterpret_cast<__m256i *>(out),_mm256_xor_si256(d0,p0));
+						_mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),_mm256_xor_si256(d1,p1));
+						out += 64;
+						len -= 64;
+					} while (len >= 64);
+				} else {
+					const __m512i kk0 = _mm512_broadcast_i32x4(k[0]);
+					const __m512i kk1 = _mm512_broadcast_i32x4(k[1]);
+					const __m512i kk2 = _mm512_broadcast_i32x4(k[2]);
+					const __m512i kk3 = _mm512_broadcast_i32x4(k[3]);
+					const __m512i kk4 = _mm512_broadcast_i32x4(k[4]);
+					const __m512i kk5 = _mm512_broadcast_i32x4(k[5]);
+					const __m512i kk6 = _mm512_broadcast_i32x4(k[6]);
+					const __m512i kk7 = _mm512_broadcast_i32x4(k[7]);
+					const __m512i kk8 = _mm512_broadcast_i32x4(k[8]);
+					const __m512i kk9 = _mm512_broadcast_i32x4(k[9]);
+					const __m512i kk10 = _mm512_broadcast_i32x4(k[10]);
+					const __m512i kk11 = _mm512_broadcast_i32x4(k[11]);
+					const __m512i kk12 = _mm512_broadcast_i32x4(k[12]);
+					const __m512i kk13 = _mm512_broadcast_i32x4(k[13]);
+					const __m512i kk14 = _mm512_broadcast_i32x4(k[14]);
+					do {
+						__m512i d0 = _mm512_set_epi64(
+							(long long)Utils::hton(c1 + 3ULL),(long long)c0,
+							(long long)Utils::hton(c1 + 2ULL),(long long)c0,
+							(long long)Utils::hton(c1 + 1ULL),(long long)c0,
+							(long long)Utils::hton(c1),(long long)c0);
+						c1 += 4;
+						__m512i p0 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(in));
+						in += 64;
+						d0 = _mm512_xor_si512(d0,kk0);
+						d0 = _mm512_aesenc_epi128(d0,kk1);
+						d0 = _mm512_aesenc_epi128(d0,kk2);
+						d0 = _mm512_aesenc_epi128(d0,kk3);
+						d0 = _mm512_aesenc_epi128(d0,kk4);
+						d0 = _mm512_aesenc_epi128(d0,kk5);
+						d0 = _mm512_aesenc_epi128(d0,kk6);
+						d0 = _mm512_aesenc_epi128(d0,kk7);
+						d0 = _mm512_aesenc_epi128(d0,kk8);
+						d0 = _mm512_aesenc_epi128(d0,kk9);
+						d0 = _mm512_aesenc_epi128(d0,kk10);
+						d0 = _mm512_aesenc_epi128(d0,kk11);
+						d0 = _mm512_aesenc_epi128(d0,kk12);
+						d0 = _mm512_aesenc_epi128(d0,kk13);
+						d0 = _mm512_aesenclast_epi128(d0,kk14);
+						_mm512_storeu_si512(reinterpret_cast<__m512i *>(out),_mm512_xor_si512(p0,d0));
+						out += 64;
+						len -= 64;
+					} while (len >= 64);
+				}
+			} else {
+				const __m128i k0 = k[0];
+				const __m128i k1 = k[1];
+				const __m128i k2 = k[2];
+				const __m128i k3 = k[3];
+				const __m128i k4 = k[4];
+				const __m128i k5 = k[5];
+				const __m128i k6 = k[6];
+				const __m128i k7 = k[7];
+				const __m128i k8 = k[8];
+				const __m128i k9 = k[9];
+				const __m128i k10 = k[10];
+				const __m128i k11 = k[11];
+				const __m128i k12 = k[12];
+				const __m128i k13 = k[13];
+				const __m128i k14 = k[14];
+				do {
+					__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1),(long long)c0);
+					__m128i d1 = _mm_set_epi64x((long long)Utils::hton(c1 + 1ULL),(long long)c0);
+					__m128i d2 = _mm_set_epi64x((long long)Utils::hton(c1 + 2ULL),(long long)c0);
+					__m128i d3 = _mm_set_epi64x((long long)Utils::hton(c1 + 3ULL),(long long)c0);
+					c1 += 4;
+					d0 = _mm_xor_si128(d0,k0);
+					d1 = _mm_xor_si128(d1,k0);
+					d2 = _mm_xor_si128(d2,k0);
+					d3 = _mm_xor_si128(d3,k0);
+					d0 = _mm_aesenc_si128(d0,k1);
+					d1 = _mm_aesenc_si128(d1,k1);
+					d2 = _mm_aesenc_si128(d2,k1);
+					d3 = _mm_aesenc_si128(d3,k1);
+					d0 = _mm_aesenc_si128(d0,k2);
+					d1 = _mm_aesenc_si128(d1,k2);
+					d2 = _mm_aesenc_si128(d2,k2);
+					d3 = _mm_aesenc_si128(d3,k2);
+					d0 = _mm_aesenc_si128(d0,k3);
+					d1 = _mm_aesenc_si128(d1,k3);
+					d2 = _mm_aesenc_si128(d2,k3);
+					d3 = _mm_aesenc_si128(d3,k3);
+					d0 = _mm_aesenc_si128(d0,k4);
+					d1 = _mm_aesenc_si128(d1,k4);
+					d2 = _mm_aesenc_si128(d2,k4);
+					d3 = _mm_aesenc_si128(d3,k4);
+					d0 = _mm_aesenc_si128(d0,k5);
+					d1 = _mm_aesenc_si128(d1,k5);
+					d2 = _mm_aesenc_si128(d2,k5);
+					d3 = _mm_aesenc_si128(d3,k5);
+					d0 = _mm_aesenc_si128(d0,k6);
+					d1 = _mm_aesenc_si128(d1,k6);
+					d2 = _mm_aesenc_si128(d2,k6);
+					d3 = _mm_aesenc_si128(d3,k6);
+					d0 = _mm_aesenc_si128(d0,k7);
+					d1 = _mm_aesenc_si128(d1,k7);
+					d2 = _mm_aesenc_si128(d2,k7);
+					d3 = _mm_aesenc_si128(d3,k7);
+					d0 = _mm_aesenc_si128(d0,k8);
+					d1 = _mm_aesenc_si128(d1,k8);
+					d2 = _mm_aesenc_si128(d2,k8);
+					d3 = _mm_aesenc_si128(d3,k8);
+					d0 = _mm_aesenc_si128(d0,k9);
+					d1 = _mm_aesenc_si128(d1,k9);
+					d2 = _mm_aesenc_si128(d2,k9);
+					d3 = _mm_aesenc_si128(d3,k9);
+					d0 = _mm_aesenc_si128(d0,k10);
+					d1 = _mm_aesenc_si128(d1,k10);
+					d2 = _mm_aesenc_si128(d2,k10);
+					d3 = _mm_aesenc_si128(d3,k10);
+					__m128i p0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
+					d0 = _mm_aesenc_si128(d0,k11);
+					d1 = _mm_aesenc_si128(d1,k11);
+					d2 = _mm_aesenc_si128(d2,k11);
+					d3 = _mm_aesenc_si128(d3,k11);
+					__m128i p1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16));
+					d0 = _mm_aesenc_si128(d0,k12);
+					d1 = _mm_aesenc_si128(d1,k12);
+					d2 = _mm_aesenc_si128(d2,k12);
+					d3 = _mm_aesenc_si128(d3,k12);
+					__m128i p2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32));
+					d0 = _mm_aesenc_si128(d0,k13);
+					d1 = _mm_aesenc_si128(d1,k13);
+					d2 = _mm_aesenc_si128(d2,k13);
+					d3 = _mm_aesenc_si128(d3,k13);
+					__m128i p3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48));
+					in += 64;
+					d0 = _mm_aesenclast_si128(d0,k14);
+					d1 = _mm_aesenclast_si128(d1,k14);
+					d2 = _mm_aesenclast_si128(d2,k14);
+					d3 = _mm_aesenclast_si128(d3,k14);
+					p0 = _mm_xor_si128(d0,p0);
+					p1 = _mm_xor_si128(d1,p1);
+					p2 = _mm_xor_si128(d2,p2);
+					p3 = _mm_xor_si128(d3,p3);
+					_mm_storeu_si128(reinterpret_cast<__m128i *>(out),p0);
+					_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16),p1);
+					_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32),p2);
+					_mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48),p3);
+					out += 64;
+					len -= 64;
+				} while (len >= 64);
+			}
+		}

 		if (len >= 16) {
-			const __m128i k7 = k[7];
-			const __m128i k8 = k[8];
-			const __m128i k9 = k[9];
-			const __m128i k10 = k[10];
-			const __m128i k11 = k[11];
-			const __m128i k12 = k[12];
-			const __m128i k13 = k[13];
-			const __m128i k14 = k[14];
 			do {
 				__m128i d0 = _mm_set_epi64x((long long)Utils::hton(c1++),(long long)c0);
-				d0 = _mm_xor_si128(d0,k0);
-				d0 = _mm_aesenc_si128(d0,k1);
-				d0 = _mm_aesenc_si128(d0,k2);
-				d0 = _mm_aesenc_si128(d0,k3);
-				d0 = _mm_aesenc_si128(d0,k4);
-				d0 = _mm_aesenc_si128(d0,k5);
+				d0 = _mm_xor_si128(d0,k[0]);
+				d0 = _mm_aesenc_si128(d0,k[1]);
+				d0 = _mm_aesenc_si128(d0,k[2]);
+				d0 = _mm_aesenc_si128(d0,k[3]);
+				d0 = _mm_aesenc_si128(d0,k[4]);
+				d0 = _mm_aesenc_si128(d0,k[5]);
 				d0 = _mm_aesenc_si128(d0,k[6]);
-				d0 = _mm_aesenc_si128(d0,k7);
-				d0 = _mm_aesenc_si128(d0,k8);
-				d0 = _mm_aesenc_si128(d0,k9);
-				d0 = _mm_aesenc_si128(d0,k10);
-				d0 = _mm_aesenc_si128(d0,k11);
-				d0 = _mm_aesenc_si128(d0,k12);
-				d0 = _mm_aesenc_si128(d0,k13);
-				d0 = _mm_aesenclast_si128(d0,k14);
+				d0 = _mm_aesenc_si128(d0,k[7]);
+				d0 = _mm_aesenc_si128(d0,k[8]);
+				d0 = _mm_aesenc_si128(d0,k[9]);
+				d0 = _mm_aesenc_si128(d0,k[10]);
+				d0 = _mm_aesenc_si128(d0,k[11]);
+				d0 = _mm_aesenc_si128(d0,k[12]);
+				d0 = _mm_aesenc_si128(d0,k[13]);
+				d0 = _mm_aesenclast_si128(d0,k[14]);
 				_mm_storeu_si128(reinterpret_cast<__m128i *>(out),_mm_xor_si128(d0,_mm_loadu_si128(reinterpret_cast<const __m128i *>(in))));
 				in += 16;
 				len -= 16;
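The heart of the new VAES path above, reduced to a sketch: two (or, with AVX-512, four) counter blocks are packed into one wide register and every AES round runs on all 128-bit lanes at once. The sketch below is not from the commit; it assumes already-expanded AES-256 round keys rk[0..14], a little-endian host (so __builtin_bswap64 stands in for Utils::hton), and compilation with -mavx2 -mvaes, and it must only run on CPUs that report VAES (which is what the Utils::CPUID.vaes flag added below is for).

#include <immintrin.h>
#include <cstdint>

// Encrypt two 16-byte CTR blocks (counters c1 and c1+1) and XOR with input.
static void ctr2(const __m128i rk[15], uint64_t c0, uint64_t c1,
                 const uint8_t *in, uint8_t *out)
{
	// Per 128-bit lane: high 64 bits = big-endian counter word, low 64 bits =
	// nonce word, matching the _mm_set_epi64x layout used in the diff above.
	__m256i d = _mm256_set_epi64x(
		(long long)__builtin_bswap64(c1 + 1), (long long)c0,
		(long long)__builtin_bswap64(c1), (long long)c0);
	d = _mm256_xor_si256(d, _mm256_broadcastsi128_si256(rk[0]));
	for (int r = 1; r < 14; ++r) // middle rounds 1..13
		d = _mm256_aesenc_epi128(d, _mm256_broadcastsi128_si256(rk[r]));
	d = _mm256_aesenclast_epi128(d, _mm256_broadcastsi128_si256(rk[14]));
	const __m256i p = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(in));
	_mm256_storeu_si256(reinterpret_cast<__m256i *>(out), _mm256_xor_si256(d, p));
}

The committed code additionally hoists the fifteen broadcasts out of the loop, and it prefers the 256-bit path over the 512-bit one for buffers under 1024 bytes, presumably to avoid AVX-512 startup and frequency penalties on short messages.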
node/Identity.cpp

@@ -15,6 +15,7 @@
 #include "Identity.hpp"
 #include "SHA512.hpp"
 #include "Salsa20.hpp"
+#include "Poly1305.hpp"
 #include "Utils.hpp"

 #include <algorithm>

@@ -39,14 +40,14 @@ void identityV0ProofOfWorkFrankenhash(const void *const publicKey, unsigned int
 	s20.crypt20((char *) genmem, (char *) genmem, 64);
 	for (unsigned long i = 64;i < ZT_V0_IDENTITY_GEN_MEMORY;i += 64) {
 		unsigned long k = i - 64;
-		*((uint64_t *) ((char *) genmem + i)) = *((uint64_t *) ((char *) genmem + k));
-		*((uint64_t *) ((char *) genmem + i + 8)) = *((uint64_t *) ((char *) genmem + k + 8));
-		*((uint64_t *) ((char *) genmem + i + 16)) = *((uint64_t *) ((char *) genmem + k + 16));
-		*((uint64_t *) ((char *) genmem + i + 24)) = *((uint64_t *) ((char *) genmem + k + 24));
-		*((uint64_t *) ((char *) genmem + i + 32)) = *((uint64_t *) ((char *) genmem + k + 32));
-		*((uint64_t *) ((char *) genmem + i + 40)) = *((uint64_t *) ((char *) genmem + k + 40));
-		*((uint64_t *) ((char *) genmem + i + 48)) = *((uint64_t *) ((char *) genmem + k + 48));
-		*((uint64_t *) ((char *) genmem + i + 56)) = *((uint64_t *) ((char *) genmem + k + 56));
+		*((uint64_t * )((char *) genmem + i)) = *((uint64_t * )((char *) genmem + k));
+		*((uint64_t * )((char *) genmem + i + 8)) = *((uint64_t * )((char *) genmem + k + 8));
+		*((uint64_t * )((char *) genmem + i + 16)) = *((uint64_t * )((char *) genmem + k + 16));
+		*((uint64_t * )((char *) genmem + i + 24)) = *((uint64_t * )((char *) genmem + k + 24));
+		*((uint64_t * )((char *) genmem + i + 32)) = *((uint64_t * )((char *) genmem + k + 32));
+		*((uint64_t * )((char *) genmem + i + 40)) = *((uint64_t * )((char *) genmem + k + 40));
+		*((uint64_t * )((char *) genmem + i + 48)) = *((uint64_t * )((char *) genmem + k + 48));
+		*((uint64_t * )((char *) genmem + i + 56)) = *((uint64_t * )((char *) genmem + k + 56));
 		s20.crypt20((char *) genmem + i, (char *) genmem + i, 64);
 	}

@@ -78,49 +79,58 @@ struct identityV0ProofOfWorkCriteria

 #define ZT_IDENTITY_V1_POW_MEMORY_SIZE 131072

-// This is a simpler memory-intensive hash function for V1 identity generation.
-// It's not quite as heavy as the V0 frankenhash, is a little more orderly in
-// its design, but remains relatively resistant to GPU acceleration due to memory
-// requirements for efficient computation.
+struct p_CompareLittleEndian
+{
+#if __BYTE_ORDER == __BIG_ENDIAN
+	ZT_INLINE bool operator()(const uint64_t a,const uint64_t b) const noexcept { return Utils::swapBytes(a) < Utils::swapBytes(b); }
+#else
+	ZT_INLINE bool operator()(const uint64_t a,const uint64_t b) const noexcept { return a < b; }
+#endif
+};
+
+// This is a simpler memory-intensive frankenhash for V1 identity generation.
 bool identityV1ProofOfWorkCriteria(const void *in, const unsigned int len)
 {
-	uint64_t b[ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8];
+	uint64_t w[ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8];

-	SHA384(b, in, len);
-	Utils::zero<ZT_IDENTITY_V1_POW_MEMORY_SIZE - 48>(b + 6);
-	Salsa20(b,b + 4).crypt12(b,b,ZT_IDENTITY_V1_POW_MEMORY_SIZE);
-
-#if __BYTE_ORDER == __BIG_ENDIAN
-	for (unsigned int i=0;i<(ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8);) {
-		const unsigned int i1 = i + 1;
-		const unsigned int i2 = i + 2;
-		const unsigned int i3 = i + 3;
-		b[i] = Utils::swapBytes(b[i]);
-		i += 4;
-		b[i1] = Utils::swapBytes(b[i1]);
-		b[i2] = Utils::swapBytes(b[i2]);
-		b[i3] = Utils::swapBytes(b[i3]);
-	}
-#endif
-
-	std::sort(b,b + (ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8));
-
-#if __BYTE_ORDER == __BIG_ENDIAN
-	for (unsigned int i=0;i<(ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8);) {
-		const unsigned int i1 = i + 1;
-		const unsigned int i2 = i + 2;
-		const unsigned int i3 = i + 3;
-		b[i] = Utils::swapBytes(b[i]);
-		i += 4;
-		b[i1] = Utils::swapBytes(b[i1]);
-		b[i2] = Utils::swapBytes(b[i2]);
-		b[i3] = Utils::swapBytes(b[i3]);
-	}
-#endif
-
-	SHA384(b, b, ZT_IDENTITY_V1_POW_MEMORY_SIZE, in, len);
-
-	return (b[0] % 1093U) == 0;
+	// Fill work buffer with pseudorandom bytes using a construction that should be
+	// relatively hostile to GPU acceleration. GPUs usually implement branching by
+	// executing all branches and then selecting the answer, which means this
+	// construction should require a GPU to do ~3X the work of a CPU per iteration.
+	SHA512(w, in, len);
+	for (unsigned int i = 8, j = 0;i < (ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8);) {
+		uint64_t *const ww = w + i;
+		const uint64_t *const wp = w + j;
+		i += 8;
+		j += 8;
+		if ((wp[0] & 7U) == 0) {
+			SHA512(ww, wp, 64);
+		} else if ((wp[1] & 15U) == 0) {
+			ww[0] = Utils::hton(Utils::ntoh(wp[0]) % 4503599627370101ULL);
+			ww[1] = Utils::hton(Utils::ntoh(wp[1]) % 4503599627370161ULL);
+			ww[2] = Utils::hton(Utils::ntoh(wp[2]) % 4503599627370227ULL);
+			ww[3] = Utils::hton(Utils::ntoh(wp[3]) % 4503599627370287ULL);
+			ww[4] = Utils::hton(Utils::ntoh(wp[4]) % 4503599627370299ULL);
+			ww[5] = Utils::hton(Utils::ntoh(wp[5]) % 4503599627370323ULL);
+			ww[6] = Utils::hton(Utils::ntoh(wp[6]) % 4503599627370353ULL);
+			ww[7] = Utils::hton(Utils::ntoh(wp[7]) % 4503599627370449ULL);
+			SHA384(ww, wp, 128);
+		} else {
+			Salsa20(wp, wp + 4).crypt12(wp, ww, 64);
+		}
+	}
+
+	// Sort 64-bit integers (little-endian) into ascending order and compute a
+	// cryptographic checksum. Sorting makes the order of values dependent on all
+	// other values, making a speed competitive implementation that skips on the
+	// memory requirement extremely hard.
+	std::sort(w, w + (ZT_IDENTITY_V1_POW_MEMORY_SIZE / 8), p_CompareLittleEndian());
+	Poly1305::compute(w, w, ZT_IDENTITY_V1_POW_MEMORY_SIZE, w);
+
+	// PoW criteria passed if this is true. The value 593 was chosen experimentally
+	// to yield a good average performance balancing fast setup with intentional
+	// identity collision resistance.
+	return (Utils::ntoh(w[0]) % 593U) == 0;
 }

 } // anonymous namespace
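The p_CompareLittleEndian comparator above keeps the sorted order identical across endiannesses without the two whole-buffer byte-swap passes the old code needed. A self-contained sketch of the same idea (not from the commit; assumes a glibc-style <endian.h> and GCC/Clang's __builtin_bswap64 in place of Utils::swapBytes):

#include <endian.h>
#include <algorithm>
#include <cstdint>

struct CompareLittleEndian {
#if __BYTE_ORDER == __BIG_ENDIAN
	bool operator()(uint64_t a, uint64_t b) const noexcept { return __builtin_bswap64(a) < __builtin_bswap64(b); }
#else
	bool operator()(uint64_t a, uint64_t b) const noexcept { return a < b; }
#endif
};

int main()
{
	uint64_t w[4] = { 3, 1, 4, 2 }; // stand-in for the 128KiB PoW work buffer
	std::sort(w, w + 4, CompareLittleEndian()); // same order on BE and LE hosts
	return 0;
}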
@@ -145,7 +155,7 @@ bool Identity::generate(const Type t)
 			address.setTo(digest + 59);
 		} while (address.isReserved());
 		delete[] genmem;
-		m_fp.m_cfp.address = address.toInt();
+		m_fp.m_cfp.address = address.toInt(); // address comes from PoW hash for type 0 identities
 		m_computeHash();
 	} break;

@@ -167,6 +177,7 @@ bool Identity::generate(const Type t)
 			// If we passed PoW then check that the address is valid, otherwise loop
 			// back around and run the whole process again.
 			m_computeHash();
+			m_fp.m_cfp.address = Address(m_fp.m_cfp.hash).toInt();
 			if (!m_fp.address().isReserved())
 				break;
 		}

@@ -351,10 +362,8 @@ bool Identity::fromString(const char *str)

 			case 0:
 				m_fp.m_cfp.address = Utils::hexStrToU64(f) & ZT_ADDRESS_MASK;
-				if (m_fp.address().isReserved()) {
-					memoryZero(this);
+				if (m_fp.address().isReserved())
 					return false;
-				}
 				break;

 			case 1:

@@ -363,7 +372,6 @@ bool Identity::fromString(const char *str)
 				} else if ((f[0] == '1') && (!f[1])) {
 					m_type = P384;
 				} else {
-					memoryZero(this);
 					return false;
 				}
 				break;

@@ -372,17 +380,13 @@ bool Identity::fromString(const char *str)
 				switch (m_type) {

 					case C25519:
-						if (Utils::unhex(f, strlen(f), m_pub, ZT_C25519_COMBINED_PUBLIC_KEY_SIZE) != ZT_C25519_COMBINED_PUBLIC_KEY_SIZE) {
-							memoryZero(this);
+						if (Utils::unhex(f, strlen(f), m_pub, ZT_C25519_COMBINED_PUBLIC_KEY_SIZE) != ZT_C25519_COMBINED_PUBLIC_KEY_SIZE)
 							return false;
-						}
 						break;

 					case P384:
-						if (Utils::b32d(f, m_pub, sizeof(m_pub)) != sizeof(m_pub)) {
-							memoryZero(this);
+						if (Utils::b32d(f, m_pub, sizeof(m_pub)) != sizeof(m_pub))
 							return false;
-						}
 						break;

 				}

@@ -394,7 +398,6 @@ bool Identity::fromString(const char *str)

 					case C25519:
 						if (Utils::unhex(f, strlen(f), m_priv, ZT_C25519_COMBINED_PRIVATE_KEY_SIZE) != ZT_C25519_COMBINED_PRIVATE_KEY_SIZE) {
-							memoryZero(this);
 							return false;
 						} else {
 							m_hasPrivate = true;

@@ -403,7 +406,6 @@ bool Identity::fromString(const char *str)

 					case P384:
 						if (Utils::b32d(f, m_priv, sizeof(m_priv)) != sizeof(m_priv)) {
-							memoryZero(this);
 							return false;
 						} else {
 							m_hasPrivate = true;

@@ -417,16 +419,12 @@ bool Identity::fromString(const char *str)
 		}
 	}

-	if (fno < 3) {
-		memoryZero(this);
+	if (fno < 3)
 		return false;
-	}

 	m_computeHash();
-	if ((m_type == P384) && (m_fp.address() != Address(m_fp.hash()))) {
-		memoryZero(this);
+	if ((m_type == P384) && (m_fp.address() != Address(m_fp.hash())))
 		return false;
-	}

 	return true;
 }

@@ -533,7 +531,6 @@ void Identity::m_computeHash()
 			break;
 		case P384:
 			SHA384(m_fp.m_cfp.hash, m_pub, sizeof(m_pub));
-			m_fp.m_cfp.address = Address(m_fp.m_cfp.hash).toInt();
 			break;
 	}
 }
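One consequence of the Identity::generate and m_computeHash hunks above is that a v1 (P384) identity's 40-bit address is now always rederived from the fingerprint hash at the one place that needs it, rather than being written as a side effect of every hash computation. A sketch of that derivation (not from the commit; it assumes Address(hash) takes the leading ZT_ADDRESS_LENGTH == 5 bytes of the SHA-384 fingerprint, which is what the surrounding code suggests):

#include <cstdint>

static uint64_t addressFromHash(const uint8_t h[48]) // 48 = SHA-384 output size
{
	uint64_t a = 0;
	for (int i = 0; i < 5; ++i) // 5 bytes -> 40-bit ZeroTier address
		a = (a << 8) | (uint64_t)h[i];
	return a;
}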
node/Constants.hpp

@@ -100,7 +100,11 @@

 #if (defined(__amd64) || defined(__amd64__) || defined(__x86_64) || defined(__x86_64__) || defined(__AMD64) || defined(__AMD64__) || defined(_M_X64))
 #define ZT_ARCH_X64 1
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <immintrin.h>
 #endif

 #if defined(ZT_ARCH_X64) || defined(i386) || defined(__i386) || defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(_M_IX86) || defined(__X86__) || defined(_X86_) || defined(__I86__) || defined(__INTEL__) || defined(__386)
 #define ZT_ARCH_X86 1
 #endif
@@ -176,7 +176,7 @@ static const C25519TestVector C25519_TEST_VECTORS[ZT_NUM_C25519_TEST_VECTORS] =
 };

 #define IDENTITY_V0_KNOWN_GOOD_0 "8e4df28b72:0:ac3d46abe0c21f3cfe7a6c8d6a85cfcffcb82fbd55af6a4d6350657c68200843fa2e16f9418bbd9702cae365f2af5fb4c420908b803a681d4daef6114d78a2d7:bd8dd6e4ce7022d2f812797a80c6ee8ad180dc4ebf301dec8b06d1be08832bddd63a2f1cfa7b2c504474c75bdc8898ba476ef92e8e2d0509f8441985171ff16e"
-#define IDENTITY_V1_KNOWN_GOOD_0 "2d48f7a238:1:gltupn4yrt226o3vebl7m7m5hpndhvfz66nzx6gwgtgbsgs5xr7dpz5aiv636zijrxayuu2ydpff4zgho7o6gpvx62njwkavqordxcceajs2fif4y2ytofpyr25mmxmanbf4fmdiitiq2b53nmx4ckjcmtyqrkqye2jkdainmkqbtil3dhyuiwa:xg73bkrxptymo7kyyd6efu2o7ziemyu3lpgtip53ejsqukt6l2gebq5uofzt6cd2455st5iwrdgc2ft3twkdzrkunu6x5imdz6jt27qopsvqpdijx5cqgukpjxrtyx73j42socym5pi5hy2ir5yma7by4gmtjgvvu3sxbb3qv2yuicykyz2q"
+#define IDENTITY_V1_KNOWN_GOOD_0 "b0c2badfeb:1:sueysfvujydbkwykbdfemkm5cjgpezjdrzvfczmmfwd2i2ffrrasybhqkz5xegfrrumoidwqyuovprplysmbhtmkim2whjvivub5tcubakzzkhejhqsaiajcu3eooywx3r7sxyflok7b4lgwjv4qqeahkhh4uwog6ke3yqaie2jp3b4wf2pvo2y:xwfmcy2ptfocxnldnkdhzgo4xj73peve3c4ijnlnr442boef7xin34huerixeoes6jsq5g26rvtngjmhqopim7jxssfkw57z2vxidxkutcr4jzu7mmjpnvixwvmbo26nfbd3albf3fyfzi3py6o4bzcnh7thskzvuks5adscqjnseoajjdka"

 // --------------------------------------------------------------------------------------------------------------------

@@ -695,10 +695,7 @@ extern "C" const char *ZTT_general()

 	Utils::scopy(tmp,sizeof(tmp),IDENTITY_V1_KNOWN_GOOD_0);
 	tmp[0] = '0';
-	if (id.fromString(tmp)) {
-		ZT_T_PRINTF("FAILED (parse of known-bad identity returned ok)" ZT_EOL_S);
-		return "Identity test failed: parse of known-bad identity";
-	}
+	id.fromString(tmp);
 	if (id.locallyValidate()) {
 		ZT_T_PRINTF("FAILED (validation of known-bad identity returned ok)" ZT_EOL_S);
 		return "Identity test failed: validation of known-bad identity";
node/Utils.cpp

@@ -35,6 +35,7 @@ namespace Utils {
 #ifdef ZT_ARCH_X64
 CPUIDRegisters::CPUIDRegisters() noexcept
 {
+	uint32_t eax,ebx,ecx,edx;
 #ifdef __WINDOWS__
 	int regs[4];
 	__cpuid(regs,1);

@@ -50,7 +51,23 @@ CPUIDRegisters::CPUIDRegisters() noexcept
 	);
 #endif
 	rdrand = ((ecx & (1U << 30U)) != 0);
-	aes = ( ((ecx & (1U << 25U)) != 0) && ((ecx & (1U << 19U)) != 0) && ((ecx & (1U << 1U)) != 0) ); // AES, PCLMUL, SSE4.1
+	aes = ( ((ecx & (1U << 25U)) != 0) && ((ecx & (1U << 19U)) != 0) && ((ecx & (1U << 1U)) != 0) );
+	avx = ((ecx & (1U << 28U)) != 0); // AVX is CPUID leaf 1 ECX bit 28 (bit 25 is AES-NI)
+#ifdef __WINDOWS__
+	TODO
+#else
+	__asm__ __volatile__ (
+		"cpuid"
+		: "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx)
+		: "a"(7),"c"(0)
+	);
+#endif
+	vaes = aes && avx && ((ecx & (1U << 9U)) != 0);
+	vpclmulqdq = aes && avx && ((ecx & (1U << 10U)) != 0);
+	avx2 = avx && ((ebx & (1U << 5U)) != 0);
+	avx512f = avx && ((ebx & (1U << 16U)) != 0);
+	sha = ((ebx & (1U << 29U)) != 0);
+	fsrm = ((edx & (1U << 4U)) != 0);
 }
 const CPUIDRegisters CPUID;
 #endif
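The new second cpuid query above (leaf 7, subleaf 0) is what exposes the AVX2/AVX512F/VAES/VPCLMULQDQ/SHA/FSRM bits. On GCC and Clang the same query can be made without inline assembly via <cpuid.h>, and MSVC's __cpuidex would be the natural way to fill in the __WINDOWS__ TODO. A sketch, not from the commit:

#include <cpuid.h>

struct Leaf7 { unsigned int eax, ebx, ecx, edx; };

// Returns false if CPUID leaf 7 is unsupported. Bit positions as used in the
// diff: EBX bit 5 = AVX2, EBX bit 16 = AVX512F, EBX bit 29 = SHA,
// ECX bit 9 = VAES, ECX bit 10 = VPCLMULQDQ, EDX bit 4 = FSRM.
static bool readLeaf7(Leaf7 &r)
{
	return __get_cpuid_count(7, 0, &r.eax, &r.ebx, &r.ecx, &r.edx) != 0;
}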
node/Utils.hpp

@@ -16,12 +16,6 @@

 #include "Constants.hpp"

-#ifdef ZT_ARCH_X64
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <immintrin.h>
-#endif
-
 #include <utility>
 #include <algorithm>
 #include <memory>

@@ -60,9 +54,15 @@ namespace Utils {
 struct CPUIDRegisters
 {
 	CPUIDRegisters() noexcept;
-	uint32_t eax,ebx,ecx,edx;
 	bool rdrand;
 	bool aes;
+	bool avx;
+	bool vaes; // implies AVX
+	bool vpclmulqdq; // implies AVX
+	bool avx2;
+	bool avx512f;
+	bool sha;
+	bool fsrm;
 };
 extern const CPUIDRegisters CPUID;
 #endif
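Typical consumption of the new flags, sketched (not from the commit): CPUID is a const global constructed before main(), so feature tests at call sites are plain boolean loads, as in AES::CTR::crypt above.

#include <cstdio>

struct CPUIDRegisters { bool aes = true, vaes = false, avx512f = false; }; // stand-in
static const CPUIDRegisters CPUID; // in ZeroTier: extern, defined in Utils.cpp

int main()
{
	if (CPUID.vaes)
		std::puts(CPUID.avx512f ? "AES-CTR: VAES 512-bit path" : "AES-CTR: VAES 256-bit path");
	else if (CPUID.aes)
		std::puts("AES-CTR: AES-NI path");
	else
		std::puts("AES-CTR: software fallback");
	return 0;
}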