More AES stuff

This commit is contained in:
Adam Ierymenko 2019-08-12 15:37:50 -07:00
parent f5e71f64ed
commit f7bc9f01c9
No known key found for this signature in database
GPG key ID: 1657198823E52A61

View file

@ -97,6 +97,13 @@ public:
/**
 * Decrypt a GCM message and check its authentication tag.
 *
 * Only the AES-NI path is implemented. If ZT_AES_AESNI is not defined or
 * HW_ACCEL is false this calls abort() (software fallback is still TODO).
 *
 * NOTE: the decrypt routine is handed 'out' before the tag comparison is
 * made, so callers must treat the contents of 'out' as untrusted (and
 * discard them) whenever this returns false.
 *
 * @param iv 12-byte initialization vector
 * @param in Input (ciphertext) buffer
 * @param inlen Length of input in bytes
 * @param assoc Associated data covered by the tag
 * @param assoclen Length of associated data in bytes
 * @param out Output buffer; presumably must hold at least inlen bytes -- confirm against _decrypt_gcm256_aesni
 * @param tag Tag received with the message
 * @param taglen Length of tag in bytes, must be in the range 1..16
 * @return True if the computed tag matches 'tag'; false on mismatch or invalid taglen
 */
inline bool gcmDecrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,const uint8_t *tag,unsigned int taglen)
{
#ifdef ZT_AES_AESNI
	if (HW_ACCEL) {
		uint8_t tagbuf[16];

		// The computed tag lives in a fixed 16-byte buffer: a larger taglen
		// would overrun it, and a zero taglen would compare nothing and so
		// accept any message. Reject both before touching any data.
		if ((taglen == 0)||(taglen > sizeof(tagbuf)))
			return false;

		_decrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tagbuf,taglen);
		return Utils::secureEq(tagbuf,tag,taglen);
	}
#endif
	abort(); // TODO: software
	return false;
}
@ -217,41 +224,41 @@ private:
{
__m128i t1,t2,t3,t4,t5,t6;
y = _swap128_aesni(y);
t1 = _mm_clmulepi64_si128(h, y, 0x00);
t2 = _mm_clmulepi64_si128(h, y, 0x01);
t3 = _mm_clmulepi64_si128(h, y, 0x10);
t4 = _mm_clmulepi64_si128(h, y, 0x11);
t2 = _mm_xor_si128(t2, t3);
t3 = _mm_slli_si128(t2, 8);
t2 = _mm_srli_si128(t2, 8);
t1 = _mm_xor_si128(t1, t3);
t4 = _mm_xor_si128(t4, t2);
t5 = _mm_srli_epi32(t1, 31);
t1 = _mm_slli_epi32(t1, 1);
t6 = _mm_srli_epi32(t4, 31);
t4 = _mm_slli_epi32(t4, 1);
t3 = _mm_srli_si128(t5, 12);
t6 = _mm_slli_si128(t6, 4);
t5 = _mm_slli_si128(t5, 4);
t1 = _mm_or_si128(t1, t5);
t4 = _mm_or_si128(t4, t6);
t4 = _mm_or_si128(t4, t3);
t5 = _mm_slli_epi32(t1, 31);
t6 = _mm_slli_epi32(t1, 30);
t3 = _mm_slli_epi32(t1, 25);
t5 = _mm_xor_si128(t5, t6);
t5 = _mm_xor_si128(t5, t3);
t6 = _mm_srli_si128(t5, 4);
t4 = _mm_xor_si128(t4, t6);
t5 = _mm_slli_si128(t5, 12);
t1 = _mm_xor_si128(t1, t5);
t4 = _mm_xor_si128(t4, t1);
t5 = _mm_srli_epi32(t1, 1);
t2 = _mm_srli_epi32(t1, 2);
t3 = _mm_srli_epi32(t1, 7);
t4 = _mm_xor_si128(t4, t2);
t4 = _mm_xor_si128(t4, t3);
t4 = _mm_xor_si128(t4, t5);
t1 = _mm_clmulepi64_si128(h,y,0x00);
t2 = _mm_clmulepi64_si128(h,y,0x01);
t3 = _mm_clmulepi64_si128(h,y,0x10);
t4 = _mm_clmulepi64_si128(h,y,0x11);
t2 = _mm_xor_si128(t2,t3);
t3 = _mm_slli_si128(t2,8);
t2 = _mm_srli_si128(t2,8);
t1 = _mm_xor_si128(t1,t3);
t4 = _mm_xor_si128(t4,t2);
t5 = _mm_srli_epi32(t1,31);
t1 = _mm_slli_epi32(t1,1);
t6 = _mm_srli_epi32(t4,31);
t4 = _mm_slli_epi32(t4,1);
t3 = _mm_srli_si128(t5,12);
t6 = _mm_slli_si128(t6,4);
t5 = _mm_slli_si128(t5,4);
t1 = _mm_or_si128(t1,t5);
t4 = _mm_or_si128(t4,t6);
t4 = _mm_or_si128(t4,t3);
t5 = _mm_slli_epi32(t1,31);
t6 = _mm_slli_epi32(t1,30);
t3 = _mm_slli_epi32(t1,25);
t5 = _mm_xor_si128(t5,t6);
t5 = _mm_xor_si128(t5,t3);
t6 = _mm_srli_si128(t5,4);
t4 = _mm_xor_si128(t4,t6);
t5 = _mm_slli_si128(t5,12);
t1 = _mm_xor_si128(t1,t5);
t4 = _mm_xor_si128(t4,t1);
t5 = _mm_srli_epi32(t1,1);
t2 = _mm_srli_epi32(t1,2);
t3 = _mm_srli_epi32(t1,7);
t4 = _mm_xor_si128(t4,t2);
t4 = _mm_xor_si128(t4,t3);
t4 = _mm_xor_si128(t4,t5);
return _swap128_aesni(t4);
}
static inline __m128i _mult4xor_aesni(__m128i h1,__m128i h2,__m128i h3,__m128i h4,__m128i d1,__m128i d2,__m128i d3,__m128i d4)
@ -261,75 +268,75 @@ private:
d2 = _swap128_aesni(d2);
d3 = _swap128_aesni(d3);
d4 = _swap128_aesni(d4);
t0 = _mm_clmulepi64_si128(h1, d1, 0x00);
t1 = _mm_clmulepi64_si128(h2, d2, 0x00);
t2 = _mm_clmulepi64_si128(h3, d3, 0x00);
t3 = _mm_clmulepi64_si128(h4, d4, 0x00);
t8 = _mm_xor_si128(t0, t1);
t8 = _mm_xor_si128(t8, t2);
t8 = _mm_xor_si128(t8, t3);
t4 = _mm_clmulepi64_si128(h1, d1, 0x11);
t5 = _mm_clmulepi64_si128(h2, d2, 0x11);
t6 = _mm_clmulepi64_si128(h3, d3, 0x11);
t7 = _mm_clmulepi64_si128(h4, d4, 0x11);
t9 = _mm_xor_si128(t4, t5);
t9 = _mm_xor_si128(t9, t6);
t9 = _mm_xor_si128(t9, t7);
t0 = _mm_shuffle_epi32(h1, 78);
t4 = _mm_shuffle_epi32(d1, 78);
t0 = _mm_xor_si128(t0, h1);
t4 = _mm_xor_si128(t4, d1);
t1 = _mm_shuffle_epi32(h2, 78);
t5 = _mm_shuffle_epi32(d2, 78);
t1 = _mm_xor_si128(t1, h2);
t5 = _mm_xor_si128(t5, d2);
t2 = _mm_shuffle_epi32(h3, 78);
t6 = _mm_shuffle_epi32(d3, 78);
t2 = _mm_xor_si128(t2, h3);
t6 = _mm_xor_si128(t6, d3);
t3 = _mm_shuffle_epi32(h4, 78);
t7 = _mm_shuffle_epi32(d4, 78);
t3 = _mm_xor_si128(t3, h4);
t7 = _mm_xor_si128(t7, d4);
t0 = _mm_clmulepi64_si128(t0, t4, 0x00);
t1 = _mm_clmulepi64_si128(t1, t5, 0x00);
t2 = _mm_clmulepi64_si128(t2, t6, 0x00);
t3 = _mm_clmulepi64_si128(t3, t7, 0x00);
t0 = _mm_xor_si128(t0, t8);
t0 = _mm_xor_si128(t0, t9);
t0 = _mm_xor_si128(t1, t0);
t0 = _mm_xor_si128(t2, t0);
t0 = _mm_xor_si128(t3, t0);
t4 = _mm_slli_si128(t0, 8);
t0 = _mm_srli_si128(t0, 8);
t3 = _mm_xor_si128(t4, t8);
t6 = _mm_xor_si128(t0, t9);
t7 = _mm_srli_epi32(t3, 31);
t8 = _mm_srli_epi32(t6, 31);
t3 = _mm_slli_epi32(t3, 1);
t6 = _mm_slli_epi32(t6, 1);
t9 = _mm_srli_si128(t7, 12);
t8 = _mm_slli_si128(t8, 4);
t7 = _mm_slli_si128(t7, 4);
t3 = _mm_or_si128(t3, t7);
t6 = _mm_or_si128(t6, t8);
t6 = _mm_or_si128(t6, t9);
t7 = _mm_slli_epi32(t3, 31);
t8 = _mm_slli_epi32(t3, 30);
t9 = _mm_slli_epi32(t3, 25);
t7 = _mm_xor_si128(t7, t8);
t7 = _mm_xor_si128(t7, t9);
t8 = _mm_srli_si128(t7, 4);
t7 = _mm_slli_si128(t7, 12);
t3 = _mm_xor_si128(t3, t7);
t2 = _mm_srli_epi32(t3, 1);
t4 = _mm_srli_epi32(t3, 2);
t5 = _mm_srli_epi32(t3, 7);
t2 = _mm_xor_si128(t2, t4);
t2 = _mm_xor_si128(t2, t5);
t2 = _mm_xor_si128(t2, t8);
t3 = _mm_xor_si128(t3, t2);
t6 = _mm_xor_si128(t6, t3);
t0 = _mm_clmulepi64_si128(h1,d1,0x00);
t1 = _mm_clmulepi64_si128(h2,d2,0x00);
t2 = _mm_clmulepi64_si128(h3,d3,0x00);
t3 = _mm_clmulepi64_si128(h4,d4,0x00);
t8 = _mm_xor_si128(t0,t1);
t8 = _mm_xor_si128(t8,t2);
t8 = _mm_xor_si128(t8,t3);
t4 = _mm_clmulepi64_si128(h1,d1,0x11);
t5 = _mm_clmulepi64_si128(h2,d2,0x11);
t6 = _mm_clmulepi64_si128(h3,d3,0x11);
t7 = _mm_clmulepi64_si128(h4,d4,0x11);
t9 = _mm_xor_si128(t4,t5);
t9 = _mm_xor_si128(t9,t6);
t9 = _mm_xor_si128(t9,t7);
t0 = _mm_shuffle_epi32(h1,78);
t4 = _mm_shuffle_epi32(d1,78);
t0 = _mm_xor_si128(t0,h1);
t4 = _mm_xor_si128(t4,d1);
t1 = _mm_shuffle_epi32(h2,78);
t5 = _mm_shuffle_epi32(d2,78);
t1 = _mm_xor_si128(t1,h2);
t5 = _mm_xor_si128(t5,d2);
t2 = _mm_shuffle_epi32(h3,78);
t6 = _mm_shuffle_epi32(d3,78);
t2 = _mm_xor_si128(t2,h3);
t6 = _mm_xor_si128(t6,d3);
t3 = _mm_shuffle_epi32(h4,78);
t7 = _mm_shuffle_epi32(d4,78);
t3 = _mm_xor_si128(t3,h4);
t7 = _mm_xor_si128(t7,d4);
t0 = _mm_clmulepi64_si128(t0,t4,0x00);
t1 = _mm_clmulepi64_si128(t1,t5,0x00);
t2 = _mm_clmulepi64_si128(t2,t6,0x00);
t3 = _mm_clmulepi64_si128(t3,t7,0x00);
t0 = _mm_xor_si128(t0,t8);
t0 = _mm_xor_si128(t0,t9);
t0 = _mm_xor_si128(t1,t0);
t0 = _mm_xor_si128(t2,t0);
t0 = _mm_xor_si128(t3,t0);
t4 = _mm_slli_si128(t0,8);
t0 = _mm_srli_si128(t0,8);
t3 = _mm_xor_si128(t4,t8);
t6 = _mm_xor_si128(t0,t9);
t7 = _mm_srli_epi32(t3,31);
t8 = _mm_srli_epi32(t6,31);
t3 = _mm_slli_epi32(t3,1);
t6 = _mm_slli_epi32(t6,1);
t9 = _mm_srli_si128(t7,12);
t8 = _mm_slli_si128(t8,4);
t7 = _mm_slli_si128(t7,4);
t3 = _mm_or_si128(t3,t7);
t6 = _mm_or_si128(t6,t8);
t6 = _mm_or_si128(t6,t9);
t7 = _mm_slli_epi32(t3,31);
t8 = _mm_slli_epi32(t3,30);
t9 = _mm_slli_epi32(t3,25);
t7 = _mm_xor_si128(t7,t8);
t7 = _mm_xor_si128(t7,t9);
t8 = _mm_srli_si128(t7,4);
t7 = _mm_slli_si128(t7,12);
t3 = _mm_xor_si128(t3,t7);
t2 = _mm_srli_epi32(t3,1);
t4 = _mm_srli_epi32(t3,2);
t5 = _mm_srli_epi32(t3,7);
t2 = _mm_xor_si128(t2,t4);
t2 = _mm_xor_si128(t2,t5);
t2 = _mm_xor_si128(t2,t8);
t3 = _mm_xor_si128(t3,t2);
t6 = _mm_xor_si128(t6,t3);
return _swap128_aesni(t6);
}
/**
 * Single hash update step: XOR block x into accumulator y and pass the
 * result, together with h, through _mult_block_aesni().
 *
 * @param h Hash key operand handed to _mult_block_aesni()
 * @param y Running accumulator
 * @param x Block to fold into the accumulator
 * @return Updated accumulator
 */
static inline __m128i _ghash_aesni(__m128i h,__m128i y,__m128i x)
{
	const __m128i folded = _mm_xor_si128(y,x);
	return _mult_block_aesni(h,folded);
}
@ -375,13 +382,13 @@ private:
d3 = _mm_loadu_si128(ab + i + 2);
d4 = _mm_loadu_si128(ab + i + 3);
y = _mm_xor_si128(y, d1);
y = _mult4xor_aesni(h1, h2, h3, h4, y, d2, d3, d4);
y = _mult4xor_aesni(h1,h2,h3,h4,y,d2,d3,d4);
}
for (i = pblocks; i < blocks; i++)
y = _ghash_aesni(_k.ni.h,y,_mm_loadu_si128(ab + i));
if (rem) {
last = _mm_setzero_si128();
memcpy(&last, ab + blocks, rem);
memcpy(&last,ab + blocks,rem);
y = _ghash_aesni(_k.ni.h,y,last);
}
return y;
@ -395,7 +402,7 @@ private:
}
inline void _icv_crypt_aesni(__m128i y,__m128i j,uint8_t *icv,unsigned int icvsize) const
{
__m128i *ks,t,b;
__m128i t,b;
t = _mm_xor_si128(j,_k.ni.k[0]);
t = _mm_aesenc_si128(t,_k.ni.k[1]);
t = _mm_aesenc_si128(t,_k.ni.k[2]);
@ -418,7 +425,7 @@ private:
inline __m128i _encrypt_gcm_rem_aesni(unsigned int rem,const void *in,void *out,__m128i cb,__m128i y) const
{
__m128i *ks,t,b;
__m128i t,b;
memset(&b,0,sizeof(b));
memcpy(&b,in,rem);
t = _mm_xor_si128(cb,_k.ni.k[0]);
@ -436,15 +443,15 @@ private:
t = _mm_aesenc_si128(t,_k.ni.k[12]);
t = _mm_aesenc_si128(t,_k.ni.k[13]);
t = _mm_aesenclast_si128(t,_k.ni.k[14]);
b = _mm_xor_si128(t, b);
b = _mm_xor_si128(t,b);
memcpy(out,&b,rem);
memset((u_char*)&b + rem,0,16 - rem);
return _ghash_aesni(_k.ni.h,y,b);
}
inline void _encrypt_gcm256_aesni(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize) const
{
__m128i d1,d2,d3,d4,t1,t2,t3,t4;
__m128i *ks,y,j,cb,*bi,*bo;
__m128i d1,d2,d3,d4,t1,t2,t3,t4,k;
__m128i y,j,cb,*bi,*bo;
j = _create_j_aesni(iv);
cb = _increment_be_aesni(j);
@ -461,102 +468,102 @@ private:
d2 = _mm_loadu_si128(bi + i + 1);
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
t1 = _mm_xor_si128(cb, _k.ni.k[0]);
t1 = _mm_xor_si128(cb,k = _k.ni.k[0]);
cb = _increment_be_aesni(cb);
t2 = _mm_xor_si128(cb, _k.ni.k[0]);
t2 = _mm_xor_si128(cb,k);
cb = _increment_be_aesni(cb);
t3 = _mm_xor_si128(cb, _k.ni.k[0]);
t3 = _mm_xor_si128(cb,k);
cb = _increment_be_aesni(cb);
t4 = _mm_xor_si128(cb, _k.ni.k[0]);
t4 = _mm_xor_si128(cb,k);
cb = _increment_be_aesni(cb);
t1 = _mm_aesenc_si128(t1, _k.ni.k[1]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[1]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[1]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[1]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[2]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[2]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[2]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[2]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[3]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[3]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[3]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[3]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[4]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[4]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[4]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[4]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[5]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[5]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[5]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[5]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[6]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[6]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[6]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[6]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[7]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[7]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[7]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[7]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[8]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[8]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[8]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[8]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[9]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[9]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[9]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[9]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[10]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[10]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[10]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[10]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[11]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[11]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[11]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[11]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[12]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[12]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[12]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[12]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[13]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[13]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[13]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[13]);
t1 = _mm_aesenclast_si128(t1, _k.ni.k[14]);
t2 = _mm_aesenclast_si128(t2, _k.ni.k[14]);
t3 = _mm_aesenclast_si128(t3, _k.ni.k[14]);
t4 = _mm_aesenclast_si128(t4, _k.ni.k[14]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
t4 = _mm_xor_si128(t4, d4);
y = _mm_xor_si128(y, t1);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[1]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[2]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[3]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[4]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[5]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[6]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[7]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[8]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[9]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[10]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[11]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[12]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[13]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenclast_si128(t1,k = _k.ni.k[14]);
t2 = _mm_aesenclast_si128(t2,k);
t3 = _mm_aesenclast_si128(t3,k);
t4 = _mm_aesenclast_si128(t4,k);
t1 = _mm_xor_si128(t1,d1);
t2 = _mm_xor_si128(t2,d2);
t3 = _mm_xor_si128(t3,d3);
t4 = _mm_xor_si128(t4,d4);
y = _mm_xor_si128(y,t1);
y = _mult4xor_aesni(_k.ni.hhhh,_k.ni.hhh,_k.ni.hh,_k.ni.h,y,t2,t3,t4);
_mm_storeu_si128(bo + i + 0, t1);
_mm_storeu_si128(bo + i + 1, t2);
_mm_storeu_si128(bo + i + 2, t3);
_mm_storeu_si128(bo + i + 3, t4);
_mm_storeu_si128(bo + i + 0,t1);
_mm_storeu_si128(bo + i + 1,t2);
_mm_storeu_si128(bo + i + 2,t3);
_mm_storeu_si128(bo + i + 3,t4);
}
for (i=pblocks;i<blocks;++i) {
d1 = _mm_loadu_si128(bi + i);
t1 = _mm_xor_si128(cb, _k.ni.k[0]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[1]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[2]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[3]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[4]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[5]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[6]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[7]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[8]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[9]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[10]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[11]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[12]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[13]);
t1 = _mm_aesenclast_si128(t1, _k.ni.k[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
y = _ghash_aesni(_k.ni.h, y, t1);
t1 = _mm_xor_si128(cb,_k.ni.k[0]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[1]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[2]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[3]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[4]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[5]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[6]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[7]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[8]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[9]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[10]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[11]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[12]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[13]);
t1 = _mm_aesenclast_si128(t1,_k.ni.k[14]);
t1 = _mm_xor_si128(t1,d1);
_mm_storeu_si128(bo + i,t1);
y = _ghash_aesni(_k.ni.h,y,t1);
cb = _increment_be_aesni(cb);
}
@ -567,10 +574,10 @@ private:
}
inline __m128i _decrypt_gcm_rem_aesni(unsigned int rem,const void *in,void *out,__m128i cb,__m128i y)
{
__m128i *ks, t, b;
memset(&b, 0, sizeof(b));
memcpy(&b, in, rem);
y = _ghash_aesni(_k.ni.h, y, b);
__m128i t,b;
memset(&b,0,sizeof(b));
memcpy(&b,in,rem);
y = _ghash_aesni(_k.ni.h,y,b);
t = _mm_xor_si128(cb,_k.ni.k[0]);
t = _mm_aesenc_si128(t,_k.ni.k[1]);
t = _mm_aesenc_si128(t,_k.ni.k[2]);
@ -585,16 +592,16 @@ private:
t = _mm_aesenc_si128(t,_k.ni.k[11]);
t = _mm_aesenc_si128(t,_k.ni.k[12]);
t = _mm_aesenc_si128(t,_k.ni.k[13]);
t = _mm_aesenclast_si128(t, _k.ni.k[14]);
b = _mm_xor_si128(t, b);
memcpy(out, &b, rem);
t = _mm_aesenclast_si128(t,_k.ni.k[14]);
b = _mm_xor_si128(t,b);
memcpy(out,&b,rem);
return y;
}
inline void decrypt_gcm256(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize)
inline void _decrypt_gcm256_aesni(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize)
{
__m128i d1, d2, d3, d4, t1, t2, t3, t4;
__m128i *ks, y, j, cb, *bi, *bo;
unsigned int blocks, pblocks, rem;
__m128i d1,d2,d3,d4,t1,t2,t3,t4,k;
__m128i y,j,cb,*bi,*bo;
unsigned int blocks,pblocks,rem;
j = _create_j_aesni(iv);
cb = _increment_be_aesni(j);
@ -611,102 +618,102 @@ private:
d2 = _mm_loadu_si128(bi + i + 1);
d3 = _mm_loadu_si128(bi + i + 2);
d4 = _mm_loadu_si128(bi + i + 3);
y = _mm_xor_si128(y, d1);
y = _mm_xor_si128(y,d1);
y = _mult4xor_aesni(_k.ni.hhhh,_k.ni.hhh,_k.ni.hh,_k.ni.h,y,d2,d3,d4);
t1 = _mm_xor_si128(cb, _k.ni.k[0]);
t1 = _mm_xor_si128(cb,k = _k.ni.k[0]);
cb = _increment_be_aesni(cb);
t2 = _mm_xor_si128(cb, _k.ni.k[0]);
t2 = _mm_xor_si128(cb,k);
cb = _increment_be_aesni(cb);
t3 = _mm_xor_si128(cb, _k.ni.k[0]);
t3 = _mm_xor_si128(cb,k);
cb = _increment_be_aesni(cb);
t4 = _mm_xor_si128(cb, _k.ni.k[0]);
t4 = _mm_xor_si128(cb,k);
cb = _increment_be_aesni(cb);
t1 = _mm_aesenc_si128(t1, _k.ni.k[1]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[1]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[1]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[1]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[2]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[2]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[2]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[2]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[3]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[3]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[3]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[3]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[4]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[4]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[4]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[4]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[5]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[5]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[5]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[5]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[6]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[6]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[6]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[6]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[7]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[7]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[7]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[7]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[8]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[8]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[8]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[8]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[9]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[9]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[9]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[9]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[10]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[10]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[10]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[10]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[11]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[11]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[11]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[11]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[12]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[12]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[12]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[12]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[13]);
t2 = _mm_aesenc_si128(t2, _k.ni.k[13]);
t3 = _mm_aesenc_si128(t3, _k.ni.k[13]);
t4 = _mm_aesenc_si128(t4, _k.ni.k[13]);
t1 = _mm_aesenclast_si128(t1, _k.ni.k[14]);
t2 = _mm_aesenclast_si128(t2, _k.ni.k[14]);
t3 = _mm_aesenclast_si128(t3, _k.ni.k[14]);
t4 = _mm_aesenclast_si128(t4, _k.ni.k[14]);
t1 = _mm_xor_si128(t1, d1);
t2 = _mm_xor_si128(t2, d2);
t3 = _mm_xor_si128(t3, d3);
t4 = _mm_xor_si128(t4, d4);
_mm_storeu_si128(bo + i + 0, t1);
_mm_storeu_si128(bo + i + 1, t2);
_mm_storeu_si128(bo + i + 2, t3);
_mm_storeu_si128(bo + i + 3, t4);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[1]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[2]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[3]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[4]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[5]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[6]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[7]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[8]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[9]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[10]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[11]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[12]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenc_si128(t1,k = _k.ni.k[13]);
t2 = _mm_aesenc_si128(t2,k);
t3 = _mm_aesenc_si128(t3,k);
t4 = _mm_aesenc_si128(t4,k);
t1 = _mm_aesenclast_si128(t1,k = _k.ni.k[14]);
t2 = _mm_aesenclast_si128(t2,k);
t3 = _mm_aesenclast_si128(t3,k);
t4 = _mm_aesenclast_si128(t4,k);
t1 = _mm_xor_si128(t1,d1);
t2 = _mm_xor_si128(t2,d2);
t3 = _mm_xor_si128(t3,d3);
t4 = _mm_xor_si128(t4,d4);
_mm_storeu_si128(bo + i + 0,t1);
_mm_storeu_si128(bo + i + 1,t2);
_mm_storeu_si128(bo + i + 2,t3);
_mm_storeu_si128(bo + i + 3,t4);
}
for (i=pblocks;i<blocks;i++) {
d1 = _mm_loadu_si128(bi + i);
y = _ghash_aesni(_k.ni.h,y,d1);
t1 = _mm_xor_si128(cb, _k.ni.k[0]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[1]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[2]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[3]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[4]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[5]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[6]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[7]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[8]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[9]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[10]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[11]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[12]);
t1 = _mm_aesenc_si128(t1, _k.ni.k[13]);
t1 = _mm_aesenclast_si128(t1, _k.ni.k[14]);
t1 = _mm_xor_si128(t1, d1);
_mm_storeu_si128(bo + i, t1);
t1 = _mm_xor_si128(cb,_k.ni.k[0]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[1]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[2]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[3]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[4]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[5]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[6]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[7]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[8]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[9]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[10]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[11]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[12]);
t1 = _mm_aesenc_si128(t1,_k.ni.k[13]);
t1 = _mm_aesenclast_si128(t1,_k.ni.k[14]);
t1 = _mm_xor_si128(t1,d1);
_mm_storeu_si128(bo + i,t1);
cb = _increment_be_aesni(cb);
}