More AES stuff

Adam Ierymenko 2019-08-12 15:37:50 -07:00
parent f5e71f64ed
commit f7bc9f01c9
No known key found for this signature in database
GPG key ID: 1657198823E52A61

@@ -97,6 +97,13 @@ public:
	inline bool gcmDecrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,const uint8_t *tag,unsigned int taglen)
	{
#ifdef ZT_AES_AESNI
		if (HW_ACCEL) {
			uint8_t tagbuf[16];
			_decrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tagbuf,taglen);
			return Utils::secureEq(tagbuf,tag,taglen);
		}
#endif
		abort(); // TODO: software
		return false;
	}
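
For context, a minimal usage sketch of the accelerated decrypt path added above. This is illustrative only and not part of the commit: the ZeroTier::AES constructor taking a 256-bit key and the 16-byte tag length are assumptions; only gcmDecrypt's signature appears in this diff.

// Illustrative sketch; assumes AES is constructed from a 32-byte key and that
// gcmDecrypt() recomputes the GCM tag and compares it via Utils::secureEq().
#include <cstdint>
#include "AES.hpp"

static bool decryptExample(const uint8_t key[32],const uint8_t iv[12],
	const uint8_t *ct,unsigned int ctlen,
	const uint8_t *aad,unsigned int aadlen,
	const uint8_t tag[16],uint8_t *pt)
{
	ZeroTier::AES aes(key); // assumed constructor; expands the key schedule
	return aes.gcmDecrypt(iv,ct,ctlen,aad,aadlen,pt,tag,16); // true only if the tag verifies
}
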
@@ -217,41 +224,41 @@ private:
	{
		__m128i t1,t2,t3,t4,t5,t6;
		y = _swap128_aesni(y);
		t1 = _mm_clmulepi64_si128(h,y,0x00);
		t2 = _mm_clmulepi64_si128(h,y,0x01);
		t3 = _mm_clmulepi64_si128(h,y,0x10);
		t4 = _mm_clmulepi64_si128(h,y,0x11);
		t2 = _mm_xor_si128(t2,t3);
		t3 = _mm_slli_si128(t2,8);
		t2 = _mm_srli_si128(t2,8);
		t1 = _mm_xor_si128(t1,t3);
		t4 = _mm_xor_si128(t4,t2);
		t5 = _mm_srli_epi32(t1,31);
		t1 = _mm_slli_epi32(t1,1);
		t6 = _mm_srli_epi32(t4,31);
		t4 = _mm_slli_epi32(t4,1);
		t3 = _mm_srli_si128(t5,12);
		t6 = _mm_slli_si128(t6,4);
		t5 = _mm_slli_si128(t5,4);
		t1 = _mm_or_si128(t1,t5);
		t4 = _mm_or_si128(t4,t6);
		t4 = _mm_or_si128(t4,t3);
		t5 = _mm_slli_epi32(t1,31);
		t6 = _mm_slli_epi32(t1,30);
		t3 = _mm_slli_epi32(t1,25);
		t5 = _mm_xor_si128(t5,t6);
		t5 = _mm_xor_si128(t5,t3);
		t6 = _mm_srli_si128(t5,4);
		t4 = _mm_xor_si128(t4,t6);
		t5 = _mm_slli_si128(t5,12);
		t1 = _mm_xor_si128(t1,t5);
		t4 = _mm_xor_si128(t4,t1);
		t5 = _mm_srli_epi32(t1,1);
		t2 = _mm_srli_epi32(t1,2);
		t3 = _mm_srli_epi32(t1,7);
		t4 = _mm_xor_si128(t4,t2);
		t4 = _mm_xor_si128(t4,t3);
		t4 = _mm_xor_si128(t4,t5);
		return _swap128_aesni(t4);
	}
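	// Note (not part of this commit): _mult_block_aesni above is the standard
	// PCLMULQDQ GHASH block multiply. On byte-swapped (bit-reflected) operands the
	// four carry-less multiplies form the 256-bit product of h and y, and the
	// shift/XOR tail reduces it modulo the GCM polynomial, so the result is
	// y*h in GF(2^128) mod x^128 + x^7 + x^2 + x + 1.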
	static inline __m128i _mult4xor_aesni(__m128i h1,__m128i h2,__m128i h3,__m128i h4,__m128i d1,__m128i d2,__m128i d3,__m128i d4)
@@ -261,75 +268,75 @@ private:
		d2 = _swap128_aesni(d2);
		d3 = _swap128_aesni(d3);
		d4 = _swap128_aesni(d4);
		t0 = _mm_clmulepi64_si128(h1,d1,0x00);
		t1 = _mm_clmulepi64_si128(h2,d2,0x00);
		t2 = _mm_clmulepi64_si128(h3,d3,0x00);
		t3 = _mm_clmulepi64_si128(h4,d4,0x00);
		t8 = _mm_xor_si128(t0,t1);
		t8 = _mm_xor_si128(t8,t2);
		t8 = _mm_xor_si128(t8,t3);
		t4 = _mm_clmulepi64_si128(h1,d1,0x11);
		t5 = _mm_clmulepi64_si128(h2,d2,0x11);
		t6 = _mm_clmulepi64_si128(h3,d3,0x11);
		t7 = _mm_clmulepi64_si128(h4,d4,0x11);
		t9 = _mm_xor_si128(t4,t5);
		t9 = _mm_xor_si128(t9,t6);
		t9 = _mm_xor_si128(t9,t7);
		t0 = _mm_shuffle_epi32(h1,78);
		t4 = _mm_shuffle_epi32(d1,78);
		t0 = _mm_xor_si128(t0,h1);
		t4 = _mm_xor_si128(t4,d1);
		t1 = _mm_shuffle_epi32(h2,78);
		t5 = _mm_shuffle_epi32(d2,78);
		t1 = _mm_xor_si128(t1,h2);
		t5 = _mm_xor_si128(t5,d2);
		t2 = _mm_shuffle_epi32(h3,78);
		t6 = _mm_shuffle_epi32(d3,78);
		t2 = _mm_xor_si128(t2,h3);
		t6 = _mm_xor_si128(t6,d3);
		t3 = _mm_shuffle_epi32(h4,78);
		t7 = _mm_shuffle_epi32(d4,78);
		t3 = _mm_xor_si128(t3,h4);
		t7 = _mm_xor_si128(t7,d4);
		t0 = _mm_clmulepi64_si128(t0,t4,0x00);
		t1 = _mm_clmulepi64_si128(t1,t5,0x00);
		t2 = _mm_clmulepi64_si128(t2,t6,0x00);
		t3 = _mm_clmulepi64_si128(t3,t7,0x00);
		t0 = _mm_xor_si128(t0,t8);
		t0 = _mm_xor_si128(t0,t9);
		t0 = _mm_xor_si128(t1,t0);
		t0 = _mm_xor_si128(t2,t0);
		t0 = _mm_xor_si128(t3,t0);
		t4 = _mm_slli_si128(t0,8);
		t0 = _mm_srli_si128(t0,8);
		t3 = _mm_xor_si128(t4,t8);
		t6 = _mm_xor_si128(t0,t9);
		t7 = _mm_srli_epi32(t3,31);
		t8 = _mm_srli_epi32(t6,31);
		t3 = _mm_slli_epi32(t3,1);
		t6 = _mm_slli_epi32(t6,1);
		t9 = _mm_srli_si128(t7,12);
		t8 = _mm_slli_si128(t8,4);
		t7 = _mm_slli_si128(t7,4);
		t3 = _mm_or_si128(t3,t7);
		t6 = _mm_or_si128(t6,t8);
		t6 = _mm_or_si128(t6,t9);
		t7 = _mm_slli_epi32(t3,31);
		t8 = _mm_slli_epi32(t3,30);
		t9 = _mm_slli_epi32(t3,25);
		t7 = _mm_xor_si128(t7,t8);
		t7 = _mm_xor_si128(t7,t9);
		t8 = _mm_srli_si128(t7,4);
		t7 = _mm_slli_si128(t7,12);
		t3 = _mm_xor_si128(t3,t7);
		t2 = _mm_srli_epi32(t3,1);
		t4 = _mm_srli_epi32(t3,2);
		t5 = _mm_srli_epi32(t3,7);
		t2 = _mm_xor_si128(t2,t4);
		t2 = _mm_xor_si128(t2,t5);
		t2 = _mm_xor_si128(t2,t8);
		t3 = _mm_xor_si128(t3,t2);
		t6 = _mm_xor_si128(t6,t3);
		return _swap128_aesni(t6);
	}
	static inline __m128i _ghash_aesni(__m128i h,__m128i y,__m128i x) { return _mult_block_aesni(h,_mm_xor_si128(y,x)); }
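	// Note (not part of this commit): this one-liner is the per-block GHASH update
	// used throughout the AES-NI paths below, i.e. Y' = (Y ^ X) * H in GF(2^128),
	// where X is the next 16-byte block folded into the authentication state Y.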
@@ -375,13 +382,13 @@ private:
			d3 = _mm_loadu_si128(ab + i + 2);
			d4 = _mm_loadu_si128(ab + i + 3);
			y = _mm_xor_si128(y, d1);
			y = _mult4xor_aesni(h1,h2,h3,h4,y,d2,d3,d4);
		}
		for (i = pblocks; i < blocks; i++)
			y = _ghash_aesni(_k.ni.h,y,_mm_loadu_si128(ab + i));
		if (rem) {
			last = _mm_setzero_si128();
			memcpy(&last,ab + blocks,rem);
			y = _ghash_aesni(_k.ni.h,y,last);
		}
		return y;
@@ -395,7 +402,7 @@ private:
	}
	inline void _icv_crypt_aesni(__m128i y,__m128i j,uint8_t *icv,unsigned int icvsize) const
	{
		__m128i t,b;
		t = _mm_xor_si128(j,_k.ni.k[0]);
		t = _mm_aesenc_si128(t,_k.ni.k[1]);
		t = _mm_aesenc_si128(t,_k.ni.k[2]);
@@ -418,7 +425,7 @@ private:
	inline __m128i _encrypt_gcm_rem_aesni(unsigned int rem,const void *in,void *out,__m128i cb,__m128i y) const
	{
		__m128i t,b;
		memset(&b,0,sizeof(b));
		memcpy(&b,in,rem);
		t = _mm_xor_si128(cb,_k.ni.k[0]);
@@ -436,15 +443,15 @@ private:
		t = _mm_aesenc_si128(t,_k.ni.k[12]);
		t = _mm_aesenc_si128(t,_k.ni.k[13]);
		t = _mm_aesenclast_si128(t,_k.ni.k[14]);
		b = _mm_xor_si128(t,b);
		memcpy(out,&b,rem);
		memset((u_char*)&b + rem,0,16 - rem);
		return _ghash_aesni(_k.ni.h,y,b);
	}
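	// Note (not part of this commit), illustrating the partial-block path above:
	// for a 21-byte message the bulk loops in _encrypt_gcm256_aesni consume the
	// first 16-byte block and this helper is called with rem = 5. It encrypts one
	// more counter block, XORs only the first 5 keystream bytes into the output,
	// zero-pads those 5 ciphertext bytes to a full block, and folds that block
	// into the GHASH accumulator y.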
	inline void _encrypt_gcm256_aesni(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize) const
	{
		__m128i d1,d2,d3,d4,t1,t2,t3,t4,k;
		__m128i y,j,cb,*bi,*bo;
		j = _create_j_aesni(iv);
		cb = _increment_be_aesni(j);
@@ -461,102 +468,102 @@ private:
			d2 = _mm_loadu_si128(bi + i + 1);
			d3 = _mm_loadu_si128(bi + i + 2);
			d4 = _mm_loadu_si128(bi + i + 3);
			t1 = _mm_xor_si128(cb,k = _k.ni.k[0]);
			cb = _increment_be_aesni(cb);
			t2 = _mm_xor_si128(cb,k);
			cb = _increment_be_aesni(cb);
			t3 = _mm_xor_si128(cb,k);
			cb = _increment_be_aesni(cb);
			t4 = _mm_xor_si128(cb,k);
			cb = _increment_be_aesni(cb);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[1]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[2]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[3]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[4]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[5]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[6]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[7]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[8]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[9]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[10]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[11]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[12]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[13]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenclast_si128(t1,k = _k.ni.k[14]);
			t2 = _mm_aesenclast_si128(t2,k);
			t3 = _mm_aesenclast_si128(t3,k);
			t4 = _mm_aesenclast_si128(t4,k);
			t1 = _mm_xor_si128(t1,d1);
			t2 = _mm_xor_si128(t2,d2);
			t3 = _mm_xor_si128(t3,d3);
			t4 = _mm_xor_si128(t4,d4);
			y = _mm_xor_si128(y,t1);
			y = _mult4xor_aesni(_k.ni.hhhh,_k.ni.hhh,_k.ni.hh,_k.ni.h,y,t2,t3,t4);
			_mm_storeu_si128(bo + i + 0,t1);
			_mm_storeu_si128(bo + i + 1,t2);
			_mm_storeu_si128(bo + i + 2,t3);
			_mm_storeu_si128(bo + i + 3,t4);
		}
		for (i=pblocks;i<blocks;++i) {
			d1 = _mm_loadu_si128(bi + i);
			t1 = _mm_xor_si128(cb,_k.ni.k[0]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[1]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[2]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[3]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[4]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[5]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[6]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[7]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[8]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[9]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[10]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[11]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[12]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[13]);
			t1 = _mm_aesenclast_si128(t1,_k.ni.k[14]);
			t1 = _mm_xor_si128(t1,d1);
			_mm_storeu_si128(bo + i,t1);
			y = _ghash_aesni(_k.ni.h,y,t1);
			cb = _increment_be_aesni(cb);
		}
@@ -567,10 +574,10 @@ private:
	}
	inline __m128i _decrypt_gcm_rem_aesni(unsigned int rem,const void *in,void *out,__m128i cb,__m128i y)
	{
		__m128i t,b;
		memset(&b,0,sizeof(b));
		memcpy(&b,in,rem);
		y = _ghash_aesni(_k.ni.h,y,b);
		t = _mm_xor_si128(cb,_k.ni.k[0]);
		t = _mm_aesenc_si128(t,_k.ni.k[1]);
		t = _mm_aesenc_si128(t,_k.ni.k[2]);
@@ -585,16 +592,16 @@ private:
		t = _mm_aesenc_si128(t,_k.ni.k[11]);
		t = _mm_aesenc_si128(t,_k.ni.k[12]);
		t = _mm_aesenc_si128(t,_k.ni.k[13]);
		t = _mm_aesenclast_si128(t,_k.ni.k[14]);
		b = _mm_xor_si128(t,b);
		memcpy(out,&b,rem);
		return y;
	}
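	// Note (not part of this commit): unlike the encrypt-side helper above, the
	// trailing ciphertext bytes here are folded into the GHASH accumulator before
	// being XORed with the keystream, because GCM authenticates ciphertext; the
	// bulk loops of _decrypt_gcm256_aesni below follow the same ordering.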
	inline void _decrypt_gcm256_aesni(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize)
	{
		__m128i d1,d2,d3,d4,t1,t2,t3,t4,k;
		__m128i y,j,cb,*bi,*bo;
		unsigned int blocks,pblocks,rem;
		j = _create_j_aesni(iv);
		cb = _increment_be_aesni(j);
@@ -611,102 +618,102 @@ private:
			d2 = _mm_loadu_si128(bi + i + 1);
			d3 = _mm_loadu_si128(bi + i + 2);
			d4 = _mm_loadu_si128(bi + i + 3);
			y = _mm_xor_si128(y,d1);
			y = _mult4xor_aesni(_k.ni.hhhh,_k.ni.hhh,_k.ni.hh,_k.ni.h,y,d2,d3,d4);
			t1 = _mm_xor_si128(cb,k = _k.ni.k[0]);
			cb = _increment_be_aesni(cb);
			t2 = _mm_xor_si128(cb,k);
			cb = _increment_be_aesni(cb);
			t3 = _mm_xor_si128(cb,k);
			cb = _increment_be_aesni(cb);
			t4 = _mm_xor_si128(cb,k);
			cb = _increment_be_aesni(cb);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[1]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[2]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[3]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[4]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[5]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[6]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[7]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[8]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[9]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[10]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[11]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[12]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[13]);
			t2 = _mm_aesenc_si128(t2,k);
			t3 = _mm_aesenc_si128(t3,k);
			t4 = _mm_aesenc_si128(t4,k);
			t1 = _mm_aesenclast_si128(t1,k = _k.ni.k[14]);
			t2 = _mm_aesenclast_si128(t2,k);
			t3 = _mm_aesenclast_si128(t3,k);
			t4 = _mm_aesenclast_si128(t4,k);
			t1 = _mm_xor_si128(t1,d1);
			t2 = _mm_xor_si128(t2,d2);
			t3 = _mm_xor_si128(t3,d3);
			t4 = _mm_xor_si128(t4,d4);
			_mm_storeu_si128(bo + i + 0,t1);
			_mm_storeu_si128(bo + i + 1,t2);
			_mm_storeu_si128(bo + i + 2,t3);
			_mm_storeu_si128(bo + i + 3,t4);
		}
		for (i=pblocks;i<blocks;i++) {
			d1 = _mm_loadu_si128(bi + i);
			y = _ghash_aesni(_k.ni.h,y,d1);
			t1 = _mm_xor_si128(cb,_k.ni.k[0]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[1]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[2]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[3]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[4]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[5]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[6]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[7]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[8]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[9]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[10]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[11]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[12]);
			t1 = _mm_aesenc_si128(t1,_k.ni.k[13]);
			t1 = _mm_aesenclast_si128(t1,_k.ni.k[14]);
			t1 = _mm_xor_si128(t1,d1);
			_mm_storeu_si128(bo + i,t1);
			cb = _increment_be_aesni(cb);
		}