From f7bc9f01c988fdc064972d5863fbe19820161567 Mon Sep 17 00:00:00 2001
From: Adam Ierymenko
Date: Mon, 12 Aug 2019 15:37:50 -0700
Subject: [PATCH] More AES stuff

---
 node/AES.hpp | 597 ++++++++++++++++++++++++++-------------------------
 1 file changed, 302 insertions(+), 295 deletions(-)

diff --git a/node/AES.hpp b/node/AES.hpp
index b80e337dd..4436fbd15 100644
--- a/node/AES.hpp
+++ b/node/AES.hpp
@@ -97,6 +97,13 @@ public:
 
 	inline bool gcmDecrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,const uint8_t *tag,unsigned int taglen)
 	{
+#ifdef ZT_AES_AESNI
+		if (HW_ACCEL) {
+			uint8_t tagbuf[16];
+			_decrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tagbuf,taglen);
+			return Utils::secureEq(tagbuf,tag,taglen);
+		}
+#endif
 		abort(); // TODO: software
 		return false;
 	}
@@ -217,41 +224,41 @@ private:
 	{
 		__m128i t1,t2,t3,t4,t5,t6;
 		y = _swap128_aesni(y);
-		t1 = _mm_clmulepi64_si128(h, y, 0x00);
-		t2 = _mm_clmulepi64_si128(h, y, 0x01);
-		t3 = _mm_clmulepi64_si128(h, y, 0x10);
-		t4 = _mm_clmulepi64_si128(h, y, 0x11);
-		t2 = _mm_xor_si128(t2, t3);
-		t3 = _mm_slli_si128(t2, 8);
-		t2 = _mm_srli_si128(t2, 8);
-		t1 = _mm_xor_si128(t1, t3);
-		t4 = _mm_xor_si128(t4, t2);
-		t5 = _mm_srli_epi32(t1, 31);
-		t1 = _mm_slli_epi32(t1, 1);
-		t6 = _mm_srli_epi32(t4, 31);
-		t4 = _mm_slli_epi32(t4, 1);
-		t3 = _mm_srli_si128(t5, 12);
-		t6 = _mm_slli_si128(t6, 4);
-		t5 = _mm_slli_si128(t5, 4);
-		t1 = _mm_or_si128(t1, t5);
-		t4 = _mm_or_si128(t4, t6);
-		t4 = _mm_or_si128(t4, t3);
-		t5 = _mm_slli_epi32(t1, 31);
-		t6 = _mm_slli_epi32(t1, 30);
-		t3 = _mm_slli_epi32(t1, 25);
-		t5 = _mm_xor_si128(t5, t6);
-		t5 = _mm_xor_si128(t5, t3);
-		t6 = _mm_srli_si128(t5, 4);
-		t4 = _mm_xor_si128(t4, t6);
-		t5 = _mm_slli_si128(t5, 12);
-		t1 = _mm_xor_si128(t1, t5);
-		t4 = _mm_xor_si128(t4, t1);
-		t5 = _mm_srli_epi32(t1, 1);
-		t2 = _mm_srli_epi32(t1, 2);
-		t3 = _mm_srli_epi32(t1, 7);
-		t4 = _mm_xor_si128(t4, t2);
-		t4 = _mm_xor_si128(t4, t3);
-		t4 = _mm_xor_si128(t4, t5);
+		t1 = _mm_clmulepi64_si128(h,y,0x00);
+		t2 = _mm_clmulepi64_si128(h,y,0x01);
+		t3 = _mm_clmulepi64_si128(h,y,0x10);
+		t4 = _mm_clmulepi64_si128(h,y,0x11);
+		t2 = _mm_xor_si128(t2,t3);
+		t3 = _mm_slli_si128(t2,8);
+		t2 = _mm_srli_si128(t2,8);
+		t1 = _mm_xor_si128(t1,t3);
+		t4 = _mm_xor_si128(t4,t2);
+		t5 = _mm_srli_epi32(t1,31);
+		t1 = _mm_slli_epi32(t1,1);
+		t6 = _mm_srli_epi32(t4,31);
+		t4 = _mm_slli_epi32(t4,1);
+		t3 = _mm_srli_si128(t5,12);
+		t6 = _mm_slli_si128(t6,4);
+		t5 = _mm_slli_si128(t5,4);
+		t1 = _mm_or_si128(t1,t5);
+		t4 = _mm_or_si128(t4,t6);
+		t4 = _mm_or_si128(t4,t3);
+		t5 = _mm_slli_epi32(t1,31);
+		t6 = _mm_slli_epi32(t1,30);
+		t3 = _mm_slli_epi32(t1,25);
+		t5 = _mm_xor_si128(t5,t6);
+		t5 = _mm_xor_si128(t5,t3);
+		t6 = _mm_srli_si128(t5,4);
+		t4 = _mm_xor_si128(t4,t6);
+		t5 = _mm_slli_si128(t5,12);
+		t1 = _mm_xor_si128(t1,t5);
+		t4 = _mm_xor_si128(t4,t1);
+		t5 = _mm_srli_epi32(t1,1);
+		t2 = _mm_srli_epi32(t1,2);
+		t3 = _mm_srli_epi32(t1,7);
+		t4 = _mm_xor_si128(t4,t2);
+		t4 = _mm_xor_si128(t4,t3);
+		t4 = _mm_xor_si128(t4,t5);
 		return _swap128_aesni(t4);
 	}
 	static inline __m128i _mult4xor_aesni(__m128i h1,__m128i h2,__m128i h3,__m128i h4,__m128i d1,__m128i d2,__m128i d3,__m128i d4)
@@ -261,75 +268,75 @@ private:
 		d2 = _swap128_aesni(d2);
 		d3 = _swap128_aesni(d3);
 		d4 = _swap128_aesni(d4);
-		t0 = _mm_clmulepi64_si128(h1, d1, 0x00);
-		t1 = _mm_clmulepi64_si128(h2, d2, 0x00);
-		t2 = _mm_clmulepi64_si128(h3, d3, 0x00);
-		t3 = _mm_clmulepi64_si128(h4, d4, 0x00);
-		t8 = _mm_xor_si128(t0, t1);
-		t8 = _mm_xor_si128(t8, t2);
-		t8 = _mm_xor_si128(t8, t3);
-		t4 = _mm_clmulepi64_si128(h1, d1, 0x11);
-		t5 = _mm_clmulepi64_si128(h2, d2, 0x11);
-		t6 = _mm_clmulepi64_si128(h3, d3, 0x11);
-		t7 = _mm_clmulepi64_si128(h4, d4, 0x11);
-		t9 = _mm_xor_si128(t4, t5);
-		t9 = _mm_xor_si128(t9, t6);
-		t9 = _mm_xor_si128(t9, t7);
-		t0 = _mm_shuffle_epi32(h1, 78);
-		t4 = _mm_shuffle_epi32(d1, 78);
-		t0 = _mm_xor_si128(t0, h1);
-		t4 = _mm_xor_si128(t4, d1);
-		t1 = _mm_shuffle_epi32(h2, 78);
-		t5 = _mm_shuffle_epi32(d2, 78);
-		t1 = _mm_xor_si128(t1, h2);
-		t5 = _mm_xor_si128(t5, d2);
-		t2 = _mm_shuffle_epi32(h3, 78);
-		t6 = _mm_shuffle_epi32(d3, 78);
-		t2 = _mm_xor_si128(t2, h3);
-		t6 = _mm_xor_si128(t6, d3);
-		t3 = _mm_shuffle_epi32(h4, 78);
-		t7 = _mm_shuffle_epi32(d4, 78);
-		t3 = _mm_xor_si128(t3, h4);
-		t7 = _mm_xor_si128(t7, d4);
-		t0 = _mm_clmulepi64_si128(t0, t4, 0x00);
-		t1 = _mm_clmulepi64_si128(t1, t5, 0x00);
-		t2 = _mm_clmulepi64_si128(t2, t6, 0x00);
-		t3 = _mm_clmulepi64_si128(t3, t7, 0x00);
-		t0 = _mm_xor_si128(t0, t8);
-		t0 = _mm_xor_si128(t0, t9);
-		t0 = _mm_xor_si128(t1, t0);
-		t0 = _mm_xor_si128(t2, t0);
-		t0 = _mm_xor_si128(t3, t0);
-		t4 = _mm_slli_si128(t0, 8);
-		t0 = _mm_srli_si128(t0, 8);
-		t3 = _mm_xor_si128(t4, t8);
-		t6 = _mm_xor_si128(t0, t9);
-		t7 = _mm_srli_epi32(t3, 31);
-		t8 = _mm_srli_epi32(t6, 31);
-		t3 = _mm_slli_epi32(t3, 1);
-		t6 = _mm_slli_epi32(t6, 1);
-		t9 = _mm_srli_si128(t7, 12);
-		t8 = _mm_slli_si128(t8, 4);
-		t7 = _mm_slli_si128(t7, 4);
-		t3 = _mm_or_si128(t3, t7);
-		t6 = _mm_or_si128(t6, t8);
-		t6 = _mm_or_si128(t6, t9);
-		t7 = _mm_slli_epi32(t3, 31);
-		t8 = _mm_slli_epi32(t3, 30);
-		t9 = _mm_slli_epi32(t3, 25);
-		t7 = _mm_xor_si128(t7, t8);
-		t7 = _mm_xor_si128(t7, t9);
-		t8 = _mm_srli_si128(t7, 4);
-		t7 = _mm_slli_si128(t7, 12);
-		t3 = _mm_xor_si128(t3, t7);
-		t2 = _mm_srli_epi32(t3, 1);
-		t4 = _mm_srli_epi32(t3, 2);
-		t5 = _mm_srli_epi32(t3, 7);
-		t2 = _mm_xor_si128(t2, t4);
-		t2 = _mm_xor_si128(t2, t5);
-		t2 = _mm_xor_si128(t2, t8);
-		t3 = _mm_xor_si128(t3, t2);
-		t6 = _mm_xor_si128(t6, t3);
+		t0 = _mm_clmulepi64_si128(h1,d1,0x00);
+		t1 = _mm_clmulepi64_si128(h2,d2,0x00);
+		t2 = _mm_clmulepi64_si128(h3,d3,0x00);
+		t3 = _mm_clmulepi64_si128(h4,d4,0x00);
+		t8 = _mm_xor_si128(t0,t1);
+		t8 = _mm_xor_si128(t8,t2);
+		t8 = _mm_xor_si128(t8,t3);
+		t4 = _mm_clmulepi64_si128(h1,d1,0x11);
+		t5 = _mm_clmulepi64_si128(h2,d2,0x11);
+		t6 = _mm_clmulepi64_si128(h3,d3,0x11);
+		t7 = _mm_clmulepi64_si128(h4,d4,0x11);
+		t9 = _mm_xor_si128(t4,t5);
+		t9 = _mm_xor_si128(t9,t6);
+		t9 = _mm_xor_si128(t9,t7);
+		t0 = _mm_shuffle_epi32(h1,78);
+		t4 = _mm_shuffle_epi32(d1,78);
+		t0 = _mm_xor_si128(t0,h1);
+		t4 = _mm_xor_si128(t4,d1);
+		t1 = _mm_shuffle_epi32(h2,78);
+		t5 = _mm_shuffle_epi32(d2,78);
+		t1 = _mm_xor_si128(t1,h2);
+		t5 = _mm_xor_si128(t5,d2);
+		t2 = _mm_shuffle_epi32(h3,78);
+		t6 = _mm_shuffle_epi32(d3,78);
+		t2 = _mm_xor_si128(t2,h3);
+		t6 = _mm_xor_si128(t6,d3);
+		t3 = _mm_shuffle_epi32(h4,78);
+		t7 = _mm_shuffle_epi32(d4,78);
+		t3 = _mm_xor_si128(t3,h4);
+		t7 = _mm_xor_si128(t7,d4);
+		t0 = _mm_clmulepi64_si128(t0,t4,0x00);
+		t1 = _mm_clmulepi64_si128(t1,t5,0x00);
+		t2 = _mm_clmulepi64_si128(t2,t6,0x00);
+		t3 = _mm_clmulepi64_si128(t3,t7,0x00);
+		t0 = _mm_xor_si128(t0,t8);
+		t0 = _mm_xor_si128(t0,t9);
+		t0 = _mm_xor_si128(t1,t0);
+		t0 = _mm_xor_si128(t2,t0);
+		t0 = _mm_xor_si128(t3,t0);
+		t4 = _mm_slli_si128(t0,8);
+		t0 = _mm_srli_si128(t0,8);
+		t3 = _mm_xor_si128(t4,t8);
+		t6 = _mm_xor_si128(t0,t9);
+		t7 = _mm_srli_epi32(t3,31);
+		t8 = _mm_srli_epi32(t6,31);
+		t3 = _mm_slli_epi32(t3,1);
+		t6 = _mm_slli_epi32(t6,1);
+		t9 = _mm_srli_si128(t7,12);
+		t8 = _mm_slli_si128(t8,4);
+		t7 = _mm_slli_si128(t7,4);
+		t3 = _mm_or_si128(t3,t7);
+		t6 = _mm_or_si128(t6,t8);
+		t6 = _mm_or_si128(t6,t9);
+		t7 = _mm_slli_epi32(t3,31);
+		t8 = _mm_slli_epi32(t3,30);
+		t9 = _mm_slli_epi32(t3,25);
+		t7 = _mm_xor_si128(t7,t8);
+		t7 = _mm_xor_si128(t7,t9);
+		t8 = _mm_srli_si128(t7,4);
+		t7 = _mm_slli_si128(t7,12);
+		t3 = _mm_xor_si128(t3,t7);
+		t2 = _mm_srli_epi32(t3,1);
+		t4 = _mm_srli_epi32(t3,2);
+		t5 = _mm_srli_epi32(t3,7);
+		t2 = _mm_xor_si128(t2,t4);
+		t2 = _mm_xor_si128(t2,t5);
+		t2 = _mm_xor_si128(t2,t8);
+		t3 = _mm_xor_si128(t3,t2);
+		t6 = _mm_xor_si128(t6,t3);
 		return _swap128_aesni(t6);
 	}
 	static inline __m128i _ghash_aesni(__m128i h,__m128i y,__m128i x) { return _mult_block_aesni(h,_mm_xor_si128(y,x)); }
@@ -375,13 +382,13 @@ private:
 			d3 = _mm_loadu_si128(ab + i + 2);
 			d4 = _mm_loadu_si128(ab + i + 3);
 			y = _mm_xor_si128(y, d1);
-			y = _mult4xor_aesni(h1, h2, h3, h4, y, d2, d3, d4);
+			y = _mult4xor_aesni(h1,h2,h3,h4,y,d2,d3,d4);
 		}
 		for (i = pblocks; i < blocks; i++) y = _ghash_aesni(_k.ni.h,y,_mm_loadu_si128(ab + i));
 		if (rem) {
 			last = _mm_setzero_si128();
-			memcpy(&last, ab + blocks, rem);
+			memcpy(&last,ab + blocks,rem);
 			y = _ghash_aesni(_k.ni.h,y,last);
 		}
 		return y;
 	}
@@ -395,7 +402,7 @@
 	}
 	inline void _icv_crypt_aesni(__m128i y,__m128i j,uint8_t *icv,unsigned int icvsize) const
 	{
-		__m128i *ks,t,b;
+		__m128i t,b;
 		t = _mm_xor_si128(j,_k.ni.k[0]);
 		t = _mm_aesenc_si128(t,_k.ni.k[1]);
 		t = _mm_aesenc_si128(t,_k.ni.k[2]);
@@ -418,7 +425,7 @@
 
 	inline __m128i _encrypt_gcm_rem_aesni(unsigned int rem,const void *in,void *out,__m128i cb,__m128i y) const
 	{
-		__m128i *ks,t,b;
+		__m128i t,b;
 		memset(&b,0,sizeof(b));
 		memcpy(&b,in,rem);
 		t = _mm_xor_si128(cb,_k.ni.k[0]);
@@ -436,15 +443,15 @@
 		t = _mm_aesenc_si128(t,_k.ni.k[12]);
 		t = _mm_aesenc_si128(t,_k.ni.k[13]);
 		t = _mm_aesenclast_si128(t,_k.ni.k[14]);
-		b = _mm_xor_si128(t, b);
+		b = _mm_xor_si128(t,b);
 		memcpy(out,&b,rem);
 		memset((u_char*)&b + rem,0,16 - rem);
 		return _ghash_aesni(_k.ni.h,y,b);
 	}
 
 	inline void _encrypt_gcm256_aesni(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize) const
 	{
-		__m128i d1,d2,d3,d4,t1,t2,t3,t4;
-		__m128i *ks,y,j,cb,*bi,*bo;
+		__m128i d1,d2,d3,d4,t1,t2,t3,t4,k;
+		__m128i y,j,cb,*bi,*bo;
 		j = _create_j_aesni(iv);
 		cb = _increment_be_aesni(j);
@@ -461,102 +468,102 @@
 			d2 = _mm_loadu_si128(bi + i + 1);
 			d3 = _mm_loadu_si128(bi + i + 2);
 			d4 = _mm_loadu_si128(bi + i + 3);
-			t1 = _mm_xor_si128(cb, _k.ni.k[0]);
+			t1 = _mm_xor_si128(cb,k = _k.ni.k[0]);
 			cb = _increment_be_aesni(cb);
-			t2 = _mm_xor_si128(cb, _k.ni.k[0]);
+			t2 = _mm_xor_si128(cb,k);
 			cb = _increment_be_aesni(cb);
-			t3 = _mm_xor_si128(cb, _k.ni.k[0]);
+			t3 = _mm_xor_si128(cb,k);
 			cb = _increment_be_aesni(cb);
-			t4 = _mm_xor_si128(cb, _k.ni.k[0]);
+			t4 = _mm_xor_si128(cb,k);
 			cb = _increment_be_aesni(cb);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[1]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[1]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[1]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[1]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[2]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[2]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[2]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[2]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[3]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[3]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[3]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[3]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[4]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[4]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[4]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[4]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[5]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[5]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[5]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[5]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[6]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[6]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[6]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[6]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[7]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[7]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[7]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[7]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[8]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[8]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[8]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[8]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[9]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[9]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[9]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[9]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[10]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[10]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[10]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[10]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[11]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[11]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[11]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[11]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[12]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[12]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[12]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[12]);
-			t1 = _mm_aesenc_si128(t1, _k.ni.k[13]);
-			t2 = _mm_aesenc_si128(t2, _k.ni.k[13]);
-			t3 = _mm_aesenc_si128(t3, _k.ni.k[13]);
-			t4 = _mm_aesenc_si128(t4, _k.ni.k[13]);
-			t1 = _mm_aesenclast_si128(t1, _k.ni.k[14]);
-			t2 = _mm_aesenclast_si128(t2, _k.ni.k[14]);
-			t3 = _mm_aesenclast_si128(t3, _k.ni.k[14]);
-			t4 = _mm_aesenclast_si128(t4, _k.ni.k[14]);
-			t1 = _mm_xor_si128(t1, d1);
-			t2 = _mm_xor_si128(t2, d2);
-			t3 = _mm_xor_si128(t3, d3);
-			t4 = _mm_xor_si128(t4, d4);
-			y = _mm_xor_si128(y, t1);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[1]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[2]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[3]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[4]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[5]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[6]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[7]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[8]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[9]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[10]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[11]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[12]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[13]);
+			t2 = _mm_aesenc_si128(t2,k);
+			t3 = _mm_aesenc_si128(t3,k);
+			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenclast_si128(t1,k = _k.ni.k[14]);
+			t2 = _mm_aesenclast_si128(t2,k);
+			t3 = _mm_aesenclast_si128(t3,k);
+			t4 = _mm_aesenclast_si128(t4,k);
+			t1 = _mm_xor_si128(t1,d1);
+			t2 = _mm_xor_si128(t2,d2);
+			t3 = _mm_xor_si128(t3,d3);
+			t4 = _mm_xor_si128(t4,d4);
+			y = _mm_xor_si128(y,t1);
 			y = _mult4xor_aesni(_k.ni.hhhh,_k.ni.hhh,_k.ni.hh,_k.ni.h,y,t2,t3,t4);
-			_mm_storeu_si128(bo + i + 0, t1);
-			_mm_storeu_si128(bo + i + 1, t2);
-			_mm_storeu_si128(bo + i + 2, t3);
-			_mm_storeu_si128(bo + i + 3, t4);
+			_mm_storeu_si128(bo + i + 0,t1);
+			_mm_storeu_si128(bo + i + 1,t2);
+			_mm_storeu_si128(bo + i + 2,t3);
+			_mm_storeu_si128(bo + i + 3,t4);
 		}
 		for (i=pblocks;i