From 0aa78d0ecb4bfeb7cd15c38a42525ddc87df7237 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Fri, 27 Mar 2020 19:03:09 -0700 Subject: [PATCH] Small fixes. --- CMakeLists.txt | 1 + node/OS.hpp | 1 + node/Utils.hpp | 119 ++++++++++++++++++++++++++++--------------------- 3 files changed, 69 insertions(+), 52 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e5d1fd672..18f8b1bf3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,6 +78,7 @@ else(WIN32) -Wall -Wno-deprecated -Wno-unused-function + -Wno-format $<$:-g> $<$:-O0> $<$:-O3> diff --git a/node/OS.hpp b/node/OS.hpp index 13d50068f..de719ee7a 100644 --- a/node/OS.hpp +++ b/node/OS.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #if defined(_WIN32) || defined(_WIN64) #ifdef _MSC_VER diff --git a/node/Utils.hpp b/node/Utils.hpp index 298130530..14f7b2fc2 100644 --- a/node/Utils.hpp +++ b/node/Utils.hpp @@ -593,76 +593,55 @@ static ZT_INLINE void storeLittleEndian(void *const p,const I i) noexcept template static ZT_INLINE void copy(void *const dest,const void *const src) noexcept { -#ifdef ZT_NO_UNALIGNED_ACCESS - if ((((uintptr_t)dest | (uintptr_t)src) & (sizeof(uintptr_t) - 1)) != 0) { - memcpy(dest,src,L); - return; - } -#endif - - uint8_t *d = reinterpret_cast(dest); - const uint8_t *s = reinterpret_cast(src); - #ifdef ZT_ARCH_X64 - for(unsigned int i=0;i<(L / 64U);++i) { + uint8_t *volatile d = reinterpret_cast(dest); + const uint8_t *s = reinterpret_cast(src); + for(unsigned int i=0;i<(L >> 6U);++i) { __m128i x0 = _mm_loadu_si128(reinterpret_cast(s)); - __m128i x1 = _mm_loadu_si128(reinterpret_cast(s) + 1); - __m128i x2 = _mm_loadu_si128(reinterpret_cast(s) + 2); - __m128i x3 = _mm_loadu_si128(reinterpret_cast(s) + 3); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d),x0); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d) + 1,x1); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d) + 2,x2); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d) + 3,x3); - d += 64; + __m128i x1 = _mm_loadu_si128(reinterpret_cast(s + 16)); + __m128i x2 = _mm_loadu_si128(reinterpret_cast(s + 32)); + __m128i x3 = _mm_loadu_si128(reinterpret_cast(s + 48)); s += 64; - } - if ((L & 63U) >= 32U) { - __m128i x0 = _mm_loadu_si128(reinterpret_cast(s)); - __m128i x1 = _mm_loadu_si128(reinterpret_cast(s) + 1); _mm_storeu_si128(reinterpret_cast<__m128i *>(d),x0); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d) + 1,x1); - d += 32; - s += 32; + _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 16),x1); + _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 32),x2); + _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 48),x3); + d += 64; } - if ((L & 31U) >= 16U) { + if ((L & 32U) != 0) { __m128i x0 = _mm_loadu_si128(reinterpret_cast(s)); + __m128i x1 = _mm_loadu_si128(reinterpret_cast(s + 16)); + s += 32; + _mm_storeu_si128(reinterpret_cast<__m128i *>(d),x0); + _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 16),x1); + d += 32; + } + if ((L & 16U) != 0) { + __m128i x0 = _mm_loadu_si128(reinterpret_cast(s)); + s += 16; _mm_storeu_si128(reinterpret_cast<__m128i *>(d),x0); d += 16; - s += 16; } - if ((L & 15U) >= 8U) { - *reinterpret_cast(d) = *reinterpret_cast(s); - d += 8; + if ((L & 8U) != 0) { + *reinterpret_cast(d) = *reinterpret_cast(s); s += 8; + d += 8; } - if ((L & 7U) >= 4U) { - *reinterpret_cast(d) = *reinterpret_cast(s); - d += 4; + if ((L & 4U) != 0) { + *reinterpret_cast(d) = *reinterpret_cast(s); s += 4; + d += 4; } - if ((L & 3U) >= 2U) { - *reinterpret_cast(d) = *reinterpret_cast(s); - d += 2; + if ((L & 2U) != 0) { + *reinterpret_cast(d) = *reinterpret_cast(s); s += 2; + d += 2; } - if ((L & 1U) != 0U) { + if ((L & 1U) != 0) { *d = *s; } #else - for(unsigned int i=0;i<(L / (sizeof(uintptr_t) * 4));++i) { - uintptr_t x0 = reinterpret_cast(s)[0]; - uintptr_t x1 = reinterpret_cast(s)[1]; - uintptr_t x2 = reinterpret_cast(s)[2]; - uintptr_t x3 = reinterpret_cast(s)[3]; - reinterpret_cast(d)[0] = x0; - reinterpret_cast(d)[1] = x1; - reinterpret_cast(d)[2] = x2; - reinterpret_cast(d)[3] = x3; - s += (sizeof(uintptr_t) * 4); - d += (sizeof(uintptr_t) * 4); - } - for(unsigned int i=0;i<(L & ((sizeof(uintptr_t) * 4) - 1));++i) - d[i] = s[i]; + memcpy(dest,src,L); #endif } @@ -687,7 +666,43 @@ static ZT_INLINE void copy(void *const dest,const void *const src,unsigned int l template static ZT_INLINE void zero(void *const dest) noexcept { +#ifdef ZT_ARCH_X64 + uint8_t *volatile d = reinterpret_cast(dest); + __m128i z = _mm_setzero_si128(); + for(unsigned int i=0;i<(L >> 6U);++i) { + _mm_storeu_si128(reinterpret_cast<__m128i *>(d),z); + _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 16),z); + _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 32),z); + _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 48),z); + d += 64; + } + if ((L & 32U) != 0) { + _mm_storeu_si128(reinterpret_cast<__m128i *>(d),z); + _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 16),z); + d += 32; + } + if ((L & 16U) != 0) { + _mm_storeu_si128(reinterpret_cast<__m128i *>(d),z); + d += 16; + } + if ((L & 8U) != 0) { + *reinterpret_cast(d) = 0; + d += 8; + } + if ((L & 4U) != 0) { + *reinterpret_cast(d) = 0; + d += 4; + } + if ((L & 2U) != 0) { + *reinterpret_cast(d) = 0; + d += 2; + } + if ((L & 1U) != 0) { + *d = 0; + } +#else memset(dest,0,L); +#endif } /**