From bd0299f392007337c4baced3c42616e91dee9fd2 Mon Sep 17 00:00:00 2001
From: Adam Ierymenko
Date: Thu, 2 Jul 2020 10:33:34 -0700
Subject: [PATCH] Some cleanup and optimization.

---
 core/OS.hpp     |  1 +
 core/SHA512.cpp |  6 ++---
 core/Utils.hpp  | 62 ++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/core/OS.hpp b/core/OS.hpp
index f7cb1a4a9..952c73508 100644
--- a/core/OS.hpp
+++ b/core/OS.hpp
@@ -39,6 +39,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/core/SHA512.cpp b/core/SHA512.cpp
index f6b8ba8c9..ee28ed8bc 100644
--- a/core/SHA512.cpp
+++ b/core/SHA512.cpp
@@ -122,7 +122,7 @@ static void sha512_process(sha512_state *const md,const uint8_t *in,unsigned lon
             inlen -= 128;
         } else {
             unsigned long n = std::min(inlen,(128 - md->curlen));
-            memcpy(md->buf + md->curlen,in,n);
+            Utils::copy(md->buf + md->curlen,in,n);
             md->curlen += n;
             in += n;
             inlen -= n;
@@ -179,7 +179,7 @@ void SHA384(void *digest,const void *data,unsigned int len)
     sha384_init(&state);
     sha512_process(&state,(uint8_t *)data,(unsigned long)len);
     sha512_done(&state,tmp);
-    memcpy(digest,tmp,48);
+    Utils::copy<48>(digest,tmp);
 }
 
 void SHA384(void *digest,const void *data0,unsigned int len0,const void *data1,unsigned int len1)
@@ -190,7 +190,7 @@ void SHA384(void *digest,const void *data0,unsigned int len0,const void *data1,u
     sha384_init(&state);
     sha512_process(&state,(uint8_t *)data0,(unsigned long)len0);
     sha512_process(&state,(uint8_t *)data1,(unsigned long)len1);
     sha512_done(&state,tmp);
-    memcpy(digest,tmp,48);
+    Utils::copy<48>(digest,tmp);
 }
 #endif // !ZT_HAVE_NATIVE_SHA512
diff --git a/core/Utils.hpp b/core/Utils.hpp
index b7a5d351a..ef5b28970 100644
--- a/core/Utils.hpp
+++ b/core/Utils.hpp
@@ -55,7 +55,6 @@ namespace Utils {
 #define ZT_ROL32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
 
 #ifdef ZT_ARCH_X64
-
 struct CPUIDRegisters
 {
     CPUIDRegisters() noexcept;
@@ -70,7 +69,6 @@ struct CPUIDRegisters
     bool sha;
     bool fsrm;
 };
-
 extern const CPUIDRegisters CPUID;
 
 #endif
@@ -104,7 +102,9 @@ extern const uint64_t s_mapNonce;
 */
 static ZT_INLINE void memoryLock(const void *const p, const unsigned int l) noexcept
 {
-#ifndef __WINDOWS__
+#ifdef __WINDOWS__
+    VirtualLock(p, l);
+#else
     mlock(p, l);
 #endif
 }
@@ -117,7 +117,9 @@ static ZT_INLINE void memoryUnlock(const void *const p, const unsigned int l) noexcept
 {
-#ifndef __WINDOWS__
+#ifdef __WINDOWS__
+    VirtualUnlock(p, l);
+#else
     munlock(p, l);
 #endif
 }
@@ -695,6 +697,23 @@ static ZT_INLINE void storeLittleEndian(void *const p, const I i) noexcept
 #endif
 }
 
+/*
+ * Note on copy() and zero():
+ *
+ * On X64, rep/movsb and rep/stosb are almost always faster for small memory
+ * regions on all but the oldest microarchitectures (and even there the
+ * difference is not large). While more aggressive memcpy() implementations
+ * may be faster in micro-benchmarks, these fail to account for real world
+ * context such as instruction cache and pipeline pressure. A simple
+ * instruction like rep/movsb takes up only a few spots in caches and pipelines
+ * and requires no branching or function calls. Specialized memcpy() can still
+ * be faster for large memory regions, but ZeroTier doesn't copy anything
+ * much larger than 16KiB.
+ *
+ * A templated version for statically known sizes is provided since this can
+ * allow some nice optimizations in some cases.
+ */
+
 /**
  * Copy memory block whose size is known at compile time.
 *
@@ -706,13 +725,44 @@ template< unsigned long L >
 static ZT_INLINE void copy(void *dest, const void *src) noexcept
 {
 #if defined(ZT_ARCH_X64) && defined(__GNUC__)
-    unsigned long l = L;
+    uintptr_t l = L;
     asm volatile ("cld ; rep movsb" : "+c"(l), "+S"(src), "+D"(dest));
 #else
     memcpy(dest, src, L);
 #endif
 }
 
+// Avoid rep/movsb startup time for some small common sizes.
+template<>
+ZT_INLINE void copy<4>(void *dest, const void *src) noexcept
+{
+    *reinterpret_cast<uint32_t *>(dest) = *reinterpret_cast<const uint32_t *>(src);
+}
+template<>
+ZT_INLINE void copy<8>(void *dest, const void *src) noexcept
+{
+    *reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
+}
+template<>
+ZT_INLINE void copy<12>(void *dest, const void *src) noexcept
+{
+    *reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
+    *reinterpret_cast<uint32_t *>(reinterpret_cast<uint8_t *>(dest) + 8) = *reinterpret_cast<const uint32_t *>(reinterpret_cast<const uint8_t *>(src) + 8);
+}
+template<>
+ZT_INLINE void copy<16>(void *dest, const void *src) noexcept
+{
+    *reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
+    *reinterpret_cast<uint64_t *>(reinterpret_cast<uint8_t *>(dest) + 8) = *reinterpret_cast<const uint64_t *>(reinterpret_cast<const uint8_t *>(src) + 8);
+}
+template<>
+ZT_INLINE void copy<24>(void *dest, const void *src) noexcept
+{
+    *reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
+    *reinterpret_cast<uint64_t *>(reinterpret_cast<uint8_t *>(dest) + 8) = *reinterpret_cast<const uint64_t *>(reinterpret_cast<const uint8_t *>(src) + 8);
+    *reinterpret_cast<uint64_t *>(reinterpret_cast<uint8_t *>(dest) + 16) = *reinterpret_cast<const uint64_t *>(reinterpret_cast<const uint8_t *>(src) + 16);
+}
+
 /**
  * Copy memory block whose size is known at run time
  *
@@ -739,7 +789,7 @@ template< unsigned long L >
 static ZT_INLINE void zero(void *dest) noexcept
 {
 #if defined(ZT_ARCH_X64) && defined(__GNUC__)
-    unsigned long l = L;
+    uintptr_t l = L;
     asm volatile ("cld ; rep stosb" :"+c" (l), "+D" (dest) : "a" (0));
 #else
     memset(dest, 0, L);
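
Not part of the patch above: a standalone sketch of what the patched memoryLock()/memoryUnlock() paths boil down to on each platform, useful for checking the Windows branch in isolation. The wrapper names (lock_pages/unlock_pages), the use of the standard _WIN32 macro instead of ZeroTier's __WINDOWS__, and the explicit const_cast are assumptions of this sketch, not code taken from the repository.

// Standalone sketch, assumptions noted above; not part of the patch.
#include <cstddef>
#ifdef _WIN32
#include <windows.h>   // VirtualLock / VirtualUnlock
#else
#include <sys/mman.h>  // mlock / munlock
#endif

static inline void lock_pages(const void *p, size_t l) noexcept
{
#ifdef _WIN32
    // Win32 VirtualLock takes a non-const LPVOID, hence the cast in this sketch.
    VirtualLock(const_cast<void *>(p), l);
#else
    mlock(p, l);
#endif
}

static inline void unlock_pages(const void *p, size_t l) noexcept
{
#ifdef _WIN32
    VirtualUnlock(const_cast<void *>(p), l);
#else
    munlock(p, l);
#endif
}

int main()
{
    static unsigned char secret[64];
    lock_pages(secret, sizeof(secret));    // best-effort; return values ignored here
    // ... use key material ...
    unlock_pages(secret, sizeof(secret));
    return 0;
}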
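Also not part of the patch: a minimal, self-contained sketch of a compile-time-size copy in the same spirit as the Utils::copy<L>() template above, with the rep/movsb path used only on x86-64 GCC/Clang and the memcpy() fallback everywhere else. The name copy_fixed, the __x86_64__ test, and the explicit "memory" clobber are choices of this sketch rather than the patch's exact code.

// Standalone sketch, assumptions noted above; not part of the patch.
#include <cstdint>
#include <cstring>
#include <cstdio>

template< unsigned long L >
static inline void copy_fixed(void *dest, const void *src) noexcept
{
#if defined(__x86_64__) && defined(__GNUC__)
    // rep movsb copies RCX bytes from [RSI] to [RDI]; all three registers are
    // both read and written by the instruction, hence the "+" constraints.
    uintptr_t l = L;
    asm volatile ("cld ; rep movsb" : "+c"(l), "+S"(src), "+D"(dest) : : "memory");
#else
    std::memcpy(dest, src, L);
#endif
}

int main()
{
    const char in[16] = "fifteen chars..";
    char out[16] = {0};
    copy_fixed<16>(out, in);              // size is a compile-time constant
    std::printf("%s\n", out);             // prints the copied string
    return (std::memcmp(in, out, 16) == 0) ? 0 : 1;
}

The constraints mirror the patch's inline asm: RCX carries the byte count while RSI and RDI carry the source and destination pointers, all consumed by rep movsb, which is why they are declared as read-write operands.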