Some cleanup and optimization.

This commit is contained in:
Adam Ierymenko 2020-07-02 10:33:34 -07:00
parent e213317532
commit bd0299f392
No known key found for this signature in database
GPG key ID: C8877CF2D7A5D7F3
3 changed files with 60 additions and 9 deletions

View file

@ -39,6 +39,7 @@
#include <WinSock2.h>
#include <ws2tcpip.h>
#include <Windows.h>
#include <memoryapi.h>
#include <shlwapi.h>
#include <Shlobj.h>
#include <sys/param.h>

View file

@ -122,7 +122,7 @@ static void sha512_process(sha512_state *const md,const uint8_t *in,unsigned lon
inlen -= 128;
} else {
unsigned long n = std::min(inlen,(128 - md->curlen));
memcpy(md->buf + md->curlen,in,n);
Utils::copy(md->buf + md->curlen,in,n);
md->curlen += n;
in += n;
inlen -= n;
@ -179,7 +179,7 @@ void SHA384(void *digest,const void *data,unsigned int len)
sha384_init(&state);
sha512_process(&state,(uint8_t *)data,(unsigned long)len);
sha512_done(&state,tmp);
memcpy(digest,tmp,48);
Utils::copy<48>(digest,tmp);
}
void SHA384(void *digest,const void *data0,unsigned int len0,const void *data1,unsigned int len1)
@ -190,7 +190,7 @@ void SHA384(void *digest,const void *data0,unsigned int len0,const void *data1,u
sha512_process(&state,(uint8_t *)data0,(unsigned long)len0);
sha512_process(&state,(uint8_t *)data1,(unsigned long)len1);
sha512_done(&state,tmp);
memcpy(digest,tmp,48);
Utils::copy<48>(digest,tmp);
}
#endif // !ZT_HAVE_NATIVE_SHA512

View file

@ -55,7 +55,6 @@ namespace Utils {
#define ZT_ROL32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
#ifdef ZT_ARCH_X64
struct CPUIDRegisters
{
CPUIDRegisters() noexcept;
@ -70,7 +69,6 @@ struct CPUIDRegisters
bool sha;
bool fsrm;
};
extern const CPUIDRegisters CPUID;
#endif
@ -104,7 +102,9 @@ extern const uint64_t s_mapNonce;
*/
static ZT_INLINE void memoryLock(const void *const p, const unsigned int l) noexcept
{
	// Best-effort: return values are deliberately ignored, since page locking
	// can fail due to privileges or resource limits and failure is non-fatal.
#ifdef __WINDOWS__
	// VirtualLock() takes a non-const LPVOID and a SIZE_T length; it does not
	// write to the region, so discarding const here is safe.
	VirtualLock(const_cast<void *>(p), (SIZE_T)l);
#else
	mlock(p, l);
#endif
}
@ -117,7 +117,9 @@ static ZT_INLINE void memoryLock(const void *const p, const unsigned int l) noex
*/
static ZT_INLINE void memoryUnlock(const void *const p, const unsigned int l) noexcept
{
	// Best-effort counterpart of memoryLock(); errors are ignored.
#ifdef __WINDOWS__
	// VirtualUnlock() takes a non-const LPVOID and a SIZE_T length; it does
	// not write to the region, so discarding const here is safe.
	VirtualUnlock(const_cast<void *>(p), (SIZE_T)l);
#else
	munlock(p, l);
#endif
}
@ -695,6 +697,23 @@ static ZT_INLINE void storeLittleEndian(void *const p, const I i) noexcept
#endif
}
/*
* Note on copy() and zero():
*
* On X64, rep/movsb and rep/stosb are almost always faster for small memory
* regions on all but the oldest microarchitectures (and even there the
* difference is not large). While more aggressive memcpy() implementations
* may be faster in micro-benchmarks, these fail to account for real world
* context such as instruction cache and pipeline pressure. A simple
* instruction like rep/movsb takes up only a few spots in caches and pipelines
* and requires no branching or function calls. Specialized memcpy() can still
* be faster for large memory regions, but ZeroTier doesn't copy anything
* much larger than 16KiB.
*
* A templated version for statically known sizes is provided since this can
* allow some nice optimizations in some cases.
*/
/**
* Copy memory block whose size is known at compile time.
*
@ -706,13 +725,44 @@ template< unsigned long L >
template< unsigned long L >
static ZT_INLINE void copy(void *dest, const void *src) noexcept
{
#if defined(ZT_ARCH_X64) && defined(__GNUC__)
	// rep/movsb has a tiny code footprint and is fast for the short copies
	// ZeroTier performs. The count/source/dest operands are updated in place.
	uintptr_t l = L;
	// The "memory" clobber tells the compiler that bytes at *dest change and
	// *src is read (the constraints alone only cover the pointer registers);
	// "cc" covers the direction flag modified by cld. Without "memory" the
	// compiler may reorder or cache accesses across this asm.
	asm volatile ("cld ; rep movsb" : "+c"(l), "+S"(src), "+D"(dest) : : "memory", "cc");
#else
	memcpy(dest, src, L);
#endif
}
// Avoid rep/movsb startup time for some small common sizes.
// Avoid rep/movsb startup overhead for a 4 byte copy. memcpy with a
// compile-time-constant size compiles to a single 32-bit load/store,
// without the alignment and strict-aliasing UB of dereferencing arbitrary
// pointers as uint32_t.
template<>
ZT_INLINE void copy<4>(void *dest, const void *src) noexcept
{
	memcpy(dest, src, 4);
}
// Avoid rep/movsb startup overhead for an 8 byte copy. memcpy with a
// compile-time-constant size compiles to a single 64-bit load/store,
// without the alignment and strict-aliasing UB of dereferencing arbitrary
// pointers as uint64_t.
template<>
ZT_INLINE void copy<8>(void *dest, const void *src) noexcept
{
	memcpy(dest, src, 8);
}
// Avoid rep/movsb startup overhead for a 12 byte copy (e.g. 64-bit word plus
// 32-bit word). memcpy with a constant size compiles to two direct moves,
// without the alignment and strict-aliasing UB of casting to uint64_t/uint32_t.
template<>
ZT_INLINE void copy<12>(void *dest, const void *src) noexcept
{
	memcpy(dest, src, 12);
}
// Avoid rep/movsb startup overhead for a 16 byte copy. memcpy with a constant
// size compiles to two 64-bit moves (or one 128-bit move where available),
// without the alignment and strict-aliasing UB of casting to uint64_t.
template<>
ZT_INLINE void copy<16>(void *dest, const void *src) noexcept
{
	memcpy(dest, src, 16);
}
// Avoid rep/movsb startup overhead for a 24 byte copy. memcpy with a constant
// size compiles to three 64-bit moves, without the alignment and
// strict-aliasing UB of casting to uint64_t.
template<>
ZT_INLINE void copy<24>(void *dest, const void *src) noexcept
{
	memcpy(dest, src, 24);
}
/**
* Copy memory block whose size is known at run time
*
@ -739,7 +789,7 @@ template< unsigned long L >
static ZT_INLINE void zero(void *dest) noexcept
{
#if defined(ZT_ARCH_X64) && defined(__GNUC__)
unsigned long l = L;
uintptr_t l = L;
asm volatile ("cld ; rep stosb" :"+c" (l), "+D" (dest) : "a" (0));
#else
memset(dest, 0, L);