mirror of
https://github.com/zerotier/ZeroTierOne.git
synced 2025-06-05 20:13:44 +02:00
Some cleanup and optimization.
This commit is contained in:
parent
e213317532
commit
bd0299f392
3 changed files with 60 additions and 9 deletions
|
@ -39,6 +39,7 @@
|
|||
#include <WinSock2.h>
|
||||
#include <ws2tcpip.h>
|
||||
#include <Windows.h>
|
||||
#include <memoryapi.h>
|
||||
#include <shlwapi.h>
|
||||
#include <Shlobj.h>
|
||||
#include <sys/param.h>
|
||||
|
|
|
@ -122,7 +122,7 @@ static void sha512_process(sha512_state *const md,const uint8_t *in,unsigned lon
|
|||
inlen -= 128;
|
||||
} else {
|
||||
unsigned long n = std::min(inlen,(128 - md->curlen));
|
||||
memcpy(md->buf + md->curlen,in,n);
|
||||
Utils::copy(md->buf + md->curlen,in,n);
|
||||
md->curlen += n;
|
||||
in += n;
|
||||
inlen -= n;
|
||||
|
@ -179,7 +179,7 @@ void SHA384(void *digest,const void *data,unsigned int len)
|
|||
sha384_init(&state);
|
||||
sha512_process(&state,(uint8_t *)data,(unsigned long)len);
|
||||
sha512_done(&state,tmp);
|
||||
memcpy(digest,tmp,48);
|
||||
Utils::copy<48>(digest,tmp);
|
||||
}
|
||||
|
||||
void SHA384(void *digest,const void *data0,unsigned int len0,const void *data1,unsigned int len1)
|
||||
|
@ -190,7 +190,7 @@ void SHA384(void *digest,const void *data0,unsigned int len0,const void *data1,u
|
|||
sha512_process(&state,(uint8_t *)data0,(unsigned long)len0);
|
||||
sha512_process(&state,(uint8_t *)data1,(unsigned long)len1);
|
||||
sha512_done(&state,tmp);
|
||||
memcpy(digest,tmp,48);
|
||||
Utils::copy<48>(digest,tmp);
|
||||
}
|
||||
|
||||
#endif // !ZT_HAVE_NATIVE_SHA512
|
||||
|
|
|
@ -55,7 +55,6 @@ namespace Utils {
|
|||
#define ZT_ROL32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
|
||||
|
||||
#ifdef ZT_ARCH_X64
|
||||
|
||||
struct CPUIDRegisters
|
||||
{
|
||||
CPUIDRegisters() noexcept;
|
||||
|
@ -70,7 +69,6 @@ struct CPUIDRegisters
|
|||
bool sha;
|
||||
bool fsrm;
|
||||
};
|
||||
|
||||
extern const CPUIDRegisters CPUID;
|
||||
#endif
|
||||
|
||||
|
@ -104,7 +102,9 @@ extern const uint64_t s_mapNonce;
|
|||
*/
|
||||
static ZT_INLINE void memoryLock(const void *const p, const unsigned int l) noexcept
{
	// Asks the OS to lock the l bytes starting at p using the platform's
	// page-locking call. The call's result is ignored, so callers get no
	// indication of whether the lock actually succeeded.
#ifdef __WINDOWS__
	// VirtualLock() is declared with a non-const LPVOID, so the const
	// qualifier has to be cast away to satisfy its signature.
	VirtualLock(const_cast<void *>(p), l);
#else
	mlock(p, l);
#endif
}
|
||||
|
@ -117,7 +117,9 @@ static ZT_INLINE void memoryLock(const void *const p, const unsigned int l) noex
|
|||
*/
|
||||
static ZT_INLINE void memoryUnlock(const void *const p, const unsigned int l) noexcept
{
	// Asks the OS to unlock the l bytes starting at p using the platform's
	// page-unlocking call. The call's result is ignored.
#ifdef __WINDOWS__
	// VirtualUnlock() is declared with a non-const LPVOID, so the const
	// qualifier has to be cast away to satisfy its signature.
	VirtualUnlock(const_cast<void *>(p), l);
#else
	munlock(p, l);
#endif
}
|
||||
|
@ -695,6 +697,23 @@ static ZT_INLINE void storeLittleEndian(void *const p, const I i) noexcept
|
|||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Note on copy() and zero():
|
||||
*
|
||||
* On X64, rep/movsb and rep/stosb are almost always faster for small memory
|
||||
* regions on all but the oldest microarchitectures (and even there the
|
||||
* difference is not large). While more aggressive memcpy() implementations
|
||||
* may be faster in micro-benchmarks, those benchmarks fail to account for real-world
|
||||
* context such as instruction cache and pipeline pressure. A simple
|
||||
* instruction like rep/movsb takes up only a few spots in caches and pipelines
|
||||
* and requires no branching or function calls. Specialized memcpy() can still
|
||||
* be faster for large memory regions, but ZeroTier doesn't copy anything
|
||||
* much larger than 16KiB.
|
||||
*
|
||||
* A templated version for statically known sizes is provided since this can
|
||||
* allow some nice optimizations in some cases.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Copy memory block whose size is known at compile time.
|
||||
*
|
||||
|
@ -706,13 +725,44 @@ template< unsigned long L >
|
|||
static ZT_INLINE void copy(void *dest, const void *src) noexcept
{
	// Copies exactly L bytes from src to dest, where L is the enclosing
	// template's compile-time size parameter.
#if defined(ZT_ARCH_X64) && defined(__GNUC__)
	// The count lives in RCX, which is pointer-sized on x64.
	uintptr_t l = L;
	// rep movsb reads *src and writes *dest, neither of which is listed as a
	// memory operand, so a "memory" clobber is needed to keep the compiler
	// from caching or reordering accesses to those buffers around the asm.
	asm volatile ("cld ; rep movsb" : "+c"(l), "+S"(src), "+D"(dest) : : "memory");
#else
	memcpy(dest, src, L);
#endif
}
|
||||
|
||||
// Avoid rep/movsb startup time for some small common sizes.
|
||||
template<>
|
||||
ZT_INLINE void copy<4>(void *dest, const void *src) noexcept
|
||||
{
|
||||
*reinterpret_cast<uint32_t *>(dest) = *reinterpret_cast<const uint32_t *>(src);
|
||||
}
|
||||
template<>
|
||||
ZT_INLINE void copy<8>(void *dest, const void *src) noexcept
|
||||
{
|
||||
*reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
|
||||
}
|
||||
template<>
|
||||
ZT_INLINE void copy<12>(void *dest, const void *src) noexcept
|
||||
{
|
||||
*reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
|
||||
*reinterpret_cast<uint32_t *>(reinterpret_cast<uint8_t *>(dest) + 8) = *reinterpret_cast<const uint32_t *>(reinterpret_cast<const uint8_t *>(src) + 8);
|
||||
}
|
||||
template<>
|
||||
ZT_INLINE void copy<16>(void *dest, const void *src) noexcept
|
||||
{
|
||||
*reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
|
||||
*reinterpret_cast<uint64_t *>(reinterpret_cast<uint8_t *>(dest) + 8) = *reinterpret_cast<const uint64_t *>(reinterpret_cast<const uint8_t *>(src) + 8);
|
||||
}
|
||||
template<>
|
||||
ZT_INLINE void copy<24>(void *dest, const void *src) noexcept
|
||||
{
|
||||
*reinterpret_cast<uint64_t *>(dest) = *reinterpret_cast<const uint64_t *>(src);
|
||||
*reinterpret_cast<uint64_t *>(reinterpret_cast<uint8_t *>(dest) + 8) = *reinterpret_cast<const uint64_t *>(reinterpret_cast<const uint8_t *>(src) + 8);
|
||||
*reinterpret_cast<uint64_t *>(reinterpret_cast<uint8_t *>(dest) + 16) = *reinterpret_cast<const uint64_t *>(reinterpret_cast<const uint8_t *>(src) + 16);
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy memory block whose size is known at run time
|
||||
*
|
||||
|
@ -739,7 +789,7 @@ template< unsigned long L >
|
|||
static ZT_INLINE void zero(void *dest) noexcept
|
||||
{
|
||||
#if defined(ZT_ARCH_X64) && defined(__GNUC__)
|
||||
unsigned long l = L;
|
||||
uintptr_t l = L;
|
||||
asm volatile ("cld ; rep stosb" :"+c" (l), "+D" (dest) : "a" (0));
|
||||
#else
|
||||
memset(dest, 0, L);
|
||||
|
|
Loading…
Add table
Reference in a new issue