Some perf stuff, docs, HELLO design tweaks for ephemeral keys.

This commit is contained in:
Adam Ierymenko 2020-03-27 13:09:37 -07:00
parent 1f2e94a51d
commit 369df245e3
No known key found for this signature in database
GPG key ID: C8877CF2D7A5D7F3
3 changed files with 147 additions and 243 deletions

BIN
doc/2015-GCM-SIV.pdf Normal file

Binary file not shown.

View file

@ -252,17 +252,17 @@
/**
* HELLO exchange meta-data: ephemeral C25519 public key
*/
#define ZT_PROTO_HELLO_NODE_META_EPHEMERAL_KEY_C25519 "e0"
#define ZT_PROTO_HELLO_NODE_META_EPHEMERAL_C25519 "e0"
/**
* HELLO exchange meta-data: ephemeral NIST P-384 public key
*/
#define ZT_PROTO_HELLO_NODE_META_EPHEMERAL_KEY_P384 "e1"
#define ZT_PROTO_HELLO_NODE_META_EPHEMERAL_P384 "e1"
/**
* HELLO exchange meta-data: address(es) of nodes to whom this node will relay
*/
#define ZT_PROTO_HELLO_NODE_META_WILL_RELAY_TO "wr"
#define ZT_PROTO_HELLO_NODE_META_NEIGHBORS "wr"
/**
* HELLO exchange meta-data: X coordinate of your node (sent in OK(HELLO))
@ -304,53 +304,93 @@ enum Verb
/**
* Announcement of a node's existence and vitals:
* <[1] protocol version>
* <[1] software major version>
* <[1] software minor version>
* <[2] software revision>
* <[8] timestamp for determining latency>
* <[1] software major version (LEGACY)>
* <[1] software minor version (LEGACY)>
* <[2] software revision (LEGACY)>
* <[8] timestamp for determining latency (LEGACY)>
* <[...] binary serialized identity>
* <[...] physical destination address of packet>
* [... begin encrypted region ...]
* <[2] 16-bit reserved (legacy) field, always 0>
* <[2] 16-bit length of meta-data dictionary>
* <[...] meta-data dictionary>
* <[2] 16-bit length of any additional fields>
* [... end encrypted region ...]
* <[48] HMAC-SHA384 of packet (with hops field masked to 0)>
* <[...] physical destination address of packet (LEGACY)>
* <[2] 16-bit reserved "encrypted zero" field (LEGACY)>
* <[...] encrypted dictionary>
* <[2] 16-bit length of preceding encrypted dictionary>
* <[48] HMAC-SHA384 of plaintext packet (with hops masked to 0)>
*
* HELLO is sent using the POLY1305_NONE cipher setting (MAC but
* no encryption) and as of protocol version 11 contains an extra
* HMAC-SHA384 MAC for additional authentication hardening.
* HELLO is sent to initiate a new pairing between two nodes.
*
* The physical destination address is the raw InetAddress to which the
* packet was sent, regardless of any relaying used.
* HELLO is the only packet ever sent without normal payload encryption,
* though an inner encrypted envelope exists to obscure all fields that
* do not need to be sent in the clear. HELLO's MAC field contains a
* Poly1305 MAC for backward compatibility, and v2.x adds an additional
* HMAC-SHA384 at the end for stronger authentication of sessions. HELLO
* authentication is performed using the long-lived identity key only,
* and the encryption of the inner dictionary field is done using a key
* derived from this identity key explicitly for this purpose.
*
* HELLO packets have an encrypted section that is encrypted with
* Salsa20/12 using the two peers' long-term negotiated keys and with
* the packet ID (with least significant 3 bits masked to 0 for legacy
* reasons) as the Salsa20/12 IV. This encryption is technically not
* necessary but serves to protect the privacy of locators and other
* fields for a little added defense in depth. Note to auditors: for FIPS
* or other auditing purposes this crypto can be ignored as its
* compromise poses no risk to peer or network authentication or transport
* data privacy. HMAC is computed after this encryption is performed and
* is verified before decryption is performed.
* The main payload of HELLO is the protocol version and the full identity
* of the sender, which includes the sender's public key(s). An encrypted
* dictionary (key/value store) is also included for additional information.
* This is encrypted using AES-CTR with a derived key and using the final
* 96 bits of the packet's HMAC-SHA384 as the CTR IV. (The HMAC authenticates
* the packet prior to this field being encrypted, making this a SIV
* construction much like AES-GMAC-SIV.)
*
* The length of the dictionary field is included immediately after it so
* that it can be decrypted and the HMAC validated without performing any
* parsing of anything else, since it's a good idea to authenticate any
* message as early as possible in any secure protocol.
*
* V1.x will ignore the HMAC and dictionary fields as it doesn't understand
* them, but the packet is constructed so that 1.x nodes will parse what
* they need to communicate with 2.x nodes (without forward secrecy) as long
* as we wish to support this.
*
* Several legacy fields are present as well for the benefit of 1.x nodes.
* These will go away and become simple reserved space once 1.x is no longer
* supported. Some are self-explanatory. The "encrypted zero" is rather
* strange. It's a 16-bit zero value encrypted using Salsa20/12 and the
* long-lived identity key shared by the two peers. It tells 1.x that an
* old encrypted field is no longer there and that it should stop parsing
* the packet at that point.
*
* The following fields are nearly always present and must exist to support
* forward secrecy (in the case of the instance ID, keys, and key revision)
* or federated root membership (in the case of the locator).
*
* TIMESTAMP - node's timestamp in milliseconds (supersedes legacy field)
* INSTANCE_ID - a 64-bit unique value generated on each node start
* EPHEMERAL_C25519 - an ephemeral Curve25519 public key
* EPHEMERAL_P384 - an ephemeral NIST P-384 public key
* EPHEMERAL_REVISION - 64-bit monotonically increasing per-instance counter
* LOCATOR - signed record enumerating this node's trusted contact points
*
* The following optional fields may also be present:
*
* NAME - arbitrary short user-defined name for this node
* CONTACT - arbitrary short contact information string for this node
* NEIGHBORS - addresses of node(s) to whom we'll relay (mesh-like routing)
* LOC_X, LOC_Y, LOC_Z - location relative to the nearest large center of mass
* PEER_LOC_X, PEER_LOC_Y, PEER_LOC_Z - where sender thinks peer is located
* SOFTWARE_VENDOR - short name or description of vendor, such as a URL
* SOFTWARE_VERSION - major, minor, revision, and build, as 16-bit integers
* PHYSICAL_DEST - serialized Endpoint to which this message was sent
* VIRTUAL_DEST - ZeroTier address of first hop (if first hop wasn't destination)
* COMPLIANCE - bit mask containing bits for e.g. a FIPS-compliant node
*
* A valid and successfully authenticated HELLO will generate the following
* OK response which contains much of the same information about the
* responding peer.
* OK response. It contains an echo of the timestamp supplied by the
* initiating peer, the protocol version, and a dictionary containing
* the same information about the responding peer as the originating peer
* sent.
*
* OK payload:
* <[8] timestamp echoed from original HELLO packet>
* <[8] timestamp echoed from original HELLO>
* <[1] protocol version>
* <[1] software major version>
* <[1] software minor version>
* <[2] software revision>
* <[...] physical destination address of packet>
* <[2] 16-bit reserved (legacy) field, currently must be 0>
* <[2] 16-bit length of meta-data dictionary>
* <[...] meta-data dictionary>
* <[2] 16-bit length of any additional fields>
* <[1] software major version (LEGACY)>
* <[1] software minor version (LEGACY)>
* <[2] software revision (LEGACY)>
* <[...] physical destination address of packet (LEGACY)>
* <[2] 16-bit reserved zero field (LEGACY)>
* <[...] dictionary>
* <[48] HMAC-SHA384 of plaintext packet (with hops masked to 0)>
*/
VERB_HELLO = 0x01,

View file

@ -583,99 +583,18 @@ static ZT_INLINE void storeLittleEndian(void *const p,const I i) noexcept
#endif
}
template<unsigned int L>
static ZT_INLINE void copy(void *dest,const void *src) noexcept;
/**
 * Copy exactly 64 bytes from src to dest
 *
 * The whole source block is loaded before anything is stored. The
 * non-SSE path stages the data through a local buffer with memcpy()
 * instead of dereferencing uint64_t pointers, so neither pointer has to
 * be 8-byte aligned and no aliasing rule is broken even when this
 * specialization is called directly.
 *
 * @param dest Destination memory (at least 64 bytes)
 * @param src Source memory (at least 64 bytes)
 */
template<>
ZT_INLINE void copy<64>(void *const dest,const void *const src) noexcept
{
#ifdef ZT_ARCH_X64
	const __m128i *const in = reinterpret_cast<const __m128i *>(src);
	__m128i *const out = reinterpret_cast<__m128i *>(dest);
	const __m128i a = _mm_loadu_si128(in);
	const __m128i b = _mm_loadu_si128(in + 1);
	const __m128i c = _mm_loadu_si128(in + 2);
	const __m128i d = _mm_loadu_si128(in + 3);
	_mm_storeu_si128(out,a);
	_mm_storeu_si128(out + 1,b);
	_mm_storeu_si128(out + 2,c);
	_mm_storeu_si128(out + 3,d);
#else
	// memcpy() with a constant size compiles to plain loads/stores but is
	// well defined for any alignment of src and dest.
	uint64_t tmp[8];
	memcpy(tmp,src,sizeof(tmp));
	memcpy(dest,tmp,sizeof(tmp));
#endif
}
/**
 * Copy exactly 32 bytes from src to dest
 *
 * The whole source block is loaded before anything is stored. The
 * non-SSE path uses memcpy() through a local buffer rather than
 * type-punned uint64_t accesses, so it is safe for any pointer alignment.
 *
 * @param dest Destination memory (at least 32 bytes)
 * @param src Source memory (at least 32 bytes)
 */
template<>
ZT_INLINE void copy<32>(void *const dest,const void *const src) noexcept
{
#ifdef ZT_ARCH_X64
	const __m128i *const in = reinterpret_cast<const __m128i *>(src);
	__m128i *const out = reinterpret_cast<__m128i *>(dest);
	const __m128i a = _mm_loadu_si128(in);
	const __m128i b = _mm_loadu_si128(in + 1);
	_mm_storeu_si128(out,a);
	_mm_storeu_si128(out + 1,b);
#else
	uint64_t tmp[4];
	memcpy(tmp,src,sizeof(tmp));
	memcpy(dest,tmp,sizeof(tmp));
#endif
}
/**
 * Copy exactly 16 bytes from src to dest
 *
 * The non-SSE path uses memcpy() through a local buffer rather than
 * type-punned uint64_t accesses, so it is safe for any pointer alignment.
 *
 * @param dest Destination memory (at least 16 bytes)
 * @param src Source memory (at least 16 bytes)
 */
template<>
ZT_INLINE void copy<16>(void *const dest,const void *const src) noexcept
{
#ifdef ZT_ARCH_X64
	const __m128i block = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
	_mm_storeu_si128(reinterpret_cast<__m128i *>(dest),block);
#else
	uint64_t tmp[2];
	memcpy(tmp,src,sizeof(tmp));
	memcpy(dest,tmp,sizeof(tmp));
#endif
}
/**
 * Copy exactly 8 bytes from src to dest
 *
 * Implemented with memcpy() through a local value instead of a
 * dereferenced uint64_t pointer: same generated code on common compilers,
 * but no alignment requirement and no strict-aliasing violation.
 *
 * @param dest Destination memory (at least 8 bytes)
 * @param src Source memory (at least 8 bytes)
 */
template<>
ZT_INLINE void copy<8>(void *const dest,const void *const src) noexcept
{
	uint64_t tmp;
	memcpy(&tmp,src,sizeof(tmp));
	memcpy(dest,&tmp,sizeof(tmp));
}
/**
 * Copy exactly 4 bytes from src to dest
 *
 * Implemented with memcpy() through a local value instead of a
 * dereferenced uint32_t pointer, so no alignment is required of either
 * pointer and no aliasing rule is violated.
 *
 * @param dest Destination memory (at least 4 bytes)
 * @param src Source memory (at least 4 bytes)
 */
template<>
ZT_INLINE void copy<4>(void *const dest,const void *const src) noexcept
{
	uint32_t tmp;
	memcpy(&tmp,src,sizeof(tmp));
	memcpy(dest,&tmp,sizeof(tmp));
}
/**
 * Copy exactly 2 bytes from src to dest
 *
 * Implemented with memcpy() through a local value instead of a
 * dereferenced uint16_t pointer, so no alignment is required of either
 * pointer and no aliasing rule is violated.
 *
 * @param dest Destination memory (at least 2 bytes)
 * @param src Source memory (at least 2 bytes)
 */
template<>
ZT_INLINE void copy<2>(void *const dest,const void *const src) noexcept
{
	uint16_t tmp;
	memcpy(&tmp,src,sizeof(tmp));
	memcpy(dest,&tmp,sizeof(tmp));
}
/**
 * Copy a single byte from src to dest
 *
 * @param dest Destination byte
 * @param src Source byte
 */
template<>
ZT_INLINE void copy<1>(void *const dest,const void *const src) noexcept
{
	const uint8_t *const from = reinterpret_cast<const uint8_t *>(src);
	uint8_t *const to = reinterpret_cast<uint8_t *>(dest);
	to[0] = from[0];
}
/**
 * Zero-length copy: intentionally does nothing
 *
 * @param dest Destination memory (not accessed)
 * @param src Source memory (not accessed)
 */
template<>
ZT_INLINE void copy<0>(void *const dest,const void *const src) noexcept
{
	// Nothing to copy; reference the parameters only to mark them as used.
	(void)dest;
	(void)src;
}
/**
* Copy memory block whose size is known at compile time
*
* @tparam L Size of memory
* @param dest Destination memory
* @param src Source memory
*/
template<unsigned int L>
static ZT_INLINE void copy(void *const dest,const void *const src) noexcept
{
#ifdef ZT_NO_UNALIGNED_ACCESS
if ((((uintptr_t)dest | (uintptr_t)src) & 7U) != 0) {
if ((((uintptr_t)dest | (uintptr_t)src) & (sizeof(uintptr_t) - 1)) != 0) {
memcpy(dest,src,L);
return;
}
@ -684,154 +603,99 @@ static ZT_INLINE void copy(void *const dest,const void *const src) noexcept
uint8_t *d = reinterpret_cast<uint8_t *>(dest);
const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
#ifdef ZT_ARCH_X64
for(unsigned int i=0;i<(L / 64U);++i) {
copy<64>(d,s);
__m128i x0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s));
__m128i x1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s) + 1);
__m128i x2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s) + 2);
__m128i x3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s) + 3);
_mm_storeu_si128(reinterpret_cast<__m128i *>(d),x0);
_mm_storeu_si128(reinterpret_cast<__m128i *>(d) + 1,x1);
_mm_storeu_si128(reinterpret_cast<__m128i *>(d) + 2,x2);
_mm_storeu_si128(reinterpret_cast<__m128i *>(d) + 3,x3);
d += 64;
s += 64;
}
if ((L & 63U) >= 32U) {
copy<32>(d,s);
__m128i x0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s));
__m128i x1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s) + 1);
_mm_storeu_si128(reinterpret_cast<__m128i *>(d),x0);
_mm_storeu_si128(reinterpret_cast<__m128i *>(d) + 1,x1);
d += 32;
s += 32;
}
if ((L & 31U) >= 16U) {
copy<16>(d,s);
__m128i x0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s));
_mm_storeu_si128(reinterpret_cast<__m128i *>(d),x0);
d += 16;
s += 16;
}
if ((L & 15U) >= 8U) {
copy<8>(d,s);
*reinterpret_cast<uint64_t *>(d) = *reinterpret_cast<const uint64_t *>(s);
d += 8;
s += 8;
}
if ((L & 7U) >= 4U) {
copy<4>(d,s);
*reinterpret_cast<uint32_t *>(d) = *reinterpret_cast<const uint32_t *>(s);
d += 4;
s += 4;
}
if ((L & 3U) >= 2U) {
copy<2>(d,s);
*reinterpret_cast<uint16_t *>(d) = *reinterpret_cast<const uint16_t *>(s);
d += 2;
s += 2;
}
if ((L & 1U) != 0U) {
copy<1>(d,s);
*d = *s;
}
#else
for(unsigned int i=0;i<(L / (sizeof(uintptr_t) * 4));++i) {
uintptr_t x0 = reinterpret_cast<const uintptr_t *>(s)[0];
uintptr_t x1 = reinterpret_cast<const uintptr_t *>(s)[1];
uintptr_t x2 = reinterpret_cast<const uintptr_t *>(s)[2];
uintptr_t x3 = reinterpret_cast<const uintptr_t *>(s)[3];
reinterpret_cast<uintptr_t *>(d)[0] = x0;
reinterpret_cast<uintptr_t *>(d)[1] = x1;
reinterpret_cast<uintptr_t *>(d)[2] = x2;
reinterpret_cast<uintptr_t *>(d)[3] = x3;
s += (sizeof(uintptr_t) * 4);
d += (sizeof(uintptr_t) * 4);
}
for(unsigned int i=0;i<(L & ((sizeof(uintptr_t) * 4) - 1));++i)
d[i] = s[i];
#endif
}
static ZT_INLINE void copy(void *const dest,const void *const src,const unsigned int len) noexcept
/**
* Copy memory block whose size is known at run time
*
* @param dest Destination memory
* @param src Source memory
* @param len Bytes to copy
*/
static ZT_INLINE void copy(void *const dest,const void *const src,unsigned int len) noexcept
{
memcpy(dest,src,len);
}
template<unsigned int L>
static ZT_INLINE void zero(void *dest) noexcept;
/**
 * Zero exactly 64 bytes at dest
 *
 * The non-SSE path uses memset() with a constant size instead of stores
 * through a type-punned uint64_t pointer, so dest may have any alignment.
 *
 * @param dest Memory to zero (at least 64 bytes)
 */
template<>
ZT_INLINE void zero<64>(void *const dest) noexcept
{
#ifdef ZT_ARCH_X64
	__m128i *const out = reinterpret_cast<__m128i *>(dest);
	const __m128i z = _mm_setzero_si128();
	_mm_storeu_si128(out,z);
	_mm_storeu_si128(out + 1,z);
	_mm_storeu_si128(out + 2,z);
	_mm_storeu_si128(out + 3,z);
#else
	memset(dest,0,64);
#endif
}
/**
 * Zero exactly 32 bytes at dest
 *
 * The non-SSE path uses memset() with a constant size instead of stores
 * through a type-punned uint64_t pointer, so dest may have any alignment.
 *
 * @param dest Memory to zero (at least 32 bytes)
 */
template<>
ZT_INLINE void zero<32>(void *const dest) noexcept
{
#ifdef ZT_ARCH_X64
	__m128i *const out = reinterpret_cast<__m128i *>(dest);
	const __m128i z = _mm_setzero_si128();
	_mm_storeu_si128(out,z);
	_mm_storeu_si128(out + 1,z);
#else
	memset(dest,0,32);
#endif
}
/**
 * Zero exactly 16 bytes at dest
 *
 * Uses memset() with a constant size instead of stores through a
 * type-punned uint64_t pointer, so dest may have any alignment.
 *
 * @param dest Memory to zero (at least 16 bytes)
 */
template<>
ZT_INLINE void zero<16>(void *const dest) noexcept
{
	memset(dest,0,16);
}
/**
 * Zero exactly 8 bytes at dest
 *
 * Uses memset() with a constant size instead of a store through a
 * type-punned uint64_t pointer, so dest may have any alignment.
 *
 * @param dest Memory to zero (at least 8 bytes)
 */
template<>
ZT_INLINE void zero<8>(void *const dest) noexcept
{
	memset(dest,0,sizeof(uint64_t));
}
/**
 * Zero exactly 4 bytes at dest
 *
 * Uses memset() with a constant size instead of a store through a
 * type-punned uint32_t pointer, so dest may have any alignment.
 *
 * @param dest Memory to zero (at least 4 bytes)
 */
template<>
ZT_INLINE void zero<4>(void *const dest) noexcept
{
	memset(dest,0,sizeof(uint32_t));
}
/**
 * Zero exactly 2 bytes at dest
 *
 * Uses memset() with a constant size instead of a store through a
 * type-punned uint16_t pointer, so dest may have any alignment.
 *
 * @param dest Memory to zero (at least 2 bytes)
 */
template<>
ZT_INLINE void zero<2>(void *const dest) noexcept
{
	memset(dest,0,sizeof(uint16_t));
}
/**
 * Zero a single byte at dest
 *
 * @param dest Byte to zero
 */
template<>
ZT_INLINE void zero<1>(void *const dest) noexcept
{
	uint8_t *const target = reinterpret_cast<uint8_t *>(dest);
	target[0] = 0;
}
/**
 * Zero-length clear: intentionally does nothing
 *
 * @param dest Memory to zero (not accessed)
 */
template<>
ZT_INLINE void zero<0>(void *const dest) noexcept
{
	// Nothing to clear; reference the parameter only to mark it as used.
	(void)dest;
}
/**
 * Zero memory block whose size is known at compile time
 *
 * This is a thin wrapper around memset(). Because the length is a
 * compile-time constant the compiler is free to expand it inline, and
 * memset() places no alignment requirement on dest, so no separate
 * unaligned-access fallback is needed.
 *
 * @tparam L Size in bytes
 * @param dest Memory to zero (at least L bytes)
 */
template<unsigned int L>
static ZT_INLINE void zero(void *const dest) noexcept
{
	memset(dest,0,L);
}
/**
* Zero memory block whose size is known at run time
*
* @param dest Memory to zero
* @param len Size in bytes
*/
static ZT_INLINE void zero(void *const dest,const unsigned int len) noexcept
{
memset(dest,0,len);