Merge pull request #1569 from zerotier/dev-echo-rate-gate

Rate gate ECHO per Path instead of per Peer

In multipath scenarios user traffic is used to judge the aliveness of a path. If the user traffic is too infrequent to establish aliveness for a given time window (say 500 ms), the bonding layer will send extra ECHOs at a maximum rate of failoverInterval / 3 (or ~ 166 ms) per path. This patch relaxes the rate-limiting of ECHOs significantly in order to prevent a non-multipath node from dropping ECHOs causing multipath nodes to erroneously judge paths to that node to be dead.
Details

This patch decreases the rate limiting from 1000 ms per peer by a factor of 6 to ~166 ms and rate limits ECHOs per Path instead of per Peer. This allows rate limiting to scale with the number of established paths to a peer.

As a result, if all 64 path slots are used a total of 64 x 6 = 384 ECHOs per second will be allowed in the most aggressive case where failoverInterval is set to 500 ms.
This commit is contained in:
Joseph Henry 2022-02-25 11:23:42 -08:00 committed by GitHub
commit 84705aafc7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 20 additions and 17 deletions

View file

@ -571,13 +571,13 @@
* Anything below this value gets into thrashing territory since we divide
* this value by ZT_BOND_ECHOS_PER_FAILOVER_INTERVAL to send ECHOs often.
*/
#define ZT_BOND_FAILOVER_MIN_INTERVAL 250
#define ZT_BOND_FAILOVER_MIN_INTERVAL 500
/**
* How many times per failover interval that an ECHO is sent. This should be
* at least 2. Anything more then 4 starts to increase overhead significantly.
*/
#define ZT_BOND_ECHOS_PER_FAILOVER_INTERVAL 4
#define ZT_BOND_ECHOS_PER_FAILOVER_INTERVAL 3
/**
* A defensive timer to prevent path quality metrics from being

View file

@ -860,7 +860,7 @@ bool IncomingPacket::_doEXT_FRAME(const RuntimeEnvironment *RR,void *tPtr,const
bool IncomingPacket::_doECHO(const RuntimeEnvironment *RR,void *tPtr,const SharedPtr<Peer> &peer)
{
uint64_t now = RR->node->now();
if (!peer->rateGateEchoRequest(now)) {
if (!_path->rateGateEchoRequest(now)) {
return true;
}

View file

@ -83,6 +83,7 @@ public:
_lastOut(0),
_lastIn(0),
_lastTrustEstablishedPacketReceived(0),
_lastEchoRequestReceived(0),
_localSocket(-1),
_latency(0xffff),
_addr(),
@ -93,6 +94,7 @@ public:
_lastOut(0),
_lastIn(0),
_lastTrustEstablishedPacketReceived(0),
_lastEchoRequestReceived(0),
_localSocket(localSocket),
_latency(0xffff),
_addr(addr),
@ -266,6 +268,18 @@ public:
*/
inline int64_t lastTrustEstablishedPacketReceived() const { return _lastTrustEstablishedPacketReceived; }
/**
* Rate limit gate for inbound ECHO requests
*/
inline bool rateGateEchoRequest(const int64_t now)
{
if ((now - _lastEchoRequestReceived) >= (ZT_PEER_GENERAL_RATE_LIMIT / 20)) {
_lastEchoRequestReceived = now;
return true;
}
return false;
}
void *_bondingMetricPtr;
private:
@ -273,6 +287,9 @@ private:
volatile int64_t _lastOut;
volatile int64_t _lastIn;
volatile int64_t _lastTrustEstablishedPacketReceived;
int64_t _lastEchoRequestReceived;
int64_t _localSocket;
volatile unsigned int _latency;
InetAddress _addr;

View file

@ -34,7 +34,6 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident
_lastTriedMemorizedPath(0),
_lastDirectPathPushSent(0),
_lastDirectPathPushReceive(0),
_lastEchoRequestReceived(0),
_lastCredentialRequestSent(0),
_lastWhoisRequestReceived(0),
_lastCredentialsReceived(0),

View file

@ -418,18 +418,6 @@ public:
return false;
}
/**
* Rate limit gate for inbound ECHO requests
*/
inline bool rateGateEchoRequest(const int64_t now)
{
if ((now - _lastEchoRequestReceived) >= ZT_PEER_GENERAL_RATE_LIMIT) {
_lastEchoRequestReceived = now;
return true;
}
return false;
}
/**
* Serialize a peer for storage in local cache
*
@ -546,7 +534,6 @@ private:
int64_t _lastTriedMemorizedPath;
int64_t _lastDirectPathPushSent;
int64_t _lastDirectPathPushReceive;
int64_t _lastEchoRequestReceived;
int64_t _lastCredentialRequestSent;
int64_t _lastWhoisRequestReceived;
int64_t _lastCredentialsReceived;