Peer metrics (#1995)

* Adding peer metrics

still needs to be wired up for use

* per-peer packet metrics

* Fix crash from bad instantiation of histogram

* separate alive & dead path counts

* Add peer metric update block

* add peer latency values in doPingAndKeepalive

* prevent deadlock

* peer latency histogram actually works now

* cleanup

* capture counts of packets to specific peers

---------

Co-authored-by: Joseph Henry <joseph.henry@zerotier.com>
This commit is contained in:
Grant Limberg 2023-05-04 07:58:02 -07:00 committed by GitHub
parent 925599cab0
commit 74dc41c7c7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 102 additions and 53 deletions

View file

@ -28,15 +28,9 @@ namespace prometheus {
/// a data race. /// a data race.
template <typename Value_ = uint64_t> template <typename Value_ = uint64_t>
class Histogram : public Metric { class Histogram : public Metric {
using BucketBoundaries = std::vector<Value_>;
const BucketBoundaries bucket_boundaries_;
std::vector<Counter<Value_>> bucket_counts_;
Gauge<Value_> sum_;
public: public:
using Value = Value_; using Value = Value_;
using BucketBoundaries = std::vector<Value_>;
using Family = CustomFamily<Histogram<Value>>; using Family = CustomFamily<Histogram<Value>>;
static const Metric::Type static_type = Metric::Type::Histogram; static const Metric::Type static_type = Metric::Type::Histogram;
@ -69,7 +63,7 @@ namespace prometheus {
bucket_boundaries_.begin(), bucket_boundaries_.begin(),
std::find_if( std::find_if(
std::begin(bucket_boundaries_), std::end(bucket_boundaries_), std::begin(bucket_boundaries_), std::end(bucket_boundaries_),
[value](const double boundary) { return boundary >= value; }))); [value](const Value boundary) { return boundary >= value; })));
sum_.Increment(value); sum_.Increment(value);
bucket_counts_[bucket_index].Increment(); bucket_counts_[bucket_index].Increment();
} }
@ -110,7 +104,7 @@ namespace prometheus {
bucket.cumulative_count = cumulative_count; bucket.cumulative_count = cumulative_count;
bucket.upper_bound = (i == bucket_boundaries_.size() bucket.upper_bound = (i == bucket_boundaries_.size()
? std::numeric_limits<double>::infinity() ? std::numeric_limits<double>::infinity()
: bucket_boundaries_[i]); : static_cast<double>(bucket_boundaries_[i]));
metric.histogram.bucket.push_back(std::move(bucket)); metric.histogram.bucket.push_back(std::move(bucket));
} }
metric.histogram.sample_count = cumulative_count; metric.histogram.sample_count = cumulative_count;
@ -119,6 +113,12 @@ namespace prometheus {
return metric; return metric;
} }
private:
const BucketBoundaries bucket_boundaries_;
std::vector<Counter<Value_>> bucket_counts_;
Gauge<Value_> sum_;
}; };
/// \brief Return a builder to configure and register a Histogram metric. /// \brief Return a builder to configure and register a Histogram metric.

View file

@ -176,6 +176,22 @@ namespace ZeroTier {
prometheus::simpleapi::counter_family_t network_outgoing_packets prometheus::simpleapi::counter_family_t network_outgoing_packets
{ "zt_network_outgoing_packets", "number of outgoing packets per network" }; { "zt_network_outgoing_packets", "number of outgoing packets per network" };
// PeerMetrics
prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency =
prometheus::Builder<prometheus::Histogram<uint64_t>>()
.Name("zt_peer_latency")
.Help("peer latency (ms)")
.Register(prometheus::simpleapi::registry);
prometheus::simpleapi::gauge_family_t peer_path_count
{ "zt_peer_path_count", "number of paths to peer" };
prometheus::simpleapi::counter_family_t peer_incoming_packets
{ "zt_peer_incoming_packets", "number of incoming packets from a peer" };
prometheus::simpleapi::counter_family_t peer_outgoing_packets
{ "zt_peer_outgoing_packets", "number of outgoing packets to a peer" };
prometheus::simpleapi::counter_family_t peer_packet_errors
{ "zt_peer_packet_errors" , "number of incoming packet errors from a peer" };
// General Controller Metrics // General Controller Metrics
prometheus::simpleapi::gauge_metric_t network_count prometheus::simpleapi::gauge_metric_t network_count
{"controller_network_count", "number of networks the controller is serving"}; {"controller_network_count", "number of networks the controller is serving"};

View file

@ -107,6 +107,13 @@ namespace ZeroTier {
extern prometheus::simpleapi::counter_family_t network_incoming_packets; extern prometheus::simpleapi::counter_family_t network_incoming_packets;
extern prometheus::simpleapi::counter_family_t network_outgoing_packets; extern prometheus::simpleapi::counter_family_t network_outgoing_packets;
// Peer Metrics
extern prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency;
extern prometheus::simpleapi::gauge_family_t peer_path_count;
extern prometheus::simpleapi::counter_family_t peer_incoming_packets;
extern prometheus::simpleapi::counter_family_t peer_outgoing_packets;
extern prometheus::simpleapi::counter_family_t peer_packet_errors;
// General Controller Metrics // General Controller Metrics
extern prometheus::simpleapi::gauge_metric_t network_count; extern prometheus::simpleapi::gauge_metric_t network_count;
extern prometheus::simpleapi::gauge_metric_t member_count; extern prometheus::simpleapi::gauge_metric_t member_count;

View file

@ -28,11 +28,6 @@ namespace ZeroTier {
static unsigned char s_freeRandomByteCounter = 0; static unsigned char s_freeRandomByteCounter = 0;
char * peerIDString(const Identity &id) {
char out[16];
return id.address().toString(out);
}
Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Identity &peerIdentity) : Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Identity &peerIdentity) :
RR(renv), RR(renv),
_lastReceive(0), _lastReceive(0),
@ -55,7 +50,13 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident
_directPathPushCutoffCount(0), _directPathPushCutoffCount(0),
_echoRequestCutoffCount(0), _echoRequestCutoffCount(0),
_localMultipathSupported(false), _localMultipathSupported(false),
_lastComputedAggregateMeanLatency(0) _lastComputedAggregateMeanLatency(0),
_peer_latency{Metrics::peer_latency.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}}, std::vector<uint64_t>{1,3,6,10,30,60,100,300,600,1000})},
_alive_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","alive"}})},
_dead_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","dead"}})},
_incoming_packet{Metrics::peer_incoming_packets.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})},
_outgoing_packet{Metrics::peer_outgoing_packets.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})},
_packet_errors{Metrics::peer_packet_errors.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
{ {
if (!myIdentity.agree(peerIdentity,_key)) { if (!myIdentity.agree(peerIdentity,_key)) {
throw ZT_EXCEPTION_INVALID_ARGUMENT; throw ZT_EXCEPTION_INVALID_ARGUMENT;
@ -96,7 +97,7 @@ void Peer::received(
default: default:
break; break;
} }
_incoming_packet++;
recordIncomingPacket(path, packetId, payloadLength, verb, flowId, now); recordIncomingPacket(path, packetId, payloadLength, verb, flowId, now);
if (trustEstablished) { if (trustEstablished) {
@ -519,54 +520,70 @@ void Peer::performMultipathStateCheck(void *tPtr, int64_t now)
unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now) unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
{ {
unsigned int sent = 0; unsigned int sent = 0;
Mutex::Lock _l(_paths_m); {
Mutex::Lock _l(_paths_m);
performMultipathStateCheck(tPtr, now); performMultipathStateCheck(tPtr, now);
const bool sendFullHello = ((now - _lastSentFullHello) >= ZT_PEER_PING_PERIOD); const bool sendFullHello = ((now - _lastSentFullHello) >= ZT_PEER_PING_PERIOD);
if (sendFullHello) { if (sendFullHello) {
_lastSentFullHello = now; _lastSentFullHello = now;
}
// Right now we only keep pinging links that have the maximum priority. The
// priority is used to track cluster redirections, meaning that when a cluster
// redirects us its redirect target links override all other links and we
// let those old links expire.
long maxPriority = 0;
for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p) {
maxPriority = std::max(_paths[i].priority,maxPriority);
} else {
break;
} }
}
bool deletionOccurred = false; // Right now we only keep pinging links that have the maximum priority. The
for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) { // priority is used to track cluster redirections, meaning that when a cluster
if (_paths[i].p) { // redirects us its redirect target links override all other links and we
// Clean expired and reduced priority paths // let those old links expire.
if ( ((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION) && (_paths[i].priority == maxPriority) ) { long maxPriority = 0;
if ((sendFullHello)||(_paths[i].p->needsHeartbeat(now))) { for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
attemptToContactAt(tPtr,_paths[i].p->localSocket(),_paths[i].p->address(),now,sendFullHello); if (_paths[i].p) {
_paths[i].p->sent(now); maxPriority = std::max(_paths[i].priority,maxPriority);
sent |= (_paths[i].p->address().ss_family == AF_INET) ? 0x1 : 0x2;
}
} else { } else {
_paths[i] = _PeerPath(); break;
deletionOccurred = true;
} }
} }
if (!_paths[i].p || deletionOccurred) {
for(unsigned int j=i;j<ZT_MAX_PEER_NETWORK_PATHS;++j) { bool deletionOccurred = false;
if (_paths[j].p && i != j) { for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
_paths[i] = _paths[j]; if (_paths[i].p) {
_paths[j] = _PeerPath(); // Clean expired and reduced priority paths
break; if ( ((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION) && (_paths[i].priority == maxPriority) ) {
if ((sendFullHello)||(_paths[i].p->needsHeartbeat(now))) {
attemptToContactAt(tPtr,_paths[i].p->localSocket(),_paths[i].p->address(),now,sendFullHello);
_paths[i].p->sent(now);
sent |= (_paths[i].p->address().ss_family == AF_INET) ? 0x1 : 0x2;
}
} else {
_paths[i] = _PeerPath();
deletionOccurred = true;
} }
} }
deletionOccurred = false; if (!_paths[i].p || deletionOccurred) {
for(unsigned int j=i;j<ZT_MAX_PEER_NETWORK_PATHS;++j) {
if (_paths[j].p && i != j) {
_paths[i] = _paths[j];
_paths[j] = _PeerPath();
break;
}
}
deletionOccurred = false;
}
} }
uint16_t alive_path_count_tmp = 0, dead_path_count_tmp = 0;
for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p) {
if (_paths[i].p->alive(now)) {
alive_path_count_tmp++;
}
else {
dead_path_count_tmp++;
}
}
}
_alive_path_count = alive_path_count_tmp;
_dead_path_count = dead_path_count_tmp;
} }
_peer_latency.Observe(latency(now));
return sent; return sent;
} }
@ -641,6 +658,7 @@ void Peer::resetWithinScope(void *tPtr,InetAddress::IpScope scope,int inetAddres
void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId, void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId,
uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now) uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now)
{ {
_outgoing_packet++;
if (_localMultipathSupported && _bond) { if (_localMultipathSupported && _bond) {
_bond->recordOutgoingPacket(path, packetId, payloadLength, verb, flowId, now); _bond->recordOutgoingPacket(path, packetId, payloadLength, verb, flowId, now);
} }
@ -648,6 +666,7 @@ void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t pack
void Peer::recordIncomingInvalidPacket(const SharedPtr<Path>& path) void Peer::recordIncomingInvalidPacket(const SharedPtr<Path>& path)
{ {
_packet_errors++;
if (_localMultipathSupported && _bond) { if (_localMultipathSupported && _bond) {
_bond->recordIncomingInvalidPacket(path); _bond->recordIncomingInvalidPacket(path);
} }

View file

@ -598,6 +598,13 @@ private:
int32_t _lastComputedAggregateMeanLatency; int32_t _lastComputedAggregateMeanLatency;
SharedPtr<Bond> _bond; SharedPtr<Bond> _bond;
prometheus::Histogram<uint64_t> &_peer_latency;
prometheus::simpleapi::gauge_metric_t _alive_path_count;
prometheus::simpleapi::gauge_metric_t _dead_path_count;
prometheus::simpleapi::counter_metric_t _incoming_packet;
prometheus::simpleapi::counter_metric_t _outgoing_packet;
prometheus::simpleapi::counter_metric_t _packet_errors;
}; };
} // namespace ZeroTier } // namespace ZeroTier