Metrics consolidation (#1997)

* Rename zt_packet_incoming -> zt_packet

Also consolidate zt_peer_incoming_packets and zt_peer_outgoing_packets into a single zt_peer_packets metric with tx and rx direction labels. Same for the zt_tcp_data_* and zt_udp_data_* metrics.

* Further collapse tcp & udp into metric labels for zt_data

* Fix zt_data metric description

* zt_peer_packets description fix

* Consolidate incoming/outgoing network packets to a single metric

* zt_packet_incoming_error -> zt_packet_error

* Disable peer metrics for central controllers

This can change in the future if needed, but given the traffic our controllers serve, per-peer metrics would be a *lot* of data.

* Disable peer metrics for controllers pt 2
This commit is contained in:
Grant Limberg 2023-05-04 11:12:55 -07:00 committed by GitHub
parent 74dc41c7c7
commit 00d55fc4b4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 75 additions and 60 deletions

View file

@ -311,7 +311,7 @@ endif
ifeq ($(ZT_CONTROLLER),1)
override CXXFLAGS+=-Wall -Wno-deprecated -std=c++17 -pthread $(INCLUDES) -DNDEBUG $(DEFS)
override LDLIBS+=-Lext/libpqxx-7.7.3/install/ubuntu22.04/lib -lpqxx -lpq ext/hiredis-1.0.2/lib/ubuntu22.04/libhiredis.a ext/redis-plus-plus-1.3.3/install/ubuntu22.04/lib/libredis++.a -lssl -lcrypto
override DEFS+=-DZT_CONTROLLER_USE_LIBPQ
override DEFS+=-DZT_CONTROLLER_USE_LIBPQ -DZT_NO_PEER_METRICS
override INCLUDES+=-I/usr/include/postgresql -Iext/libpqxx-7.7.3/install/ubuntu22.04/include -Iext/hiredis-1.0.2/include/ -Iext/redis-plus-plus-1.3.3/install/ubuntu22.04/include/sw/
endif

View file

@ -25,7 +25,7 @@ namespace ZeroTier {
namespace Metrics {
// Packet Type Counts
prometheus::simpleapi::counter_family_t packets
{ "zt_packet_incoming", "incoming packet type counts"};
{ "zt_packet", "incoming packet type counts"};
// Incoming packets
prometheus::simpleapi::counter_metric_t pkt_nop_in
@ -118,7 +118,7 @@ namespace ZeroTier {
// Packet Error Counts
prometheus::simpleapi::counter_family_t packet_errors
{ "zt_packet_incoming_error", "incoming packet errors"};
{ "zt_packet_error", "incoming packet errors"};
// Incoming Error Counts
prometheus::simpleapi::counter_metric_t pkt_error_obj_not_found_in
@ -157,25 +157,26 @@ namespace ZeroTier {
{ packet_errors.Add({{"error_type", "internal_server_error"}, {"direction", "tx"}}) };
// Data Sent/Received Metrics
prometheus::simpleapi::counter_metric_t udp_send
{ "zt_udp_data_sent", "number of bytes ZeroTier has sent via UDP" };
prometheus::simpleapi::counter_family_t data
{ "zt_data", "number of bytes ZeroTier has transmitted or received" };
prometheus::simpleapi::counter_metric_t udp_recv
{ "zt_udp_data_recv", "number of bytes ZeroTier has received via UDP" };
{ data.Add({{"protocol","udp"},{"direction","rx"}}) };
prometheus::simpleapi::counter_metric_t udp_send
{ data.Add({{"protocol","udp"},{"direction","tx"}}) };
prometheus::simpleapi::counter_metric_t tcp_send
{ "zt_tcp_data_sent", "number of bytes ZeroTier has sent via TCP" };
{ data.Add({{"protocol","tcp"},{"direction", "tx"}}) };
prometheus::simpleapi::counter_metric_t tcp_recv
{ "zt_tcp_data_recv", "number of bytes ZeroTier has received via TCP" };
{ data.Add({{"protocol","tcp"},{"direction", "rx"}}) };
// Network Metrics
prometheus::simpleapi::gauge_metric_t network_num_joined
{ "zt_num_networks", "number of networks this instance is joined to" };
prometheus::simpleapi::gauge_family_t network_num_multicast_groups
{ "zt_network_multcast_groups_subscribed", "number of multicast groups networks are subscribed to" };
prometheus::simpleapi::counter_family_t network_incoming_packets
{ "zt_network_incoming_packets", "number of incoming packets per network" };
prometheus::simpleapi::counter_family_t network_outgoing_packets
{ "zt_network_outgoing_packets", "number of outgoing packets per network" };
{ "zt_network_multicast_groups_subscribed", "number of multicast groups networks are subscribed to" };
prometheus::simpleapi::counter_family_t network_packets
{ "zt_network_packets", "number of incoming/outgoing packets per network" };
#ifndef ZT_NO_PEER_METRICS
// PeerMetrics
prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency =
prometheus::Builder<prometheus::Histogram<uint64_t>>()
@ -185,12 +186,11 @@ namespace ZeroTier {
prometheus::simpleapi::gauge_family_t peer_path_count
{ "zt_peer_path_count", "number of paths to peer" };
prometheus::simpleapi::counter_family_t peer_incoming_packets
{ "zt_peer_incoming_packets", "number of incoming packets from a peer" };
prometheus::simpleapi::counter_family_t peer_outgoing_packets
{ "zt_peer_outgoing_packets", "number of outgoing packets to a peer" };
prometheus::simpleapi::counter_family_t peer_packets
{ "zt_peer_packets", "number of packets to/from a peer" };
prometheus::simpleapi::counter_family_t peer_packet_errors
{ "zt_peer_packet_errors" , "number of incoming packet errors from a peer" };
#endif
// General Controller Metrics
prometheus::simpleapi::gauge_metric_t network_count

View file

@ -96,23 +96,24 @@ namespace ZeroTier {
extern prometheus::simpleapi::counter_metric_t pkt_error_internal_server_error_out;
// Data Sent/Received Metrics
extern prometheus::simpleapi::counter_family_t data;
extern prometheus::simpleapi::counter_metric_t udp_send;
extern prometheus::simpleapi::counter_metric_t udp_recv;
extern prometheus::simpleapi::counter_metric_t tcp_send;
extern prometheus::simpleapi::counter_metric_t tcp_recv;
// Network Metrics
extern prometheus::simpleapi::gauge_metric_t network_num_joined;
extern prometheus::simpleapi::gauge_family_t network_num_multicast_groups;
extern prometheus::simpleapi::counter_family_t network_incoming_packets;
extern prometheus::simpleapi::counter_family_t network_outgoing_packets;
extern prometheus::simpleapi::gauge_metric_t network_num_joined;
extern prometheus::simpleapi::gauge_family_t network_num_multicast_groups;
extern prometheus::simpleapi::counter_family_t network_packets;
#ifndef ZT_NO_PEER_METRICS
// Peer Metrics
extern prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency;
extern prometheus::simpleapi::gauge_family_t peer_path_count;
extern prometheus::simpleapi::counter_family_t peer_incoming_packets;
extern prometheus::simpleapi::counter_family_t peer_outgoing_packets;
extern prometheus::simpleapi::gauge_family_t peer_path_count;
extern prometheus::simpleapi::counter_family_t peer_packets;
extern prometheus::simpleapi::counter_family_t peer_packet_errors;
#endif
// General Controller Metrics
extern prometheus::simpleapi::gauge_metric_t network_count;

View file

@ -569,10 +569,10 @@ Network::Network(const RuntimeEnvironment *renv,void *tPtr,uint64_t nwid,void *u
_netconfFailure(NETCONF_FAILURE_NONE),
_portError(0),
_num_multicast_groups{Metrics::network_num_multicast_groups.Add({{"network_id", _nwidStr}})},
_incoming_packets_accpeted{Metrics::network_incoming_packets.Add({{"network_id", _nwidStr},{"accepted","yes"}})},
_incoming_packets_dropped{Metrics::network_incoming_packets.Add({{"network_id", _nwidStr},{"accepted","no"}})},
_outgoing_packets_accepted{Metrics::network_outgoing_packets.Add({{"network_id", _nwidStr},{"accepted","yes"}})},
_outgoing_packets_dropped{Metrics::network_outgoing_packets.Add({{"network_id", _nwidStr},{"accepted","no"}})}
_incoming_packets_accepted{Metrics::network_packets.Add({{"direction","rx"},{"network_id", _nwidStr},{"accepted","yes"}})},
_incoming_packets_dropped{Metrics::network_packets.Add({{"direction","rx"},{"network_id", _nwidStr},{"accepted","no"}})},
_outgoing_packets_accepted{Metrics::network_packets.Add({{"direction","tx"},{"network_id", _nwidStr},{"accepted","yes"}})},
_outgoing_packets_dropped{Metrics::network_packets.Add({{"direction","tx"},{"network_id", _nwidStr},{"accepted","no"}})}
{
for(int i=0;i<ZT_NETWORK_MAX_INCOMING_UPDATES;++i) {
_incomingConfigChunks[i].ts = 0;
@ -837,7 +837,7 @@ int Network::filterIncomingPacket(
}
if (accept) {
_incoming_packets_accpeted++;
_incoming_packets_accepted++;
if (cc) {
Packet outp(cc,RR->identity.address(),Packet::VERB_EXT_FRAME);
outp.append(_id);

View file

@ -483,7 +483,7 @@ private:
AtomicCounter __refCount;
prometheus::simpleapi::gauge_metric_t _num_multicast_groups;
prometheus::simpleapi::counter_metric_t _incoming_packets_accpeted;
prometheus::simpleapi::counter_metric_t _incoming_packets_accepted;
prometheus::simpleapi::counter_metric_t _incoming_packets_dropped;
prometheus::simpleapi::counter_metric_t _outgoing_packets_accepted;
prometheus::simpleapi::counter_metric_t _outgoing_packets_dropped;

View file

@ -28,35 +28,37 @@ namespace ZeroTier {
static unsigned char s_freeRandomByteCounter = 0;
Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Identity &peerIdentity) :
RR(renv),
_lastReceive(0),
_lastNontrivialReceive(0),
_lastTriedMemorizedPath(0),
_lastDirectPathPushSent(0),
_lastDirectPathPushReceive(0),
_lastCredentialRequestSent(0),
_lastWhoisRequestReceived(0),
_lastCredentialsReceived(0),
_lastTrustEstablishedPacketReceived(0),
_lastSentFullHello(0),
_lastEchoCheck(0),
_freeRandomByte((unsigned char)((uintptr_t)this >> 4) ^ ++s_freeRandomByteCounter),
_vProto(0),
_vMajor(0),
_vMinor(0),
_vRevision(0),
_id(peerIdentity),
_directPathPushCutoffCount(0),
_echoRequestCutoffCount(0),
_localMultipathSupported(false),
_lastComputedAggregateMeanLatency(0),
_peer_latency{Metrics::peer_latency.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}}, std::vector<uint64_t>{1,3,6,10,30,60,100,300,600,1000})},
_alive_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","alive"}})},
_dead_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","dead"}})},
_incoming_packet{Metrics::peer_incoming_packets.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})},
_outgoing_packet{Metrics::peer_outgoing_packets.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})},
_packet_errors{Metrics::peer_packet_errors.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Identity &peerIdentity)
: RR(renv)
, _lastReceive(0)
, _lastNontrivialReceive(0)
, _lastTriedMemorizedPath(0)
, _lastDirectPathPushSent(0)
, _lastDirectPathPushReceive(0)
, _lastCredentialRequestSent(0)
, _lastWhoisRequestReceived(0)
, _lastCredentialsReceived(0)
, _lastTrustEstablishedPacketReceived(0)
, _lastSentFullHello(0)
, _lastEchoCheck(0)
, _freeRandomByte((unsigned char)((uintptr_t)this >> 4) ^ ++s_freeRandomByteCounter)
, _vProto(0)
, _vMajor(0)
, _vMinor(0)
, _vRevision(0)
, _id(peerIdentity)
, _directPathPushCutoffCount(0)
, _echoRequestCutoffCount(0)
, _localMultipathSupported(false)
, _lastComputedAggregateMeanLatency(0)
#ifndef ZT_NO_PEER_METRICS
, _peer_latency{Metrics::peer_latency.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}}, std::vector<uint64_t>{1,3,6,10,30,60,100,300,600,1000})}
, _alive_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","alive"}})}
, _dead_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","dead"}})}
, _incoming_packet{Metrics::peer_packets.Add({{"direction", "rx"},{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
, _outgoing_packet{Metrics::peer_packets.Add({{"direction", "tx"},{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
, _packet_errors{Metrics::peer_packet_errors.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
#endif
{
if (!myIdentity.agree(peerIdentity,_key)) {
throw ZT_EXCEPTION_INVALID_ARGUMENT;
@ -97,7 +99,9 @@ void Peer::received(
default:
break;
}
#ifndef ZT_NO_PEER_METRICS
_incoming_packet++;
#endif
recordIncomingPacket(path, packetId, payloadLength, verb, flowId, now);
if (trustEstablished) {
@ -569,6 +573,7 @@ unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
deletionOccurred = false;
}
}
#ifndef ZT_NO_PEER_METRICS
uint16_t alive_path_count_tmp = 0, dead_path_count_tmp = 0;
for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
if (_paths[i].p) {
@ -582,8 +587,11 @@ unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
}
_alive_path_count = alive_path_count_tmp;
_dead_path_count = dead_path_count_tmp;
#endif
}
#ifndef ZT_NO_PEER_METRICS
_peer_latency.Observe(latency(now));
#endif
return sent;
}
@ -658,7 +666,9 @@ void Peer::resetWithinScope(void *tPtr,InetAddress::IpScope scope,int inetAddres
void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId,
uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now)
{
#ifndef ZT_NO_PEER_METRICS
_outgoing_packet++;
#endif
if (_localMultipathSupported && _bond) {
_bond->recordOutgoingPacket(path, packetId, payloadLength, verb, flowId, now);
}
@ -666,7 +676,9 @@ void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t pack
void Peer::recordIncomingInvalidPacket(const SharedPtr<Path>& path)
{
#ifndef ZT_NO_PEER_METRICS
_packet_errors++;
#endif
if (_localMultipathSupported && _bond) {
_bond->recordIncomingInvalidPacket(path);
}

View file

@ -599,12 +599,14 @@ private:
SharedPtr<Bond> _bond;
#ifndef ZT_NO_PEER_METRICS
prometheus::Histogram<uint64_t> &_peer_latency;
prometheus::simpleapi::gauge_metric_t _alive_path_count;
prometheus::simpleapi::gauge_metric_t _dead_path_count;
prometheus::simpleapi::counter_metric_t _incoming_packet;
prometheus::simpleapi::counter_metric_t _outgoing_packet;
prometheus::simpleapi::counter_metric_t _packet_errors;
#endif
};
} // namespace ZeroTier