From 00d55fc4b407eb91382ea412f99b007631f923b5 Mon Sep 17 00:00:00 2001
From: Grant Limberg <glimberg@users.noreply.github.com>
Date: Thu, 4 May 2023 11:12:55 -0700
Subject: [PATCH] Metrics consolidation (#1997)

* Rename zt_packet_incoming -> zt_packet

Also consolidate the per-peer incoming/outgoing packet counters into a single zt_peer_packets metric with tx and rx labels.  Same for zt_tcp_data and zt_udp_data

* Further collapse tcp & udp into metric labels for zt_data

* Fix zt_data metric description

* zt_peer_packets description fix

* Consolidate incoming/outgoing network packets to a single metric

* zt_packet_incoming_error -> zt_packet_error

* Disable peer metrics for central controllers

Can change in the future if needed, but given the traffic our controllers serve, that's going to be a *lot* of data

* Disable peer metrics for controllers pt 2
---
 make-linux.mk    |  2 +-
 node/Metrics.cpp | 34 +++++++++++------------
 node/Metrics.hpp | 15 ++++++-----
 node/Network.cpp | 10 +++----
 node/Network.hpp |  2 +-
 node/Peer.cpp    | 70 ++++++++++++++++++++++++++++--------------------
 node/Peer.hpp    |  2 ++
 7 files changed, 75 insertions(+), 60 deletions(-)

diff --git a/make-linux.mk b/make-linux.mk
index ede843c5f..3b448afb0 100644
--- a/make-linux.mk
+++ b/make-linux.mk
@@ -311,7 +311,7 @@ endif
 ifeq ($(ZT_CONTROLLER),1)
 	override CXXFLAGS+=-Wall -Wno-deprecated -std=c++17 -pthread $(INCLUDES) -DNDEBUG $(DEFS)
 	override LDLIBS+=-Lext/libpqxx-7.7.3/install/ubuntu22.04/lib -lpqxx -lpq ext/hiredis-1.0.2/lib/ubuntu22.04/libhiredis.a ext/redis-plus-plus-1.3.3/install/ubuntu22.04/lib/libredis++.a -lssl -lcrypto
-	override DEFS+=-DZT_CONTROLLER_USE_LIBPQ
+	override DEFS+=-DZT_CONTROLLER_USE_LIBPQ -DZT_NO_PEER_METRICS
 	override INCLUDES+=-I/usr/include/postgresql -Iext/libpqxx-7.7.3/install/ubuntu22.04/include -Iext/hiredis-1.0.2/include/ -Iext/redis-plus-plus-1.3.3/install/ubuntu22.04/include/sw/
 endif
 
diff --git a/node/Metrics.cpp b/node/Metrics.cpp
index e20f06c32..ba168bcc9 100644
--- a/node/Metrics.cpp
+++ b/node/Metrics.cpp
@@ -25,7 +25,7 @@ namespace ZeroTier {
     namespace Metrics {
         // Packet Type Counts
         prometheus::simpleapi::counter_family_t packets
-        { "zt_packet_incoming", "incoming packet type counts"};
+        { "zt_packet", "incoming packet type counts"};
 
         // Incoming packets
         prometheus::simpleapi::counter_metric_t pkt_nop_in
@@ -118,7 +118,7 @@ namespace ZeroTier {
 
         // Packet Error Counts
         prometheus::simpleapi::counter_family_t packet_errors
-        { "zt_packet_incoming_error", "incoming packet errors"};
+        { "zt_packet_error", "incoming packet errors"};
 
         // Incoming Error Counts
         prometheus::simpleapi::counter_metric_t pkt_error_obj_not_found_in
@@ -157,25 +157,26 @@ namespace ZeroTier {
         { packet_errors.Add({{"error_type", "internal_server_error"}, {"direction", "tx"}}) };
 
         // Data Sent/Received Metrics
-        prometheus::simpleapi::counter_metric_t udp_send
-        { "zt_udp_data_sent", "number of bytes ZeroTier has sent via UDP" };
+        prometheus::simpleapi::counter_family_t data
+        { "zt_data", "number of bytes ZeroTier has transmitted or received" };
         prometheus::simpleapi::counter_metric_t udp_recv
-        { "zt_udp_data_recv", "number of bytes ZeroTier has received via UDP" };
+        { data.Add({{"protocol","udp"},{"direction","rx"}}) };
+        prometheus::simpleapi::counter_metric_t udp_send
+        { data.Add({{"protocol","udp"},{"direction","tx"}}) };
         prometheus::simpleapi::counter_metric_t tcp_send
-        { "zt_tcp_data_sent", "number of bytes ZeroTier has sent via TCP" };
+        { data.Add({{"protocol","tcp"},{"direction", "tx"}}) };
         prometheus::simpleapi::counter_metric_t tcp_recv
-        { "zt_tcp_data_recv", "number of bytes ZeroTier has received via TCP" };
+        { data.Add({{"protocol","tcp"},{"direction", "rx"}}) };
 
         // Network Metrics
         prometheus::simpleapi::gauge_metric_t network_num_joined
         { "zt_num_networks", "number of networks this instance is joined to" };
         prometheus::simpleapi::gauge_family_t network_num_multicast_groups
-        { "zt_network_multcast_groups_subscribed", "number of multicast groups networks are subscribed to" };
-        prometheus::simpleapi::counter_family_t network_incoming_packets
-        { "zt_network_incoming_packets", "number of incoming packets per network" };
-        prometheus::simpleapi::counter_family_t network_outgoing_packets
-        { "zt_network_outgoing_packets", "number of outgoing packets per network" };
-
+        { "zt_network_multicast_groups_subscribed", "number of multicast groups networks are subscribed to" };
+        prometheus::simpleapi::counter_family_t network_packets
+        { "zt_network_packets", "number of incoming/outgoing packets per network" };
+        
+#ifndef ZT_NO_PEER_METRICS
         // PeerMetrics
         prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency = 
         prometheus::Builder<prometheus::Histogram<uint64_t>>()
@@ -185,12 +186,11 @@ namespace ZeroTier {
     
         prometheus::simpleapi::gauge_family_t peer_path_count
         { "zt_peer_path_count", "number of paths to peer" };
-        prometheus::simpleapi::counter_family_t peer_incoming_packets
-        { "zt_peer_incoming_packets", "number of incoming packets from a peer" };
-        prometheus::simpleapi::counter_family_t peer_outgoing_packets
-        { "zt_peer_outgoing_packets", "number of outgoing packets to a peer" };
+        prometheus::simpleapi::counter_family_t peer_packets
+        { "zt_peer_packets", "number of packets to/from a peer" };
         prometheus::simpleapi::counter_family_t peer_packet_errors
         { "zt_peer_packet_errors" , "number of incoming packet errors from a peer" };
+#endif
 
         // General Controller Metrics
         prometheus::simpleapi::gauge_metric_t   network_count
diff --git a/node/Metrics.hpp b/node/Metrics.hpp
index f78a0f157..66b97c0d6 100644
--- a/node/Metrics.hpp
+++ b/node/Metrics.hpp
@@ -96,23 +96,24 @@ namespace ZeroTier {
         extern prometheus::simpleapi::counter_metric_t pkt_error_internal_server_error_out;
 
         // Data Sent/Received Metrics
+        extern prometheus::simpleapi::counter_family_t data;
         extern prometheus::simpleapi::counter_metric_t udp_send;
         extern prometheus::simpleapi::counter_metric_t udp_recv;
         extern prometheus::simpleapi::counter_metric_t tcp_send;
         extern prometheus::simpleapi::counter_metric_t tcp_recv;
 
         // Network Metrics
-        extern prometheus::simpleapi::gauge_metric_t network_num_joined;
-        extern prometheus::simpleapi::gauge_family_t network_num_multicast_groups;
-        extern prometheus::simpleapi::counter_family_t network_incoming_packets;
-        extern prometheus::simpleapi::counter_family_t network_outgoing_packets;
+        extern prometheus::simpleapi::gauge_metric_t   network_num_joined;
+        extern prometheus::simpleapi::gauge_family_t   network_num_multicast_groups;
+        extern prometheus::simpleapi::counter_family_t network_packets;
 
+#ifndef ZT_NO_PEER_METRICS
         // Peer Metrics
         extern prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &peer_latency;
-        extern prometheus::simpleapi::gauge_family_t peer_path_count;
-        extern prometheus::simpleapi::counter_family_t peer_incoming_packets;
-        extern prometheus::simpleapi::counter_family_t peer_outgoing_packets;
+        extern prometheus::simpleapi::gauge_family_t   peer_path_count;
+        extern prometheus::simpleapi::counter_family_t peer_packets;
         extern prometheus::simpleapi::counter_family_t peer_packet_errors;
+#endif
 
         // General Controller Metrics
         extern prometheus::simpleapi::gauge_metric_t   network_count;
diff --git a/node/Network.cpp b/node/Network.cpp
index 10436aedb..1e77e4636 100644
--- a/node/Network.cpp
+++ b/node/Network.cpp
@@ -569,10 +569,10 @@ Network::Network(const RuntimeEnvironment *renv,void *tPtr,uint64_t nwid,void *u
 	_netconfFailure(NETCONF_FAILURE_NONE),
 	_portError(0),
 	_num_multicast_groups{Metrics::network_num_multicast_groups.Add({{"network_id", _nwidStr}})},
-	_incoming_packets_accpeted{Metrics::network_incoming_packets.Add({{"network_id", _nwidStr},{"accepted","yes"}})},
-	_incoming_packets_dropped{Metrics::network_incoming_packets.Add({{"network_id", _nwidStr},{"accepted","no"}})},
-	_outgoing_packets_accepted{Metrics::network_outgoing_packets.Add({{"network_id", _nwidStr},{"accepted","yes"}})},
-	_outgoing_packets_dropped{Metrics::network_outgoing_packets.Add({{"network_id", _nwidStr},{"accepted","no"}})}
+	_incoming_packets_accepted{Metrics::network_packets.Add({{"direction","rx"},{"network_id", _nwidStr},{"accepted","yes"}})},
+	_incoming_packets_dropped{Metrics::network_packets.Add({{"direction","rx"},{"network_id", _nwidStr},{"accepted","no"}})},
+	_outgoing_packets_accepted{Metrics::network_packets.Add({{"direction","tx"},{"network_id", _nwidStr},{"accepted","yes"}})},
+	_outgoing_packets_dropped{Metrics::network_packets.Add({{"direction","tx"},{"network_id", _nwidStr},{"accepted","no"}})}
 {
 	for(int i=0;i<ZT_NETWORK_MAX_INCOMING_UPDATES;++i) {
 		_incomingConfigChunks[i].ts = 0;
@@ -837,7 +837,7 @@ int Network::filterIncomingPacket(
 	}
 
 	if (accept) {
-		_incoming_packets_accpeted++;
+		_incoming_packets_accepted++;
 		if (cc) {
 			Packet outp(cc,RR->identity.address(),Packet::VERB_EXT_FRAME);
 			outp.append(_id);
diff --git a/node/Network.hpp b/node/Network.hpp
index 676e5556e..a3bce14af 100644
--- a/node/Network.hpp
+++ b/node/Network.hpp
@@ -483,7 +483,7 @@ private:
 	AtomicCounter __refCount;
 
 	prometheus::simpleapi::gauge_metric_t _num_multicast_groups;
-	prometheus::simpleapi::counter_metric_t _incoming_packets_accpeted;
+	prometheus::simpleapi::counter_metric_t _incoming_packets_accepted;
 	prometheus::simpleapi::counter_metric_t _incoming_packets_dropped;
 	prometheus::simpleapi::counter_metric_t _outgoing_packets_accepted;
 	prometheus::simpleapi::counter_metric_t _outgoing_packets_dropped;
diff --git a/node/Peer.cpp b/node/Peer.cpp
index a08bebbf7..6fcf193d9 100644
--- a/node/Peer.cpp
+++ b/node/Peer.cpp
@@ -28,35 +28,37 @@ namespace ZeroTier {
 
 static unsigned char s_freeRandomByteCounter = 0;
 
-Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Identity &peerIdentity) :
-	RR(renv),
-	_lastReceive(0),
-	_lastNontrivialReceive(0),
-	_lastTriedMemorizedPath(0),
-	_lastDirectPathPushSent(0),
-	_lastDirectPathPushReceive(0),
-	_lastCredentialRequestSent(0),
-	_lastWhoisRequestReceived(0),
-	_lastCredentialsReceived(0),
-	_lastTrustEstablishedPacketReceived(0),
-	_lastSentFullHello(0),
-	_lastEchoCheck(0),
-	_freeRandomByte((unsigned char)((uintptr_t)this >> 4) ^ ++s_freeRandomByteCounter),
-	_vProto(0),
-	_vMajor(0),
-	_vMinor(0),
-	_vRevision(0),
-	_id(peerIdentity),
-	_directPathPushCutoffCount(0),
-	_echoRequestCutoffCount(0),
-	_localMultipathSupported(false),
-	_lastComputedAggregateMeanLatency(0),
-	_peer_latency{Metrics::peer_latency.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}}, std::vector<uint64_t>{1,3,6,10,30,60,100,300,600,1000})},
-	_alive_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","alive"}})},
-	_dead_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","dead"}})},
-	_incoming_packet{Metrics::peer_incoming_packets.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})},
-	_outgoing_packet{Metrics::peer_outgoing_packets.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})},
-	_packet_errors{Metrics::peer_packet_errors.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
+Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Identity &peerIdentity) 
+	: RR(renv)
+	, _lastReceive(0)
+	, _lastNontrivialReceive(0)
+	, _lastTriedMemorizedPath(0)
+	, _lastDirectPathPushSent(0)
+	, _lastDirectPathPushReceive(0)
+	, _lastCredentialRequestSent(0)
+	, _lastWhoisRequestReceived(0)
+	, _lastCredentialsReceived(0)
+	, _lastTrustEstablishedPacketReceived(0)
+	, _lastSentFullHello(0)
+	, _lastEchoCheck(0)
+	, _freeRandomByte((unsigned char)((uintptr_t)this >> 4) ^ ++s_freeRandomByteCounter)
+	, _vProto(0)
+	, _vMajor(0)
+	, _vMinor(0)
+	, _vRevision(0)
+	, _id(peerIdentity)
+	, _directPathPushCutoffCount(0)
+	, _echoRequestCutoffCount(0)
+	, _localMultipathSupported(false)
+	, _lastComputedAggregateMeanLatency(0)
+#ifndef ZT_NO_PEER_METRICS
+	, _peer_latency{Metrics::peer_latency.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}}, std::vector<uint64_t>{1,3,6,10,30,60,100,300,600,1000})}
+	, _alive_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","alive"}})}
+	, _dead_path_count{Metrics::peer_path_count.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())},{"status","dead"}})}
+	, _incoming_packet{Metrics::peer_packets.Add({{"direction", "rx"},{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
+	, _outgoing_packet{Metrics::peer_packets.Add({{"direction", "tx"},{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
+	, _packet_errors{Metrics::peer_packet_errors.Add({{"node_id", OSUtils::nodeIDStr(peerIdentity.address().toInt())}})}
+#endif
 {
 	if (!myIdentity.agree(peerIdentity,_key)) {
 		throw ZT_EXCEPTION_INVALID_ARGUMENT;
@@ -97,7 +99,9 @@ void Peer::received(
 		default:
 			break;
 	}
+#ifndef ZT_NO_PEER_METRICS
 	_incoming_packet++;
+#endif
 	recordIncomingPacket(path, packetId, payloadLength, verb, flowId, now);
 
 	if (trustEstablished) {
@@ -569,6 +573,7 @@ unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
 				deletionOccurred = false;
 			}
 		}
+#ifndef ZT_NO_PEER_METRICS
 		uint16_t alive_path_count_tmp = 0, dead_path_count_tmp = 0;
 		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 			if (_paths[i].p) {
@@ -582,8 +587,11 @@ unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
 		}
 		_alive_path_count = alive_path_count_tmp;
 		_dead_path_count = dead_path_count_tmp;
+#endif
 	}
+#ifndef ZT_NO_PEER_METRICS
 	_peer_latency.Observe(latency(now));
+#endif
 	return sent;
 }
 
@@ -658,7 +666,9 @@ void Peer::resetWithinScope(void *tPtr,InetAddress::IpScope scope,int inetAddres
 void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId,
 	uint16_t payloadLength, const Packet::Verb verb, const int32_t flowId, int64_t now)
 {
+#ifndef ZT_NO_PEER_METRICS
 	_outgoing_packet++;
+#endif
 	if (_localMultipathSupported && _bond) {
 		_bond->recordOutgoingPacket(path, packetId, payloadLength, verb, flowId, now);
 	}
@@ -666,7 +676,9 @@ void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t pack
 
 void Peer::recordIncomingInvalidPacket(const SharedPtr<Path>& path)
 {
+#ifndef ZT_NO_PEER_METRICS
 	_packet_errors++;
+#endif
 	if (_localMultipathSupported && _bond) {
 		_bond->recordIncomingInvalidPacket(path);
 	}
diff --git a/node/Peer.hpp b/node/Peer.hpp
index cd6b871fe..d03e8f884 100644
--- a/node/Peer.hpp
+++ b/node/Peer.hpp
@@ -599,12 +599,14 @@ private:
 
 	SharedPtr<Bond> _bond;
 
+#ifndef ZT_NO_PEER_METRICS
 	prometheus::Histogram<uint64_t> &_peer_latency;
 	prometheus::simpleapi::gauge_metric_t _alive_path_count;
 	prometheus::simpleapi::gauge_metric_t _dead_path_count;
 	prometheus::simpleapi::counter_metric_t _incoming_packet;
 	prometheus::simpleapi::counter_metric_t _outgoing_packet;
 	prometheus::simpleapi::counter_metric_t _packet_errors;
+#endif
 };
 
 } // namespace ZeroTier