Add Prometheus metrics for packet fragmentation monitoring in nodes

- Add comprehensive VL1 (ZeroTier protocol) fragmentation metrics:
  * Track fragmented packets, fragments, reassembly failures
  * Monitor fragment ordering issues and duplicates
  * Histogram for fragments per packet distribution

- Add VL2 (TAP/Ethernet) fragmentation metrics for virtual ethernet interfaces:
  * Track oversized frames from TAP devices
  * Monitor frames that would fragment or drop
  * Histogram for frame size distribution with common MTU buckets

- Integration across all TAP implementations (Linux, Mac, BSD, Windows)

This allows monitoring of fragmentation patterns for nodes participating
as members in ZeroTier networks, helping identify MTU mismatches and
optimize virtual ethernet performance.
This commit is contained in:
Aaron Johnson 2025-07-08 12:55:28 -07:00 committed by Lennon Day Reynolds
parent 5232af0a61
commit 8285e0f45b
9 changed files with 207 additions and 30 deletions

View file

@ -13,6 +13,7 @@
// clang-format off
#include <prometheus/simpleapi.h>
#include <prometheus/histogram.h>
#include "Metrics.hpp"
// clang-format on
namespace prometheus {
@ -162,5 +163,68 @@ prometheus::simpleapi::gauge_metric_t pool_avail { "controller_pgsql_available_c
prometheus::simpleapi::gauge_metric_t pool_in_use { "controller_pgsql_in_use_conns", "number of postgres database connections in use" };
prometheus::simpleapi::counter_metric_t pool_errors { "controller_pgsql_connection_errors", "number of connection errors the connection pool has seen" };
#endif
} // namespace Metrics
} // namespace ZeroTier
// Fragmentation Metrics
// Counter family exported as "zt_packet_fragmentation". Every counter defined
// from it below is one labeled series of this family; the series are told
// apart by the "layer", "direction" and "reason" labels passed to Add().
// The family must be defined before the counters that call Add() on it.
prometheus::simpleapi::counter_family_t packet_fragmentation
{ "zt_packet_fragmentation", "ZeroTier packet fragmentation events" };
// VL2 Fragmentation Metrics
// Incremented in Switch::onLocalEthernet when a frame is longer than the
// network's configured MTU.
prometheus::simpleapi::counter_metric_t vl2_oversized_frame_tx
{ packet_fragmentation.Add({{"layer", "VL2"}, {"direction", "tx"}, {"reason", "oversized_frame"}}) };
// Incremented by the tap put() implementations when the frame length exceeds
// the tap's MTU.
prometheus::simpleapi::counter_metric_t vl2_would_fragment_or_drop_rx
{ packet_fragmentation.Add({{"layer", "VL2"}, {"direction", "rx"}, {"reason", "would_fragment_or_drop"}}) };
// VL1 Fragmentation Metrics
// NOTE(review): no increment site for this counter is visible in this change -- confirm it is wired up.
prometheus::simpleapi::counter_metric_t vl1_fragmented_tx
{ packet_fragmentation.Add({{"layer", "VL1"}, {"direction", "tx"}, {"reason", "mtu_exceeded"}}) };
// NOTE(review): no increment site for this counter is visible in this change -- confirm it is wired up.
prometheus::simpleapi::counter_metric_t vl1_fragment_rx
{ packet_fragmentation.Add({{"layer", "VL1"}, {"direction", "rx"}, {"reason", "fragment"}}) };
// Incremented in Switch::onRemotePacket when tryDecode() fails on a packet
// whose fragments have all been received.
prometheus::simpleapi::counter_metric_t vl1_reassembly_failed_rx
{ packet_fragmentation.Add({{"layer", "VL1"}, {"direction", "rx"}, {"reason", "reassembly_failed"}}) };
// Incremented in Switch::onRemotePacket when a fragment arrives and the RX
// queue entry does not yet hold its packet ID.
prometheus::simpleapi::counter_metric_t vl1_fragment_without_head_rx
{ packet_fragmentation.Add({{"layer", "VL1"}, {"direction", "rx"}, {"reason", "fragment_without_head"}}) };
// Incremented in Switch::onRemotePacket when a not-yet-seen fragment is added
// to an RX queue entry that already tracks its packet ID.
prometheus::simpleapi::counter_metric_t vl1_fragment_before_head_rx
{ packet_fragmentation.Add({{"layer", "VL1"}, {"direction", "rx"}, {"reason", "fragment_before_head"}}) };
// Incremented in Switch::onRemotePacket when a fragment number is received
// that the RX queue entry already has.
prometheus::simpleapi::counter_metric_t vl1_duplicate_fragment_rx
{ packet_fragmentation.Add({{"layer", "VL1"}, {"direction", "rx"}, {"reason", "duplicate_fragment"}}) };
// Incremented in Switch::onRemotePacket on the "duplicate head" branch.
prometheus::simpleapi::counter_metric_t vl1_duplicate_head_rx
{ packet_fragmentation.Add({{"layer", "VL1"}, {"direction", "rx"}, {"reason", "duplicate_head"}}) };
// VL1 Fragmentation Histogram and Counters
// Histogram family exported as "zt_vl1_fragments_per_packet", registered with
// the same registry object the simpleapi metrics use.
prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &vl1_fragments_per_packet_histogram =
prometheus::Builder<prometheus::Histogram<uint64_t>>()
.Name("zt_vl1_fragments_per_packet")
.Help("Histogram of fragments per packet at VL1")
.Register(prometheus::simpleapi::registry);
// Incremented in Switch::doAnythingWaitingForPeer and Switch::doTimerTasks
// when a completed RX queue entry is released.
prometheus::simpleapi::counter_metric_t vl1_incomplete_reassembly_rx
{ packet_fragmentation.Add({{"layer", "VL1"}, {"direction", "rx"}, {"reason", "incomplete_reassembly"}}) };
// Incremented in Switch::_sendViaSpecificPath when a packet needs VL1
// fragmentation and the caller passed was_fragmented_at_vl2 == true.
prometheus::simpleapi::counter_metric_t vl1_vl2_double_fragmentation_tx
{ packet_fragmentation.Add({{"layer", "VL1_VL2"}, {"direction", "tx"}, {"reason", "double_fragmentation"}}) };
// The single, unlabeled ({}) histogram of the family above. Its bucket
// boundaries are copied from VL1_FRAGMENTS_PER_PACKET_BUCKETS, and
// Switch::_sendViaSpecificPath calls Observe() on it.
prometheus::Histogram<uint64_t> &vl1_fragments_per_packet_hist =
vl1_fragments_per_packet_histogram.Add(
{},
std::vector<uint64_t>(std::begin(ZeroTier::Metrics::VL1_FRAGMENTS_PER_PACKET_BUCKETS), std::end(ZeroTier::Metrics::VL1_FRAGMENTS_PER_PACKET_BUCKETS))
);
// VL2 Frame Size Histogram
// Buckets: 512 (IoT/legacy), 576 (min IPv4), 1200 (QUIC/mobile), 1280 (min IPv6),
// 1332, 1380, 1400 (VPN/overlay), 1420 (cloud), 1460 (TCP MSS), 1472 (ICMP/MTU),
// 1480 (ICMP/MTU), 1492 (PPPoE), 1500 (Ethernet), 2800 (VL2 default), 9000 (jumbo)
// Histogram family exported as "zt_vl2_frame_size", registered with the same
// registry object the simpleapi metrics use.
prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &vl2_frame_size_histogram =
prometheus::Builder<prometheus::Histogram<uint64_t>>()
.Name("zt_vl2_frame_size")
.Help("Histogram of frame sizes delivered to TAP (VL2)")
.Register(prometheus::simpleapi::registry);
// NOTE(review): neither an extern declaration nor an increment site for the
// next two counters is visible in this change -- confirm they are used.
prometheus::simpleapi::counter_metric_t vl2_incomplete_reassembly_rx
{ packet_fragmentation.Add({{"layer", "VL2"}, {"direction", "rx"}, {"reason", "incomplete_reassembly"}}) };
prometheus::simpleapi::counter_metric_t vl2_vl1_double_fragmentation_tx
{ packet_fragmentation.Add({{"layer", "VL2_VL1"}, {"direction", "tx"}, {"reason", "double_fragmentation"}}) };
// The single, unlabeled ({}) histogram of the family above. Its bucket
// boundaries are copied from VL2_FRAME_SIZE_BUCKETS, and every tap put()
// implementation calls Observe(len) on it.
prometheus::Histogram<uint64_t> &vl2_frame_size_hist =
vl2_frame_size_histogram.Add(
{},
std::vector<uint64_t>(std::begin(ZeroTier::Metrics::VL2_FRAME_SIZE_BUCKETS), std::end(ZeroTier::Metrics::VL2_FRAME_SIZE_BUCKETS))
);
}
}

View file

@ -139,6 +139,38 @@ extern prometheus::simpleapi::counter_metric_t db_get_network_list;
extern prometheus::simpleapi::counter_metric_t db_member_change;
extern prometheus::simpleapi::counter_metric_t db_network_change;
// Fragmentation Metrics
// Counter family "zt_packet_fragmentation"; the counters declared below are
// labeled series of it (labels: layer, direction, reason).
extern prometheus::simpleapi::counter_family_t packet_fragmentation;
// VL2 Fragmentation Metrics
// tx: frame longer than the network MTU seen in Switch::onLocalEthernet.
extern prometheus::simpleapi::counter_metric_t vl2_oversized_frame_tx;
// rx: frame longer than the tap MTU passed to a tap's put().
extern prometheus::simpleapi::counter_metric_t vl2_would_fragment_or_drop_rx;
// VL1 Fragmentation Metrics
extern prometheus::simpleapi::counter_metric_t vl1_fragmented_tx;
extern prometheus::simpleapi::counter_metric_t vl1_fragment_rx;
extern prometheus::simpleapi::counter_metric_t vl1_reassembly_failed_rx;
extern prometheus::simpleapi::counter_metric_t vl1_fragment_without_head_rx;
extern prometheus::simpleapi::counter_metric_t vl1_fragment_before_head_rx;
extern prometheus::simpleapi::counter_metric_t vl1_duplicate_fragment_rx;
extern prometheus::simpleapi::counter_metric_t vl1_duplicate_head_rx;
// VL1 Fragmentation Histogram and Counters
// Histogram family "zt_vl1_fragments_per_packet".
extern prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &vl1_fragments_per_packet_histogram;
extern prometheus::simpleapi::counter_metric_t vl1_incomplete_reassembly_rx;
extern prometheus::simpleapi::counter_metric_t vl1_vl2_double_fragmentation_tx;
// VL2 Frame Size Histogram
// Buckets: 512 (IoT/legacy), 576 (min IPv4), 1200 (QUIC/mobile), 1280 (min IPv6),
// 1332, 1380, 1400 (VPN/overlay), 1420 (cloud), 1460 (TCP MSS), 1472 (ICMP/MTU),
// 1480 (ICMP/MTU), 1492 (PPPoE), 1500 (Ethernet), 2800 (VL2 default), 9000 (jumbo)
// Histogram family "zt_vl2_frame_size".
extern prometheus::CustomFamily<prometheus::Histogram<uint64_t>> &vl2_frame_size_histogram;
// Histogram bucket boundaries for VL1 fragments per packet
// (inline constexpr: defined in the header, so C++17 or later is required).
inline constexpr uint64_t VL1_FRAGMENTS_PER_PACKET_BUCKETS[] = {1,2,3,4,5,6,7,8,9,10,12,16};
// Histogram bucket boundaries for VL2 frame size
// (same values, in the same order, as the "Buckets:" list above).
inline constexpr uint64_t VL2_FRAME_SIZE_BUCKETS[] = {512,576,1200,1280,1332,1380,1400,1420,1460,1472,1480,1492,1500,2800,9000};
#ifdef ZT_CONTROLLER_USE_LIBPQ
// Central Controller Metrics
extern prometheus::simpleapi::counter_metric_t pgsql_mem_notification;
@ -159,7 +191,10 @@ extern prometheus::simpleapi::gauge_metric_t pool_avail;
extern prometheus::simpleapi::gauge_metric_t pool_in_use;
extern prometheus::simpleapi::counter_metric_t pool_errors;
#endif
} // namespace Metrics
} // namespace ZeroTier
// Histogram instances that callers Observe() on directly:
// fragments per VL1 packet (observed in Switch::_sendViaSpecificPath) ...
extern prometheus::Histogram<uint64_t> &vl1_fragments_per_packet_hist;
// ... and frame length passed to a tap's put().
extern prometheus::Histogram<uint64_t> &vl2_frame_size_hist;
} // namespace Metrics
}// namespace ZeroTier
#endif // METRICS_H_

View file

@ -121,7 +121,7 @@ void Switch::onRemotePacket(void* tPtr, const int64_t localSocket, const InetAdd
Mutex::Lock rql(rq->lock);
if (rq->packetId != fragmentPacketId) {
// No packet found, so we received a fragment without its head.
Metrics::vl1_fragment_without_head_rx++;
rq->flowId = flowId;
rq->timestamp = now;
rq->packetId = fragmentPacketId;
@ -132,7 +132,7 @@ void Switch::onRemotePacket(void* tPtr, const int64_t localSocket, const InetAdd
}
else if (! (rq->haveFragments & (1 << fragmentNumber))) {
// We have other fragments and maybe the head, so add this one and check
// NOTE(review): this counter carries the reason label "fragment_before_head",
// but this branch runs for every not-yet-seen fragment of a known packet,
// whether or not the head has already arrived -- confirm that is intended.
Metrics::vl1_fragment_before_head_rx++;
rq->frags[fragmentNumber - 1] = fragment;
rq->totalFragments = totalFragments;
@ -143,14 +143,17 @@ void Switch::onRemotePacket(void* tPtr, const int64_t localSocket, const InetAdd
rq->frag0.append(rq->frags[f - 1].payload(), rq->frags[f - 1].payloadLength());
}
if (rq->frag0.tryDecode(RR, tPtr, flowId)) {
rq->timestamp = 0; // packet decoded, free entry
}
else {
rq->complete = true; // set complete flag but leave entry since it probably needs WHOIS or something
if (rq->frag0.tryDecode(RR,tPtr,flowId)) {
rq->timestamp = 0; // packet decoded, free entry
} else {
rq->complete = true; // set complete flag but leave entry since it probably needs WHOIS or something
Metrics::vl1_reassembly_failed_rx++;
}
}
} // else this is a duplicate fragment, ignore
} else {
// This is a duplicate fragment, ignore
Metrics::vl1_duplicate_fragment_rx++;
}
}
}
@ -230,19 +233,21 @@ void Switch::onRemotePacket(void* tPtr, const int64_t localSocket, const InetAdd
}
if (rq->frag0.tryDecode(RR, tPtr, flowId)) {
rq->timestamp = 0; // packet decoded, free entry
}
else {
rq->complete = true; // set complete flag but leave entry since it probably needs WHOIS or something
rq->timestamp = 0; // packet decoded, free entry
} else {
rq->complete = true; // set complete flag but leave entry since it probably needs WHOIS or something
Metrics::vl1_reassembly_failed_rx++;
}
}
else {
// Still waiting on more fragments, but keep the head
rq->frag0.init(data, len, path, now);
}
} // else this is a duplicate head, ignore
}
else {
} else {
// This is a duplicate head, ignore
Metrics::vl1_duplicate_head_rx++;
}
} else {
// Packet is unfragmented, so just process it
IncomingPacket packet(data, len, path, now);
if (! packet.tryDecode(RR, tPtr, flowId)) {
@ -268,7 +273,16 @@ void Switch::onRemotePacket(void* tPtr, const int64_t localSocket, const InetAdd
void Switch::onLocalEthernet(void* tPtr, const SharedPtr<Network>& network, const MAC& from, const MAC& to, unsigned int etherType, unsigned int vlanId, const void* data, unsigned int len)
{
if (! network->hasConfig()) {
if (!network->hasConfig()) {
return;
}
// VL2 fragmentation metric: count frames handed over by the TAP device (TX
// direction) that are longer than the MTU configured for this network.
unsigned int tap_mtu = network->config().mtu;
bool was_fragmented_at_vl2 = (len > tap_mtu);
if (was_fragmented_at_vl2) {
	// Measurement only: the frame must continue through the normal send
	// path, so this branch deliberately does not return.
	Metrics::vl2_oversized_frame_tx++;
}
@ -960,9 +974,17 @@ void Switch::doAnythingWaitingForPeer(void* tPtr, const SharedPtr<Peer>& peer)
for (unsigned int ptr = 0; ptr < ZT_RX_QUEUE_SIZE; ++ptr) {
RXQueueEntry* const rq = &(_rxQueue[ptr]);
Mutex::Lock rql(rq->lock);
if ((rq->timestamp) && (rq->complete)) {
if ((rq->frag0.tryDecode(RR, tPtr, rq->flowId)) || ((now - rq->timestamp) > ZT_RECEIVE_QUEUE_TIMEOUT)) {
if ((rq->timestamp)&&(rq->complete)) {
if ((rq->frag0.tryDecode(RR, tPtr, rq->flowId))||((now - rq->timestamp) > ZT_RECEIVE_QUEUE_TIMEOUT)) {
rq->timestamp = 0;
if ((now - rq->timestamp) > ZT_RECEIVE_QUEUE_TIMEOUT) {
Metrics::vl1_incomplete_reassembly_rx++;
}
} else {
const Address src(rq->frag0.source());
if (!RR->topology->getPeer(tPtr,src)) {
requestWhois(tPtr,now,src);
}
}
}
}
@ -1019,9 +1041,12 @@ unsigned long Switch::doTimerTasks(void* tPtr, int64_t now)
for (unsigned int ptr = 0; ptr < ZT_RX_QUEUE_SIZE; ++ptr) {
RXQueueEntry* const rq = &(_rxQueue[ptr]);
Mutex::Lock rql(rq->lock);
if ((rq->timestamp) && (rq->complete)) {
if ((rq->frag0.tryDecode(RR, tPtr, rq->flowId)) || ((now - rq->timestamp) > ZT_RECEIVE_QUEUE_TIMEOUT)) {
if ((rq->timestamp)&&(rq->complete)) {
if ((rq->frag0.tryDecode(RR, tPtr, rq->flowId))||((now - rq->timestamp) > ZT_RECEIVE_QUEUE_TIMEOUT)) {
rq->timestamp = 0;
if ((now - rq->timestamp) > ZT_RECEIVE_QUEUE_TIMEOUT) {
Metrics::vl1_incomplete_reassembly_rx++;
}
}
else {
const Address src(rq->frag0.source());
@ -1084,7 +1109,7 @@ bool Switch::_trySend(void* tPtr, Packet& packet, bool encrypt, int32_t flowId)
for (int i = 0; i < ZT_MAX_PEER_NETWORK_PATHS; ++i) {
if (peer->_paths[i].p && peer->_paths[i].p->alive(now)) {
uint16_t userSpecifiedMtu = peer->_paths[i].p->mtu();
_sendViaSpecificPath(tPtr, peer, peer->_paths[i].p, userSpecifiedMtu, now, packet, encrypt, flowId);
_sendViaSpecificPath(tPtr, peer, peer->_paths[i].p, userSpecifiedMtu, now, packet, encrypt, flowId, false);
}
}
return true;
@ -1102,7 +1127,7 @@ bool Switch::_trySend(void* tPtr, Packet& packet, bool encrypt, int32_t flowId)
}
if (viaPath) {
uint16_t userSpecifiedMtu = viaPath->mtu();
_sendViaSpecificPath(tPtr, peer, viaPath, userSpecifiedMtu, now, packet, encrypt, flowId);
_sendViaSpecificPath(tPtr, peer, viaPath, userSpecifiedMtu, now, packet, encrypt, flowId, false);
return true;
}
}
@ -1110,7 +1135,7 @@ bool Switch::_trySend(void* tPtr, Packet& packet, bool encrypt, int32_t flowId)
return false;
}
void Switch::_sendViaSpecificPath(void* tPtr, SharedPtr<Peer> peer, SharedPtr<Path> viaPath, uint16_t userSpecifiedMtu, int64_t now, Packet& packet, bool encrypt, int32_t flowId)
void Switch::_sendViaSpecificPath(void* tPtr, SharedPtr<Peer> peer, SharedPtr<Path> viaPath, uint16_t userSpecifiedMtu, int64_t now, Packet& packet, bool encrypt, int32_t flowId, bool was_fragmented_at_vl2)
{
unsigned int mtu = ZT_DEFAULT_PHYSMTU;
uint64_t trustedPathId = 0;
@ -1137,6 +1162,11 @@ void Switch::_sendViaSpecificPath(void* tPtr, SharedPtr<Peer> peer, SharedPtr<Pa
if (viaPath->send(RR, tPtr, packet.data(), chunkSize, now)) {
if (chunkSize < packet.size()) {
// Too big for one packet, fragment the rest
// The fragments-per-packet histogram is fed exactly once per fragmented
// packet, further down, after totalFragments has been computed. Observing a
// placeholder value here as well would record two samples for every
// fragmented packet and skew the distribution.
// NOTE(review): both call sites visible in this change pass
// was_fragmented_at_vl2 == false, so this counter cannot increment yet.
if (was_fragmented_at_vl2) {
	Metrics::vl1_vl2_double_fragmentation_tx++;
}
unsigned int fragStart = chunkSize;
unsigned int remaining = packet.size() - chunkSize;
unsigned int fragsRemaining = (remaining / (mtu - ZT_PROTO_MIN_FRAGMENT_LENGTH));
@ -1144,6 +1174,7 @@ void Switch::_sendViaSpecificPath(void* tPtr, SharedPtr<Peer> peer, SharedPtr<Pa
++fragsRemaining;
}
const unsigned int totalFragments = fragsRemaining + 1;
Metrics::vl1_fragments_per_packet_hist.Observe(totalFragments);
for (unsigned int fno = 1; fno < totalFragments; ++fno) {
chunkSize = std::min(remaining, (unsigned int)(mtu - ZT_PROTO_MIN_FRAGMENT_LENGTH));

View file

@ -206,7 +206,7 @@ class Switch {
private:
bool _shouldUnite(const int64_t now, const Address& source, const Address& destination);
bool _trySend(void* tPtr, Packet& packet, bool encrypt, int32_t flowId = ZT_QOS_NO_FLOW); // packet is modified if return is true
void _sendViaSpecificPath(void* tPtr, SharedPtr<Peer> peer, SharedPtr<Path> viaPath, uint16_t userSpecifiedMtu, int64_t now, Packet& packet, bool encrypt, int32_t flowId);
// Sends `packet` over `viaPath`, splitting it into VL1 fragments when it does
// not fit in one chunk. `was_fragmented_at_vl2` is used only for metrics: when
// it is true and the packet has to be fragmented at VL1, the
// vl1_vl2_double_fragmentation_tx counter is incremented.
void _sendViaSpecificPath(void* tPtr, SharedPtr<Peer> peer, SharedPtr<Path> viaPath, uint16_t userSpecifiedMtu, int64_t now, Packet& packet, bool encrypt, int32_t flowId, bool was_fragmented_at_vl2);
void _recordOutgoingPacketMetrics(const Packet& p);
const RuntimeEnvironment* const RR;

View file

@ -51,6 +51,13 @@
#include <unistd.h>
#include <utility>
#include "../node/Constants.hpp"
#include "../node/Utils.hpp"
#include "../node/Mutex.hpp"
#include "OSUtils.hpp"
#include "BSDEthernetTap.hpp"
#include "../node/Metrics.hpp"
#define ZT_BASE32_CHARS "0123456789abcdefghijklmnopqrstuv"
#define ZT_TAP_BUF_SIZE (1024 * 16)
@ -353,6 +360,11 @@ std::vector<InetAddress> BSDEthernetTap::ips() const
void BSDEthernetTap::put(const MAC& from, const MAC& to, unsigned int etherType, const void* data, unsigned int len)
{
// VL2 frame size histogram
Metrics::vl2_frame_size_hist.Observe(len);
if (len > this->_mtu) {
Metrics::vl2_would_fragment_or_drop_rx++;
}
char putBuf[ZT_MAX_MTU + 64];
if ((_fd > 0) && (len <= _mtu) && (_enabled)) {
to.copyTo(putBuf, 6);

View file

@ -16,6 +16,7 @@
#endif
#include "../node/Constants.hpp"
#include "../node/Metrics.hpp"
#ifdef __LINUX__
@ -507,6 +508,11 @@ std::vector<InetAddress> LinuxEthernetTap::ips() const
void LinuxEthernetTap::put(const MAC& from, const MAC& to, unsigned int etherType, const void* data, unsigned int len)
{
// VL2 frame size histogram
ZeroTier::Metrics::vl2_frame_size_hist.Observe(len);
if (len > this->_mtu) {
ZeroTier::Metrics::vl2_would_fragment_or_drop_rx++;
}
char putBuf[ZT_MAX_MTU + 64];
if ((_fd > 0) && (len <= _mtu) && (_enabled)) {
to.copyTo(putBuf, 6);

View file

@ -22,6 +22,7 @@
#include "MacEthernetTap.hpp"
#include "MacEthernetTapAgent.h"
#include "OSUtils.hpp"
#include "../node/Metrics.hpp"
#include <algorithm>
#include <arpa/inet.h>
@ -393,6 +394,11 @@ std::vector<InetAddress> MacEthernetTap::ips() const
void MacEthernetTap::put(const MAC& from, const MAC& to, unsigned int etherType, const void* data, unsigned int len)
{
// VL2 frame size histogram
Metrics::vl2_frame_size_hist.Observe(len);
if (len > this->_mtu) {
Metrics::vl2_would_fragment_or_drop_rx++;
}
struct iovec iov[3];
unsigned char hdr[15];
uint16_t l;

View file

@ -51,6 +51,15 @@
#include <sys/wait.h>
#include <unistd.h>
#include <utility>
#include "../node/Constants.hpp"
#include "../node/Utils.hpp"
#include "../node/Mutex.hpp"
#include "OSUtils.hpp"
#include "NetBSDEthernetTap.hpp"
#include "../node/Metrics.hpp"
#include <iostream>
using namespace std;
#define ZT_BASE32_CHARS "0123456789abcdefghijklmnopqrstuv"
@ -328,6 +337,12 @@ std::vector<InetAddress> NetBSDEthernetTap::ips() const
void NetBSDEthernetTap::put(const MAC& from, const MAC& to, unsigned int etherType, const void* data, unsigned int len)
{
// VL2 frame size histogram
Metrics::vl2_frame_size_hist.Observe(len);
if (len > this->_mtu) {
Metrics::vl2_would_fragment_or_drop_rx++;
}
char putBuf[4096];
if ((_fd > 0) && (len <= _mtu) && (_enabled)) {
to.copyTo(putBuf, 6);

View file

@ -16,9 +16,10 @@
#include "../node/Constants.hpp"
#include "../node/Mutex.hpp"
#include "../node/Utils.hpp"
#include "..\windows\TapDriver6\tap-windows.h"
#include "../windows/TapDriver6/tap-windows.h"
#include "OSUtils.hpp"
#include "WinDNSHelper.hpp"
#include "../node/Metrics.hpp"
#include <IPHlpApi.h>
#include <SetupAPI.h>
@ -816,7 +817,14 @@ std::vector<InetAddress> WindowsEthernetTap::ips() const
void WindowsEthernetTap::put(const MAC& from, const MAC& to, unsigned int etherType, const void* data, unsigned int len)
{
if ((! _initialized) || (! _enabled) || (_tap == INVALID_HANDLE_VALUE) || (len > _mtu))
// Check MTU and add to histogram
ZeroTier::Metrics::vl2_frame_size_hist.Observe(len);
if (len > this->_mtu) {
ZeroTier::Metrics::vl2_would_fragment_or_drop_rx++;
return;
}
if ((! _initialized) || (! _enabled) || (_tap == INVALID_HANDLE_VALUE))
return;
Mutex::Lock _l(_injectPending_m);