upleb.uk

Public git repos — served from a NIP-34 GRASP relay at git.upleb.uk

summaryrefslogtreecommitdiff
path: root/src/sync/metrics.rs
diff options
context:
space:
mode:
authorDanConwayDev <DanConwayDev@protonmail.com>2026-01-10 01:24:52 +0000
committerDanConwayDev <DanConwayDev@protonmail.com>2026-01-10 01:24:52 +0000
commitc9b3b3bd8a04de139bcb0d0b83bf819c367ee8c8 (patch)
tree80cc2c1ec92d71637408a7588d8cb908f03fc4b6 /src/sync/metrics.rs
parent9369a2885f5a3f9e38c0a3f9fa3af6260513c8e4 (diff)
Implement relay naughty list feature
Add naughty list tracking for relays with persistent infrastructure issues (DNS failures, TLS certificate errors, protocol violations) to reduce log noise and provide better visibility via metrics.

Key features:
- Classify errors into naughty (persistent) vs transient (temporary)
- Track naughty relays with category, reason, and occurrence count
- Log WARN on first naughty occurrence, DEBUG on repeats
- Automatic expiration after 12 hours (configurable)
- Prometheus metrics for monitoring naughty relays by category
- Periodic cleanup task integrated with health checker

Components added:
- src/sync/naughty_list.rs: Core naughty list tracker with error classification
- NaughtyListTracker integration in RelayHealthTracker
- Connection error handling updates in sync manager
- Naughty list metrics (total by category, detailed info per relay)
- Config option for naughty_list_expiration_hours (default: 12)

Closes DNS lookup failures and TLS certificate errors tracking issues.
Diffstat (limited to 'src/sync/metrics.rs')
-rw-r--r--src/sync/metrics.rs77
1 files changed, 77 insertions, 0 deletions
diff --git a/src/sync/metrics.rs b/src/sync/metrics.rs
index 13211b9..8a05f57 100644
--- a/src/sync/metrics.rs
+++ b/src/sync/metrics.rs
@@ -56,6 +56,12 @@ pub struct SyncMetrics {
56 rejected_cold_index_expired_total: IntCounterVec, 56 rejected_cold_index_expired_total: IntCounterVec,
57 /// Total invalidations (by event_type: announcement, state) 57 /// Total invalidations (by event_type: announcement, state)
58 rejected_invalidated_total: IntCounterVec, 58 rejected_invalidated_total: IntCounterVec,
59
60 // === Naughty List Metrics ===
61 /// Number of relays on naughty list by category
62 naughty_relays_total: IntGaugeVec,
63 /// Detailed info about naughty relays (relay, category, reason)
64 naughty_relay_info: IntGaugeVec,
59} 65}
60 66
61impl SyncMetrics { 67impl SyncMetrics {
@@ -193,6 +199,25 @@ impl SyncMetrics {
193 )?; 199 )?;
194 registry.register(Box::new(rejected_invalidated_total.clone()))?; 200 registry.register(Box::new(rejected_invalidated_total.clone()))?;
195 201
202 // Naughty list metrics
203 let naughty_relays_total = IntGaugeVec::new(
204 Opts::new(
205 "ngit_sync_naughty_relays_total",
206 "Number of relays on naughty list by category",
207 ),
208 &["category"],
209 )?;
210 registry.register(Box::new(naughty_relays_total.clone()))?;
211
212 let naughty_relay_info = IntGaugeVec::new(
213 Opts::new(
214 "ngit_sync_naughty_relay_info",
215 "Detailed info about naughty relays (occurrence count)",
216 ),
217 &["relay", "category", "reason"],
218 )?;
219 registry.register(Box::new(naughty_relay_info.clone()))?;
220
196 Ok(Self { 221 Ok(Self {
197 relay_connected, 222 relay_connected,
198 connection_attempts_total, 223 connection_attempts_total,
@@ -209,6 +234,8 @@ impl SyncMetrics {
209 rejected_cold_index_current, 234 rejected_cold_index_current,
210 rejected_cold_index_expired_total, 235 rejected_cold_index_expired_total,
211 rejected_invalidated_total, 236 rejected_invalidated_total,
237 naughty_relays_total,
238 naughty_relay_info,
212 }) 239 })
213 } 240 }
214 241
@@ -465,6 +492,56 @@ impl SyncMetrics {
465 .with_label_values(&[event_type]) 492 .with_label_values(&[event_type])
466 .inc_by(count as u64); 493 .inc_by(count as u64);
467 } 494 }
495
496 // === Naughty List Recording Methods ===
497
498 /// Update naughty list metrics from current naughty list state
499 ///
500 /// This method resets and rebuilds all naughty list metrics based on
501 /// the provided entries. Should be called periodically to keep metrics
502 /// in sync with the naughty list state.
503 ///
504 /// # Arguments
505 ///
506 /// * `entries` - Vector of (relay_url, naughty_entry) tuples from NaughtyListTracker::get_all()
507 pub fn update_naughty_list(&self, entries: Vec<(String, super::naughty_list::NaughtyEntry)>) {
508 use super::naughty_list::NaughtyCategory;
509
510 // Reset all naughty list metrics
511 self.naughty_relays_total.reset();
512 self.naughty_relay_info.reset();
513
514 // Count by category
515 let mut dns_count = 0;
516 let mut tls_count = 0;
517 let mut protocol_count = 0;
518
519 // Update metrics for each naughty relay
520 for (url, entry) in entries {
521 // Update category counts
522 match entry.category {
523 NaughtyCategory::DnsLookupFailed => dns_count += 1,
524 NaughtyCategory::TlsCertificateInvalid => tls_count += 1,
525 NaughtyCategory::ProtocolError => protocol_count += 1,
526 }
527
528 // Update detailed info (occurrence count)
529 self.naughty_relay_info
530 .with_label_values(&[&url, entry.category.as_str(), &entry.reason])
531 .set(entry.occurrence_count as i64);
532 }
533
534 // Set category totals
535 self.naughty_relays_total
536 .with_label_values(&["dns_lookup_failed"])
537 .set(dns_count);
538 self.naughty_relays_total
539 .with_label_values(&["tls_certificate_invalid"])
540 .set(tls_count);
541 self.naughty_relays_total
542 .with_label_values(&["protocol_error"])
543 .set(protocol_count);
544 }
468} 545}
469 546
470#[cfg(test)] 547#[cfg(test)]