diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-10 01:24:52 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-10 01:24:52 +0000 |
| commit | c9b3b3bd8a04de139bcb0d0b83bf819c367ee8c8 (patch) | |
| tree | 80cc2c1ec92d71637408a7588d8cb908f03fc4b6 /src/sync/metrics.rs | |
| parent | 9369a2885f5a3f9e38c0a3f9fa3af6260513c8e4 (diff) | |
Implement relay naughty list feature
Add naughty list tracking for relays with persistent infrastructure issues
(DNS failures, TLS certificate errors, protocol violations) to reduce log
noise and provide better visibility via metrics.
Key features:
- Classify errors into naughty (persistent) vs transient (temporary)
- Track naughty relays with category, reason, and occurrence count
- Log WARN on first naughty occurrence, DEBUG on repeats
- Automatic expiration after 12 hours (configurable)
- Prometheus metrics for monitoring naughty relays by category
- Periodic cleanup task integrated with health checker
Components added:
- src/sync/naughty_list.rs: Core naughty list tracker with error classification
- NaughtyListTracker integration in RelayHealthTracker
- Connection error handling updates in sync manager
- Naughty list metrics (total by category, detailed info per relay)
- Config option for naughty_list_expiration_hours (default: 12)
Closes the tracking issues for DNS lookup failures and TLS certificate errors.
Diffstat (limited to 'src/sync/metrics.rs')
| -rw-r--r-- | src/sync/metrics.rs | 77 |
1 file changed, 77 insertions, 0 deletions
diff --git a/src/sync/metrics.rs b/src/sync/metrics.rs index 13211b9..8a05f57 100644 --- a/src/sync/metrics.rs +++ b/src/sync/metrics.rs | |||
| @@ -56,6 +56,12 @@ pub struct SyncMetrics { | |||
| 56 | rejected_cold_index_expired_total: IntCounterVec, | 56 | rejected_cold_index_expired_total: IntCounterVec, |
| 57 | /// Total invalidations (by event_type: announcement, state) | 57 | /// Total invalidations (by event_type: announcement, state) |
| 58 | rejected_invalidated_total: IntCounterVec, | 58 | rejected_invalidated_total: IntCounterVec, |
| 59 | |||
| 60 | // === Naughty List Metrics === | ||
| 61 | /// Number of relays on naughty list by category | ||
| 62 | naughty_relays_total: IntGaugeVec, | ||
| 63 | /// Occurrence count per naughty relay (labels: relay, category, reason) | ||
| 64 | naughty_relay_info: IntGaugeVec, | ||
| 59 | } | 65 | } |
| 60 | 66 | ||
| 61 | impl SyncMetrics { | 67 | impl SyncMetrics { |
| @@ -193,6 +199,25 @@ impl SyncMetrics { | |||
| 193 | )?; | 199 | )?; |
| 194 | registry.register(Box::new(rejected_invalidated_total.clone()))?; | 200 | registry.register(Box::new(rejected_invalidated_total.clone()))?; |
| 195 | 201 | ||
| 202 | // Naughty list metrics | ||
| 203 | let naughty_relays_total = IntGaugeVec::new( | ||
| 204 | Opts::new( | ||
| 205 | "ngit_sync_naughty_relays_total", | ||
| 206 | "Number of relays on naughty list by category", | ||
| 207 | ), | ||
| 208 | &["category"], | ||
| 209 | )?; | ||
| 210 | registry.register(Box::new(naughty_relays_total.clone()))?; | ||
| 211 | |||
| 212 | let naughty_relay_info = IntGaugeVec::new( | ||
| 213 | Opts::new( | ||
| 214 | "ngit_sync_naughty_relay_info", | ||
| 215 | "Detailed info about naughty relays (occurrence count)", | ||
| 216 | ), | ||
| 217 | &["relay", "category", "reason"], | ||
| 218 | )?; | ||
| 219 | registry.register(Box::new(naughty_relay_info.clone()))?; | ||
| 220 | |||
| 196 | Ok(Self { | 221 | Ok(Self { |
| 197 | relay_connected, | 222 | relay_connected, |
| 198 | connection_attempts_total, | 223 | connection_attempts_total, |
| @@ -209,6 +234,8 @@ impl SyncMetrics { | |||
| 209 | rejected_cold_index_current, | 234 | rejected_cold_index_current, |
| 210 | rejected_cold_index_expired_total, | 235 | rejected_cold_index_expired_total, |
| 211 | rejected_invalidated_total, | 236 | rejected_invalidated_total, |
| 237 | naughty_relays_total, | ||
| 238 | naughty_relay_info, | ||
| 212 | }) | 239 | }) |
| 213 | } | 240 | } |
| 214 | 241 | ||
| @@ -465,6 +492,56 @@ impl SyncMetrics { | |||
| 465 | .with_label_values(&[event_type]) | 492 | .with_label_values(&[event_type]) |
| 466 | .inc_by(count as u64); | 493 | .inc_by(count as u64); |
| 467 | } | 494 | } |
| 495 | |||
| 496 | // === Naughty List Recording Methods === | ||
| 497 | |||
| 498 | /// Rebuild the naughty list gauges from a snapshot of the naughty list. | ||
| 499 | /// | ||
| 500 | /// Resets both gauge vectors, then sets one `naughty_relay_info` series per entry | ||
| 501 | /// (value = `occurrence_count`) and one `naughty_relays_total` series per category, | ||
| 502 | /// including 0 for categories with no entries. Should be called periodically to keep metrics in sync. | ||
| 503 | /// | ||
| 504 | /// # Arguments | ||
| 505 | /// | ||
| 506 | /// * `entries` - (relay_url, naughty_entry) tuples, consumed by this call; documented upstream as coming from NaughtyListTracker::get_all() (not visible here) | ||
| 507 | pub fn update_naughty_list(&self, entries: Vec<(String, super::naughty_list::NaughtyEntry)>) { | ||
| 508 | use super::naughty_list::NaughtyCategory; | ||
| 509 | |||
| 510 | // Clear both gauge vectors before rebuilding (presumably so series for relays no longer listed go away - confirm). NOTE(review): a scrape landing between this reset and the sets below may see gaps - confirm that is acceptable | ||
| 511 | self.naughty_relays_total.reset(); | ||
| 512 | self.naughty_relay_info.reset(); | ||
| 513 | |||
| 514 | // Per-category tallies; written to naughty_relays_total after the loop | ||
| 515 | let mut dns_count = 0; | ||
| 516 | let mut tls_count = 0; | ||
| 517 | let mut protocol_count = 0; | ||
| 518 | |||
| 519 | // Single pass over the entries: tally the category and publish the per-relay gauge | ||
| 520 | for (url, entry) in entries { | ||
| 521 | // Tally by category; there is no wildcard arm, so each NaughtyCategory variant is handled explicitly | ||
| 522 | match entry.category { | ||
| 523 | NaughtyCategory::DnsLookupFailed => dns_count += 1, | ||
| 524 | NaughtyCategory::TlsCertificateInvalid => tls_count += 1, | ||
| 525 | NaughtyCategory::ProtocolError => protocol_count += 1, | ||
| 526 | } | ||
| 527 | |||
| 528 | // Label values are passed in the order relay, category, reason; the gauge value is the occurrence count | ||
| 529 | self.naughty_relay_info | ||
| 530 | .with_label_values(&[&url, entry.category.as_str(), &entry.reason]) | ||
| 531 | .set(entry.occurrence_count as i64); | ||
| 532 | } | ||
| 533 | |||
| 534 | // Set all three category totals unconditionally, so empty categories report 0. Label strings are hard-coded here; presumably identical to NaughtyCategory::as_str() - TODO confirm | ||
| 535 | self.naughty_relays_total | ||
| 536 | .with_label_values(&["dns_lookup_failed"]) | ||
| 537 | .set(dns_count); | ||
| 538 | self.naughty_relays_total | ||
| 539 | .with_label_values(&["tls_certificate_invalid"]) | ||
| 540 | .set(tls_count); | ||
| 541 | self.naughty_relays_total | ||
| 542 | .with_label_values(&["protocol_error"]) | ||
| 543 | .set(protocol_count); | ||
| 544 | } | ||
| 468 | } | 545 | } |
| 469 | 546 | ||
| 470 | #[cfg(test)] | 547 | #[cfg(test)] |