diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-10 01:24:52 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-10 01:24:52 +0000 |
| commit | c9b3b3bd8a04de139bcb0d0b83bf819c367ee8c8 (patch) | |
| tree | 80cc2c1ec92d71637408a7588d8cb908f03fc4b6 /src | |
| parent | 9369a2885f5a3f9e38c0a3f9fa3af6260513c8e4 (diff) | |
Implement relay naughty list feature
Add naughty list tracking for relays with persistent infrastructure issues
(DNS failures, TLS certificate errors, protocol violations) to reduce log
noise and provide better visibility via metrics.
Key features:
- Classify errors into naughty (persistent) vs transient (temporary)
- Track naughty relays with category, reason, and occurrence count
- Log WARN on first naughty occurrence, DEBUG on repeats
- Automatic expiration after 12 hours (configurable)
- Prometheus metrics for monitoring naughty relays by category
- Periodic cleanup task integrated with health checker
Components added:
- src/sync/naughty_list.rs: Core naughty list tracker with error classification
- NaughtyListTracker integration in RelayHealthTracker
- Connection error handling updates in sync manager
- Naughty list metrics (total by category, detailed info per relay)
- Config option for naughty_list_expiration_hours (default: 12)
Closes the tracking issues for DNS lookup failures and TLS certificate errors.
Diffstat (limited to 'src')
| -rw-r--r-- | src/config.rs | 7 | ||||
| -rw-r--r-- | src/sync/health.rs | 15 | ||||
| -rw-r--r-- | src/sync/metrics.rs | 77 | ||||
| -rw-r--r-- | src/sync/mod.rs | 53 | ||||
| -rw-r--r-- | src/sync/naughty_list.rs | 546 |
5 files changed, 696 insertions, 2 deletions
diff --git a/src/config.rs b/src/config.rs index 44001d8..74327c9 100644 --- a/src/config.rs +++ b/src/config.rs | |||
| @@ -156,6 +156,12 @@ pub struct Config { | |||
| 156 | default_value_t = 604800 | 156 | default_value_t = 604800 |
| 157 | )] | 157 | )] |
| 158 | pub rejected_cold_index_expiry_secs: u64, | 158 | pub rejected_cold_index_expiry_secs: u64, |
| 159 | |||
| 160 | /// Hours before removing relay from naughty list (default: 12) | ||
| 161 | /// Relays with persistent infrastructure issues (DNS, TLS, protocol errors) are | ||
| 162 | /// tracked separately and retried after this expiration period. | ||
| 163 | #[arg(long, env = "NGIT_NAUGHTY_LIST_EXPIRATION_HOURS", default_value_t = 12)] | ||
| 164 | pub naughty_list_expiration_hours: u64, | ||
| 159 | } | 165 | } |
| 160 | 166 | ||
| 161 | impl Config { | 167 | impl Config { |
| @@ -281,6 +287,7 @@ impl Config { | |||
| 281 | sync_disable_negentropy: false, | 287 | sync_disable_negentropy: false, |
| 282 | rejected_hot_cache_duration_secs: 120, | 288 | rejected_hot_cache_duration_secs: 120, |
| 283 | rejected_cold_index_expiry_secs: 604800, | 289 | rejected_cold_index_expiry_secs: 604800, |
| 290 | naughty_list_expiration_hours: 12, | ||
| 284 | } | 291 | } |
| 285 | } | 292 | } |
| 286 | } | 293 | } |
diff --git a/src/sync/health.rs b/src/sync/health.rs index 2948707..833918b 100644 --- a/src/sync/health.rs +++ b/src/sync/health.rs | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | //! - Exponential backoff with configurable max delay | 5 | //! - Exponential backoff with configurable max delay |
| 6 | //! - Dead relay detection after 24h of continuous failures | 6 | //! - Dead relay detection after 24h of continuous failures |
| 7 | //! - Rate limit detection and fixed cooldown period | 7 | //! - Rate limit detection and fixed cooldown period |
| 8 | //! - Naughty list for persistent infrastructure issues (DNS, TLS, protocol errors) | ||
| 8 | //! | 9 | //! |
| 9 | //! ## Health States | 10 | //! ## Health States |
| 10 | //! | 11 | //! |
| @@ -18,6 +19,7 @@ use std::time::{Duration, Instant}; | |||
| 18 | 19 | ||
| 19 | use dashmap::DashMap; | 20 | use dashmap::DashMap; |
| 20 | 21 | ||
| 22 | use super::naughty_list::NaughtyListTracker; | ||
| 21 | use crate::config::Config; | 23 | use crate::config::Config; |
| 22 | 24 | ||
| 23 | /// Duration threshold before a relay is considered dead (24 hours) | 25 | /// Duration threshold before a relay is considered dead (24 hours) |
| @@ -213,15 +215,21 @@ pub struct RelayHealthTracker { | |||
| 213 | health: DashMap<String, RelayHealth>, | 215 | health: DashMap<String, RelayHealth>, |
| 214 | max_backoff_secs: u64, | 216 | max_backoff_secs: u64, |
| 215 | base_backoff_secs: u64, | 217 | base_backoff_secs: u64, |
| 218 | naughty_list: Option<Arc<NaughtyListTracker>>, | ||
| 216 | } | 219 | } |
| 217 | 220 | ||
| 218 | impl RelayHealthTracker { | 221 | impl RelayHealthTracker { |
| 219 | /// Create a new RelayHealthTracker | 222 | /// Create a new RelayHealthTracker |
| 220 | pub fn new(config: &Config) -> Self { | 223 | pub fn new(config: &Config) -> Self { |
| 224 | let naughty_list = Some(Arc::new(NaughtyListTracker::new( | ||
| 225 | config.naughty_list_expiration_hours, | ||
| 226 | ))); | ||
| 227 | |||
| 221 | Self { | 228 | Self { |
| 222 | health: DashMap::new(), | 229 | health: DashMap::new(), |
| 223 | max_backoff_secs: config.sync_max_backoff_secs, | 230 | max_backoff_secs: config.sync_max_backoff_secs, |
| 224 | base_backoff_secs: config.sync_base_backoff_secs, | 231 | base_backoff_secs: config.sync_base_backoff_secs, |
| 232 | naughty_list, | ||
| 225 | } | 233 | } |
| 226 | } | 234 | } |
| 227 | 235 | ||
| @@ -231,6 +239,7 @@ impl RelayHealthTracker { | |||
| 231 | health: DashMap::new(), | 239 | health: DashMap::new(), |
| 232 | max_backoff_secs: DEFAULT_MAX_BACKOFF_SECS, | 240 | max_backoff_secs: DEFAULT_MAX_BACKOFF_SECS, |
| 233 | base_backoff_secs: DEFAULT_BASE_BACKOFF_SECS, | 241 | base_backoff_secs: DEFAULT_BASE_BACKOFF_SECS, |
| 242 | naughty_list: Some(Arc::new(NaughtyListTracker::with_defaults())), | ||
| 234 | } | 243 | } |
| 235 | } | 244 | } |
| 236 | 245 | ||
| @@ -240,6 +249,7 @@ impl RelayHealthTracker { | |||
| 240 | health: DashMap::new(), | 249 | health: DashMap::new(), |
| 241 | max_backoff_secs, | 250 | max_backoff_secs, |
| 242 | base_backoff_secs: DEFAULT_BASE_BACKOFF_SECS, | 251 | base_backoff_secs: DEFAULT_BASE_BACKOFF_SECS, |
| 252 | naughty_list: Some(Arc::new(NaughtyListTracker::with_defaults())), | ||
| 243 | } | 253 | } |
| 244 | } | 254 | } |
| 245 | 255 | ||
| @@ -549,6 +559,11 @@ impl RelayHealthTracker { | |||
| 549 | .get(relay_url) | 559 | .get(relay_url) |
| 550 | .map(|entry| entry.value().clone()) | 560 | .map(|entry| entry.value().clone()) |
| 551 | } | 561 | } |
| 562 | |||
| 563 | /// Get a reference to the naughty list tracker | ||
| 564 | pub fn naughty_list(&self) -> Option<Arc<NaughtyListTracker>> { | ||
| 565 | self.naughty_list.clone() | ||
| 566 | } | ||
| 552 | } | 567 | } |
| 553 | 568 | ||
| 554 | /// Create a shared RelayHealthTracker wrapped in Arc | 569 | /// Create a shared RelayHealthTracker wrapped in Arc |
diff --git a/src/sync/metrics.rs b/src/sync/metrics.rs index 13211b9..8a05f57 100644 --- a/src/sync/metrics.rs +++ b/src/sync/metrics.rs | |||
| @@ -56,6 +56,12 @@ pub struct SyncMetrics { | |||
| 56 | rejected_cold_index_expired_total: IntCounterVec, | 56 | rejected_cold_index_expired_total: IntCounterVec, |
| 57 | /// Total invalidations (by event_type: announcement, state) | 57 | /// Total invalidations (by event_type: announcement, state) |
| 58 | rejected_invalidated_total: IntCounterVec, | 58 | rejected_invalidated_total: IntCounterVec, |
| 59 | |||
| 60 | // === Naughty List Metrics === | ||
| 61 | /// Number of relays on naughty list by category | ||
| 62 | naughty_relays_total: IntGaugeVec, | ||
| 63 | /// Detailed info about naughty relays (relay, category, reason) | ||
| 64 | naughty_relay_info: IntGaugeVec, | ||
| 59 | } | 65 | } |
| 60 | 66 | ||
| 61 | impl SyncMetrics { | 67 | impl SyncMetrics { |
| @@ -193,6 +199,25 @@ impl SyncMetrics { | |||
| 193 | )?; | 199 | )?; |
| 194 | registry.register(Box::new(rejected_invalidated_total.clone()))?; | 200 | registry.register(Box::new(rejected_invalidated_total.clone()))?; |
| 195 | 201 | ||
| 202 | // Naughty list metrics | ||
| 203 | let naughty_relays_total = IntGaugeVec::new( | ||
| 204 | Opts::new( | ||
| 205 | "ngit_sync_naughty_relays_total", | ||
| 206 | "Number of relays on naughty list by category", | ||
| 207 | ), | ||
| 208 | &["category"], | ||
| 209 | )?; | ||
| 210 | registry.register(Box::new(naughty_relays_total.clone()))?; | ||
| 211 | |||
| 212 | let naughty_relay_info = IntGaugeVec::new( | ||
| 213 | Opts::new( | ||
| 214 | "ngit_sync_naughty_relay_info", | ||
| 215 | "Detailed info about naughty relays (occurrence count)", | ||
| 216 | ), | ||
| 217 | &["relay", "category", "reason"], | ||
| 218 | )?; | ||
| 219 | registry.register(Box::new(naughty_relay_info.clone()))?; | ||
| 220 | |||
| 196 | Ok(Self { | 221 | Ok(Self { |
| 197 | relay_connected, | 222 | relay_connected, |
| 198 | connection_attempts_total, | 223 | connection_attempts_total, |
| @@ -209,6 +234,8 @@ impl SyncMetrics { | |||
| 209 | rejected_cold_index_current, | 234 | rejected_cold_index_current, |
| 210 | rejected_cold_index_expired_total, | 235 | rejected_cold_index_expired_total, |
| 211 | rejected_invalidated_total, | 236 | rejected_invalidated_total, |
| 237 | naughty_relays_total, | ||
| 238 | naughty_relay_info, | ||
| 212 | }) | 239 | }) |
| 213 | } | 240 | } |
| 214 | 241 | ||
| @@ -465,6 +492,56 @@ impl SyncMetrics { | |||
| 465 | .with_label_values(&[event_type]) | 492 | .with_label_values(&[event_type]) |
| 466 | .inc_by(count as u64); | 493 | .inc_by(count as u64); |
| 467 | } | 494 | } |
| 495 | |||
| 496 | // === Naughty List Recording Methods === | ||
| 497 | |||
| 498 | /// Update naughty list metrics from current naughty list state | ||
| 499 | /// | ||
| 500 | /// This method resets and rebuilds all naughty list metrics based on | ||
| 501 | /// the provided entries. Should be called periodically to keep metrics | ||
| 502 | /// in sync with the naughty list state. | ||
| 503 | /// | ||
| 504 | /// # Arguments | ||
| 505 | /// | ||
| 506 | /// * `entries` - Vector of (relay_url, naughty_entry) tuples from NaughtyListTracker::get_all() | ||
| 507 | pub fn update_naughty_list(&self, entries: Vec<(String, super::naughty_list::NaughtyEntry)>) { | ||
| 508 | use super::naughty_list::NaughtyCategory; | ||
| 509 | |||
| 510 | // Reset all naughty list metrics | ||
| 511 | self.naughty_relays_total.reset(); | ||
| 512 | self.naughty_relay_info.reset(); | ||
| 513 | |||
| 514 | // Count by category | ||
| 515 | let mut dns_count = 0; | ||
| 516 | let mut tls_count = 0; | ||
| 517 | let mut protocol_count = 0; | ||
| 518 | |||
| 519 | // Update metrics for each naughty relay | ||
| 520 | for (url, entry) in entries { | ||
| 521 | // Update category counts | ||
| 522 | match entry.category { | ||
| 523 | NaughtyCategory::DnsLookupFailed => dns_count += 1, | ||
| 524 | NaughtyCategory::TlsCertificateInvalid => tls_count += 1, | ||
| 525 | NaughtyCategory::ProtocolError => protocol_count += 1, | ||
| 526 | } | ||
| 527 | |||
| 528 | // Update detailed info (occurrence count) | ||
| 529 | self.naughty_relay_info | ||
| 530 | .with_label_values(&[&url, entry.category.as_str(), &entry.reason]) | ||
| 531 | .set(entry.occurrence_count as i64); | ||
| 532 | } | ||
| 533 | |||
| 534 | // Set category totals | ||
| 535 | self.naughty_relays_total | ||
| 536 | .with_label_values(&["dns_lookup_failed"]) | ||
| 537 | .set(dns_count); | ||
| 538 | self.naughty_relays_total | ||
| 539 | .with_label_values(&["tls_certificate_invalid"]) | ||
| 540 | .set(tls_count); | ||
| 541 | self.naughty_relays_total | ||
| 542 | .with_label_values(&["protocol_error"]) | ||
| 543 | .set(protocol_count); | ||
| 544 | } | ||
| 468 | } | 545 | } |
| 469 | 546 | ||
| 470 | #[cfg(test)] | 547 | #[cfg(test)] |
diff --git a/src/sync/mod.rs b/src/sync/mod.rs index 412cd16..8b51fac 100644 --- a/src/sync/mod.rs +++ b/src/sync/mod.rs | |||
| @@ -16,6 +16,7 @@ pub mod algorithms; | |||
| 16 | pub mod filters; | 16 | pub mod filters; |
| 17 | pub mod health; | 17 | pub mod health; |
| 18 | pub mod metrics; | 18 | pub mod metrics; |
| 19 | pub mod naughty_list; | ||
| 19 | pub mod rejected_index; | 20 | pub mod rejected_index; |
| 20 | pub mod relay_connection; | 21 | pub mod relay_connection; |
| 21 | pub mod self_subscriber; | 22 | pub mod self_subscriber; |
| @@ -483,7 +484,18 @@ async fn run_health_and_metrics_checker( | |||
| 483 | // 2. Check for rate limit recovery | 484 | // 2. Check for rate limit recovery |
| 484 | manager.check_rate_limit_recovery().await; | 485 | manager.check_rate_limit_recovery().await; |
| 485 | 486 | ||
| 486 | // 3. Update metrics with current health states | 487 | // 3. Check for naughty list expiration |
| 488 | if let Some(naughty_list) = manager.health_tracker.naughty_list() { | ||
| 489 | let recovered = naughty_list.expire_old_entries(); | ||
| 490 | for url in recovered { | ||
| 491 | tracing::info!( | ||
| 492 | relay = %url, | ||
| 493 | "Relay removed from naughty list after expiration, will retry" | ||
| 494 | ); | ||
| 495 | } | ||
| 496 | } | ||
| 497 | |||
| 498 | // 4. Update metrics with current health states and naughty list | ||
| 487 | if let Some(ref metrics) = manager.metrics { | 499 | if let Some(ref metrics) = manager.metrics { |
| 488 | // Get all tracked relay URLs | 500 | // Get all tracked relay URLs |
| 489 | let relay_urls: Vec<String> = { | 501 | let relay_urls: Vec<String> = { |
| @@ -496,6 +508,12 @@ async fn run_health_and_metrics_checker( | |||
| 496 | let state = manager.health_tracker.get_state(&relay_url); | 508 | let state = manager.health_tracker.get_state(&relay_url); |
| 497 | metrics.record_health_state(&relay_url, state); | 509 | metrics.record_health_state(&relay_url, state); |
| 498 | } | 510 | } |
| 511 | |||
| 512 | // Update naughty list metrics | ||
| 513 | if let Some(naughty_list) = manager.health_tracker.naughty_list() { | ||
| 514 | let entries = naughty_list.get_all(); | ||
| 515 | metrics.update_naughty_list(entries); | ||
| 516 | } | ||
| 499 | } | 517 | } |
| 500 | } | 518 | } |
| 501 | _ = shutdown_rx.recv() => { | 519 | _ = shutdown_rx.recv() => { |
| @@ -2018,7 +2036,38 @@ impl SyncManager { | |||
| 2018 | } | 2036 | } |
| 2019 | } | 2037 | } |
| 2020 | Err(e) => { | 2038 | Err(e) => { |
| 2021 | tracing::error!(relay = %relay_url, error = %e, "Connection failed"); | 2039 | // Classify error to determine if it's a naughty relay or transient issue |
| 2040 | let error_str = e.to_string(); | ||
| 2041 | |||
| 2042 | if let Some(category) = naughty_list::NaughtyListTracker::classify_error(&error_str) | ||
| 2043 | { | ||
| 2044 | // Persistent infrastructure issue - use naughty list | ||
| 2045 | if let Some(ref naughty_list) = self.health_tracker.naughty_list() { | ||
| 2046 | let is_new = naughty_list.record(relay_url, category, error_str.clone()); | ||
| 2047 | |||
| 2048 | if is_new { | ||
| 2049 | tracing::warn!( | ||
| 2050 | relay = %relay_url, | ||
| 2051 | category = ?category, | ||
| 2052 | error = %e, | ||
| 2053 | "Relay has persistent configuration issue, added to naughty list" | ||
| 2054 | ); | ||
| 2055 | } else { | ||
| 2056 | tracing::debug!( | ||
| 2057 | relay = %relay_url, | ||
| 2058 | category = ?category, | ||
| 2059 | "Naughty relay failure (already tracked)" | ||
| 2060 | ); | ||
| 2061 | } | ||
| 2062 | } | ||
| 2063 | } else { | ||
| 2064 | // Transient network issue - use existing backoff flow | ||
| 2065 | tracing::debug!( | ||
| 2066 | relay = %relay_url, | ||
| 2067 | error = %e, | ||
| 2068 | "Connection failed (transient issue, backoff active)" | ||
| 2069 | ); | ||
| 2070 | } | ||
| 2022 | 2071 | ||
| 2023 | // 4. Update state back to Disconnected on failure | 2072 | // 4. Update state back to Disconnected on failure |
| 2024 | { | 2073 | { |
diff --git a/src/sync/naughty_list.rs b/src/sync/naughty_list.rs new file mode 100644 index 0000000..311b9bb --- /dev/null +++ b/src/sync/naughty_list.rs | |||
| @@ -0,0 +1,546 @@ | |||
| 1 | //! Naughty List Tracker for Relays with Persistent Infrastructure Issues | ||
| 2 | //! | ||
| 3 | //! This module tracks relays with persistent configuration/infrastructure problems | ||
| 4 | //! (DNS failures, TLS certificate errors, protocol violations) separately from | ||
| 5 | //! transient network issues (timeouts, connection refused). | ||
| 6 | //! | ||
| 7 | //! ## Failure Classification | ||
| 8 | //! | ||
| 9 | //! **Naughty List (12-hour expiration, log WARN on first occurrence, DEBUG on repeat):** | ||
| 10 | //! - `DnsLookupFailed`: Domain doesn't resolve or DNS errors | ||
| 11 | //! - `TlsCertificateInvalid`: Certificate errors (expired, mismatch, self-signed) | ||
| 12 | //! - `ProtocolError`: WebSocket/Nostr protocol violations | ||
| 13 | //! | ||
| 14 | //! **NOT Naughty (use existing HealthTracker backoff):** | ||
| 15 | //! - Connection timeouts (could be network congestion) | ||
| 16 | //! - Connection refused (could be temporary maintenance) | ||
| 17 | //! | ||
| 18 | //! ## Automatic Expiration | ||
| 19 | //! | ||
| 20 | //! Entries expire after 12 hours (configurable) to allow relays to recover from | ||
| 21 | //! infrastructure issues. After expiration, the relay is automatically retried. | ||
| 22 | |||
| 23 | use dashmap::DashMap; | ||
| 24 | use std::time::Instant; | ||
| 25 | |||
| 26 | /// Category of persistent relay failure that qualifies for the naughty list | ||
| 27 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] | ||
| 28 | pub enum NaughtyCategory { | ||
| 29 | /// DNS lookup failures (domain doesn't resolve) | ||
| 30 | DnsLookupFailed, | ||
| 31 | /// TLS certificate errors (expired, invalid, mismatch) | ||
| 32 | TlsCertificateInvalid, | ||
| 33 | /// WebSocket or Nostr protocol violations | ||
| 34 | ProtocolError, | ||
| 35 | } | ||
| 36 | |||
| 37 | impl NaughtyCategory { | ||
| 38 | /// Get string representation for metrics labels | ||
| 39 | pub fn as_str(&self) -> &'static str { | ||
| 40 | match self { | ||
| 41 | NaughtyCategory::DnsLookupFailed => "dns_lookup_failed", | ||
| 42 | NaughtyCategory::TlsCertificateInvalid => "tls_certificate_invalid", | ||
| 43 | NaughtyCategory::ProtocolError => "protocol_error", | ||
| 44 | } | ||
| 45 | } | ||
| 46 | } | ||
| 47 | |||
| 48 | impl std::fmt::Display for NaughtyCategory { | ||
| 49 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||
| 50 | write!(f, "{}", self.as_str()) | ||
| 51 | } | ||
| 52 | } | ||
| 53 | |||
| 54 | /// Naughty list entry for a relay with persistent issues | ||
| 55 | #[derive(Debug, Clone)] | ||
| 56 | pub struct NaughtyEntry { | ||
| 57 | /// Category of the persistent failure | ||
| 58 | pub category: NaughtyCategory, | ||
| 59 | /// Full error message | ||
| 60 | pub reason: String, | ||
| 61 | /// When this relay was first added to the naughty list | ||
| 62 | pub first_seen: Instant, | ||
| 63 | /// Most recent occurrence of the issue | ||
| 64 | pub last_seen: Instant, | ||
| 65 | /// Number of times we've seen this issue | ||
| 66 | pub occurrence_count: u32, | ||
| 67 | } | ||
| 68 | |||
| 69 | /// Tracks relays with persistent infrastructure/configuration issues | ||
| 70 | /// | ||
| 71 | /// Separate from HealthTracker's backoff logic - this is specifically for | ||
| 72 | /// relays with configuration problems that are unlikely to be fixed quickly. | ||
| 73 | #[derive(Debug)] | ||
| 74 | pub struct NaughtyListTracker { | ||
| 75 | /// Map of relay URL to naughty entry | ||
| 76 | entries: DashMap<String, NaughtyEntry>, | ||
| 77 | /// How many hours before removing a relay from the naughty list | ||
| 78 | expiration_hours: u64, | ||
| 79 | } | ||
| 80 | |||
| 81 | impl NaughtyListTracker { | ||
| 82 | /// Create a new NaughtyListTracker with the specified expiration time | ||
| 83 | /// | ||
| 84 | /// # Arguments | ||
| 85 | /// | ||
| 86 | /// * `expiration_hours` - Hours before a naughty entry expires (default: 12) | ||
| 87 | pub fn new(expiration_hours: u64) -> Self { | ||
| 88 | Self { | ||
| 89 | entries: DashMap::new(), | ||
| 90 | expiration_hours, | ||
| 91 | } | ||
| 92 | } | ||
| 93 | |||
| 94 | /// Create a new NaughtyListTracker with default 12-hour expiration | ||
| 95 | pub fn with_defaults() -> Self { | ||
| 96 | Self::new(12) | ||
| 97 | } | ||
| 98 | |||
| 99 | /// Classify an error string into a naughty category or return None for transient errors | ||
| 100 | /// | ||
| 101 | /// # Arguments | ||
| 102 | /// | ||
| 103 | /// * `error` - The error message string to classify | ||
| 104 | /// | ||
| 105 | /// # Returns | ||
| 106 | /// | ||
| 107 | /// - `Some(NaughtyCategory)` if the error indicates a persistent infrastructure issue | ||
| 108 | /// - `None` if the error is a transient network issue (use HealthTracker backoff) | ||
| 109 | pub fn classify_error(error: &str) -> Option<NaughtyCategory> { | ||
| 110 | let error_lower = error.to_lowercase(); | ||
| 111 | |||
| 112 | // DNS lookup failures | ||
| 113 | if error_lower.contains("failed to lookup address") | ||
| 114 | || error_lower.contains("name or service not known") | ||
| 115 | || error_lower.contains("nodename nor servname provided") | ||
| 116 | || (error_lower.contains("dns") && !error_lower.contains("timeout")) | ||
| 117 | { | ||
| 118 | return Some(NaughtyCategory::DnsLookupFailed); | ||
| 119 | } | ||
| 120 | |||
| 121 | // TLS certificate errors | ||
| 122 | if error_lower.contains("certificate") | ||
| 123 | || error_lower.contains("ssl") | ||
| 124 | || error_lower.contains("tls") | ||
| 125 | { | ||
| 126 | // Exclude timeout errors that mention TLS | ||
| 127 | if !error_lower.contains("timeout") && !error_lower.contains("timed out") { | ||
| 128 | return Some(NaughtyCategory::TlsCertificateInvalid); | ||
| 129 | } | ||
| 130 | } | ||
| 131 | |||
| 132 | // Protocol errors | ||
| 133 | if error_lower.contains("websocket") | ||
| 134 | || error_lower.contains("protocol") | ||
| 135 | || error_lower.contains("invalid frame") | ||
| 136 | { | ||
| 137 | // Exclude connection errors | ||
| 138 | if !error_lower.contains("connection") | ||
| 139 | && !error_lower.contains("timeout") | ||
| 140 | && !error_lower.contains("refused") | ||
| 141 | { | ||
| 142 | return Some(NaughtyCategory::ProtocolError); | ||
| 143 | } | ||
| 144 | } | ||
| 145 | |||
| 146 | // Everything else is transient (timeouts, refused, etc.) | ||
| 147 | None | ||
| 148 | } | ||
| 149 | |||
| 150 | /// Record a naughty relay (adds new entry or updates existing) | ||
| 151 | /// | ||
| 152 | /// # Arguments | ||
| 153 | /// | ||
| 154 | /// * `relay_url` - The relay URL | ||
| 155 | /// * `category` - The naughty category | ||
| 156 | /// * `reason` - The full error message | ||
| 157 | /// | ||
| 158 | /// # Returns | ||
| 159 | /// | ||
| 160 | /// `true` if this is a new naughty entry (first occurrence), `false` if updating existing | ||
| 161 | pub fn record(&self, relay_url: &str, category: NaughtyCategory, reason: String) -> bool { | ||
| 162 | let now = Instant::now(); | ||
| 163 | |||
| 164 | if let Some(mut entry) = self.entries.get_mut(relay_url) { | ||
| 165 | // Update existing entry | ||
| 166 | entry.last_seen = now; | ||
| 167 | entry.occurrence_count = entry.occurrence_count.saturating_add(1); | ||
| 168 | entry.reason = reason; // Update with latest error message | ||
| 169 | false | ||
| 170 | } else { | ||
| 171 | // Create new entry | ||
| 172 | self.entries.insert( | ||
| 173 | relay_url.to_string(), | ||
| 174 | NaughtyEntry { | ||
| 175 | category, | ||
| 176 | reason, | ||
| 177 | first_seen: now, | ||
| 178 | last_seen: now, | ||
| 179 | occurrence_count: 1, | ||
| 180 | }, | ||
| 181 | ); | ||
| 182 | true | ||
| 183 | } | ||
| 184 | } | ||
| 185 | |||
| 186 | /// Check if a relay is on the naughty list (not expired) | ||
| 187 | /// | ||
| 188 | /// # Arguments | ||
| 189 | /// | ||
| 190 | /// * `relay_url` - The relay URL to check | ||
| 191 | /// | ||
| 192 | /// # Returns | ||
| 193 | /// | ||
| 194 | /// `true` if the relay is currently on the naughty list | ||
| 195 | pub fn is_naughty(&self, relay_url: &str) -> bool { | ||
| 196 | if let Some(entry) = self.entries.get(relay_url) { | ||
| 197 | let age = Instant::now().duration_since(entry.first_seen); | ||
| 198 | let expiration = std::time::Duration::from_secs(self.expiration_hours * 3600); | ||
| 199 | age < expiration | ||
| 200 | } else { | ||
| 201 | false | ||
| 202 | } | ||
| 203 | } | ||
| 204 | |||
| 205 | /// Get a naughty entry if it exists (expiration is not checked) | ||
| 206 | /// | ||
| 207 | /// # Arguments | ||
| 208 | /// | ||
| 209 | /// * `relay_url` - The relay URL to look up | ||
| 210 | /// | ||
| 211 | /// # Returns | ||
| 212 | /// | ||
| 213 | /// A cloned `NaughtyEntry` if the relay has an entry, whether or not it has expired | ||
| 214 | pub fn get_entry(&self, relay_url: &str) -> Option<NaughtyEntry> { | ||
| 215 | self.entries.get(relay_url).map(|e| e.clone()) | ||
| 216 | } | ||
| 217 | |||
| 218 | /// Remove expired entries from the naughty list | ||
| 219 | /// | ||
| 220 | /// Entries older than `expiration_hours` are removed to allow relays | ||
| 221 | /// to be retried after infrastructure issues are potentially fixed. | ||
| 222 | /// | ||
| 223 | /// # Returns | ||
| 224 | /// | ||
| 225 | /// Vector of relay URLs that were removed from the naughty list | ||
| 226 | pub fn expire_old_entries(&self) -> Vec<String> { | ||
| 227 | let now = Instant::now(); | ||
| 228 | let expiration = std::time::Duration::from_secs(self.expiration_hours * 3600); | ||
| 229 | let mut expired = Vec::new(); | ||
| 230 | |||
| 231 | // Collect expired relay URLs | ||
| 232 | self.entries.retain(|url, entry| { | ||
| 233 | let age = now.duration_since(entry.first_seen); | ||
| 234 | if age >= expiration { | ||
| 235 | expired.push(url.clone()); | ||
| 236 | false // Remove this entry | ||
| 237 | } else { | ||
| 238 | true // Keep this entry | ||
| 239 | } | ||
| 240 | }); | ||
| 241 | |||
| 242 | expired | ||
| 243 | } | ||
| 244 | |||
| 245 | /// Get all naughty relays (for metrics and monitoring) | ||
| 246 | /// | ||
| 247 | /// # Returns | ||
| 248 | /// | ||
| 249 | /// Vector of (relay_url, entry) tuples for all relays currently on the naughty list | ||
| 250 | pub fn get_all(&self) -> Vec<(String, NaughtyEntry)> { | ||
| 251 | self.entries | ||
| 252 | .iter() | ||
| 253 | .map(|entry| (entry.key().clone(), entry.value().clone())) | ||
| 254 | .collect() | ||
| 255 | } | ||
| 256 | |||
| 257 | /// Get the count of relays in a specific category | ||
| 258 | /// | ||
| 259 | /// # Arguments | ||
| 260 | /// | ||
| 261 | /// * `category` - The category to count | ||
| 262 | /// | ||
| 263 | /// # Returns | ||
| 264 | /// | ||
| 265 | /// Number of relays in the specified category | ||
| 266 | pub fn count_by_category(&self, category: NaughtyCategory) -> usize { | ||
| 267 | self.entries | ||
| 268 | .iter() | ||
| 269 | .filter(|entry| entry.value().category == category) | ||
| 270 | .count() | ||
| 271 | } | ||
| 272 | |||
| 273 | /// Get total number of relays on the naughty list | ||
| 274 | pub fn total_count(&self) -> usize { | ||
| 275 | self.entries.len() | ||
| 276 | } | ||
| 277 | } | ||
| 278 | |||
| 279 | #[cfg(test)] | ||
| 280 | mod tests { | ||
| 281 | use super::*; | ||
| 282 | |||
| 283 | #[test] | ||
| 284 | fn test_classify_dns_errors() { | ||
| 285 | assert_eq!( | ||
| 286 | NaughtyListTracker::classify_error("failed to lookup address information"), | ||
| 287 | Some(NaughtyCategory::DnsLookupFailed) | ||
| 288 | ); | ||
| 289 | assert_eq!( | ||
| 290 | NaughtyListTracker::classify_error("Name or service not known"), | ||
| 291 | Some(NaughtyCategory::DnsLookupFailed) | ||
| 292 | ); | ||
| 293 | assert_eq!( | ||
| 294 | NaughtyListTracker::classify_error("nodename nor servname provided"), | ||
| 295 | Some(NaughtyCategory::DnsLookupFailed) | ||
| 296 | ); | ||
| 297 | assert_eq!( | ||
| 298 | NaughtyListTracker::classify_error("dns error: NXDOMAIN"), | ||
| 299 | Some(NaughtyCategory::DnsLookupFailed) | ||
| 300 | ); | ||
| 301 | } | ||
| 302 | |||
| 303 | #[test] | ||
| 304 | fn test_classify_tls_errors() { | ||
| 305 | assert_eq!( | ||
| 306 | NaughtyListTracker::classify_error("certificate not valid for 'example.com'"), | ||
| 307 | Some(NaughtyCategory::TlsCertificateInvalid) | ||
| 308 | ); | ||
| 309 | assert_eq!( | ||
| 310 | NaughtyListTracker::classify_error("SSL certificate problem"), | ||
| 311 | Some(NaughtyCategory::TlsCertificateInvalid) | ||
| 312 | ); | ||
| 313 | assert_eq!( | ||
| 314 | NaughtyListTracker::classify_error("TLS handshake failed"), | ||
| 315 | Some(NaughtyCategory::TlsCertificateInvalid) | ||
| 316 | ); | ||
| 317 | |||
| 318 | // TLS timeout should NOT be classified as naughty | ||
| 319 | assert_eq!( | ||
| 320 | NaughtyListTracker::classify_error("TLS connection timed out"), | ||
| 321 | None | ||
| 322 | ); | ||
| 323 | } | ||
| 324 | |||
| 325 | #[test] | ||
| 326 | fn test_classify_protocol_errors() { | ||
| 327 | assert_eq!( | ||
| 328 | NaughtyListTracker::classify_error("websocket protocol error"), | ||
| 329 | Some(NaughtyCategory::ProtocolError) | ||
| 330 | ); | ||
| 331 | assert_eq!( | ||
| 332 | NaughtyListTracker::classify_error("invalid frame header"), | ||
| 333 | Some(NaughtyCategory::ProtocolError) | ||
| 334 | ); | ||
| 335 | |||
| 336 | // WebSocket connection errors should NOT be classified as naughty | ||
| 337 | assert_eq!( | ||
| 338 | NaughtyListTracker::classify_error("websocket connection refused"), | ||
| 339 | None | ||
| 340 | ); | ||
| 341 | } | ||
| 342 | |||
| 343 | #[test] | ||
| 344 | fn test_classify_transient_errors() { | ||
| 345 | // Timeouts are transient | ||
| 346 | assert_eq!( | ||
| 347 | NaughtyListTracker::classify_error("connection timed out"), | ||
| 348 | None | ||
| 349 | ); | ||
| 350 | assert_eq!( | ||
| 351 | NaughtyListTracker::classify_error("operation timed out"), | ||
| 352 | None | ||
| 353 | ); | ||
| 354 | |||
| 355 | // Connection refused is transient | ||
| 356 | assert_eq!( | ||
| 357 | NaughtyListTracker::classify_error("connection refused"), | ||
| 358 | None | ||
| 359 | ); | ||
| 360 | |||
| 361 | // Generic network errors are transient | ||
| 362 | assert_eq!( | ||
| 363 | NaughtyListTracker::classify_error("network unreachable"), | ||
| 364 | None | ||
| 365 | ); | ||
| 366 | } | ||
| 367 | |||
| 368 | #[test] | ||
| 369 | fn test_record_new_entry() { | ||
| 370 | let tracker = NaughtyListTracker::with_defaults(); | ||
| 371 | let url = "wss://bad-relay.example.com"; | ||
| 372 | |||
| 373 | let is_new = tracker.record( | ||
| 374 | url, | ||
| 375 | NaughtyCategory::DnsLookupFailed, | ||
| 376 | "failed to lookup address".to_string(), | ||
| 377 | ); | ||
| 378 | |||
| 379 | assert!(is_new); | ||
| 380 | assert!(tracker.is_naughty(url)); | ||
| 381 | |||
| 382 | let entry = tracker.get_entry(url).unwrap(); | ||
| 383 | assert_eq!(entry.category, NaughtyCategory::DnsLookupFailed); | ||
| 384 | assert_eq!(entry.occurrence_count, 1); | ||
| 385 | } | ||
| 386 | |||
| 387 | #[test] | ||
| 388 | fn test_record_updates_existing() { | ||
| 389 | let tracker = NaughtyListTracker::with_defaults(); | ||
| 390 | let url = "wss://bad-relay.example.com"; | ||
| 391 | |||
| 392 | // First occurrence | ||
| 393 | let is_new1 = tracker.record(url, NaughtyCategory::DnsLookupFailed, "error 1".to_string()); | ||
| 394 | assert!(is_new1); | ||
| 395 | |||
| 396 | // Second occurrence | ||
| 397 | let is_new2 = tracker.record(url, NaughtyCategory::DnsLookupFailed, "error 2".to_string()); | ||
| 398 | assert!(!is_new2); | ||
| 399 | |||
| 400 | let entry = tracker.get_entry(url).unwrap(); | ||
| 401 | assert_eq!(entry.occurrence_count, 2); | ||
| 402 | assert_eq!(entry.reason, "error 2"); // Updated to latest | ||
| 403 | } | ||
| 404 | |||
| 405 | #[test] | ||
| 406 | fn test_is_naughty() { /* Checks that is_naughty is false for a URL that was never recorded and true once a failure has been recorded for it. */ | ||
| 407 | let tracker = NaughtyListTracker::with_defaults(); | ||
| 408 | let url = "wss://bad-relay.example.com"; | ||
| 409 | |||
| 410 | assert!(!tracker.is_naughty(url)); /* nothing has been recorded yet */ | ||
| 411 | |||
| 412 | tracker.record( | ||
| 413 | url, | ||
| 414 | NaughtyCategory::TlsCertificateInvalid, | ||
| 415 | "cert error".to_string(), | ||
| 416 | ); | ||
| 417 | |||
| 418 | assert!(tracker.is_naughty(url)); | ||
| 419 | } | ||
| 420 | |||
| 421 | #[test] | ||
| 422 | fn test_get_all() { /* Records two distinct URLs and checks that get_all returns a collection of length 2. */ | ||
| 423 | let tracker = NaughtyListTracker::with_defaults(); | ||
| 424 | |||
| 425 | tracker.record( | ||
| 426 | "wss://relay1.example.com", | ||
| 427 | NaughtyCategory::DnsLookupFailed, | ||
| 428 | "dns error".to_string(), | ||
| 429 | ); | ||
| 430 | tracker.record( | ||
| 431 | "wss://relay2.example.com", | ||
| 432 | NaughtyCategory::TlsCertificateInvalid, | ||
| 433 | "tls error".to_string(), | ||
| 434 | ); | ||
| 435 | |||
| 436 | let all = tracker.get_all(); | ||
| 437 | assert_eq!(all.len(), 2); /* only the length is checked, not the contents or order */ | ||
| 438 | } | ||
| 439 | |||
| 440 | #[test] | ||
| 441 | fn test_count_by_category() { /* Records two DnsLookupFailed relays and one TlsCertificateInvalid relay, then checks the per-category counts, including 0 for a category that was never recorded. */ | ||
| 442 | let tracker = NaughtyListTracker::with_defaults(); | ||
| 443 | |||
| 444 | tracker.record( | ||
| 445 | "wss://relay1.example.com", | ||
| 446 | NaughtyCategory::DnsLookupFailed, | ||
| 447 | "error".to_string(), | ||
| 448 | ); | ||
| 449 | tracker.record( | ||
| 450 | "wss://relay2.example.com", | ||
| 451 | NaughtyCategory::DnsLookupFailed, | ||
| 452 | "error".to_string(), | ||
| 453 | ); | ||
| 454 | tracker.record( | ||
| 455 | "wss://relay3.example.com", | ||
| 456 | NaughtyCategory::TlsCertificateInvalid, | ||
| 457 | "error".to_string(), | ||
| 458 | ); | ||
| 459 | |||
| 460 | assert_eq!( | ||
| 461 | tracker.count_by_category(NaughtyCategory::DnsLookupFailed), | ||
| 462 | 2 | ||
| 463 | ); | ||
| 464 | assert_eq!( | ||
| 465 | tracker.count_by_category(NaughtyCategory::TlsCertificateInvalid), | ||
| 466 | 1 | ||
| 467 | ); | ||
| 468 | assert_eq!(tracker.count_by_category(NaughtyCategory::ProtocolError), 0); /* no ProtocolError entry was recorded above */ | ||
| 469 | } | ||
| 470 | |||
| 471 | #[test] | ||
| 472 | fn test_total_count() { /* Checks that total_count starts at 0 on a fresh tracker and grows by one for each newly recorded URL. */ | ||
| 473 | let tracker = NaughtyListTracker::with_defaults(); | ||
| 474 | assert_eq!(tracker.total_count(), 0); | ||
| 475 | |||
| 476 | tracker.record( | ||
| 477 | "wss://relay1.example.com", | ||
| 478 | NaughtyCategory::DnsLookupFailed, | ||
| 479 | "error".to_string(), | ||
| 480 | ); | ||
| 481 | assert_eq!(tracker.total_count(), 1); | ||
| 482 | |||
| 483 | tracker.record( | ||
| 484 | "wss://relay2.example.com", | ||
| 485 | NaughtyCategory::TlsCertificateInvalid, | ||
| 486 | "error".to_string(), | ||
| 487 | ); | ||
| 488 | assert_eq!(tracker.total_count(), 2); /* the second URL differs from the first, so it is counted separately */ | ||
| 489 | } | ||
| 490 | |||
| 491 | #[test] | ||
| 492 | fn test_expire_old_entries() { /* Builds a tracker with a 0 expiration argument, records one relay, and checks that expire_old_entries returns that URL and leaves the tracker empty. */ | ||
| 493 | // Use a zero-length expiration window so the recorded entry is treated as expired right away | ||
| 494 | let tracker = NaughtyListTracker::new(0); // Expire immediately (0 hours) | ||
| 495 | |||
| 496 | tracker.record( | ||
| 497 | "wss://relay1.example.com", | ||
| 498 | NaughtyCategory::DnsLookupFailed, | ||
| 499 | "error".to_string(), | ||
| 500 | ); | ||
| 501 | |||
| 502 | // The entry is still stored at this point: total_count counts it even though it is expired | ||
| 503 | assert_eq!(tracker.total_count(), 1); | ||
| 504 | |||
| 505 | // is_naughty is expected to report false already, because the window is 0 hours. NOTE(review): this runs before the sleep below, so it presumably relies on a zero window counting as expired with no measurable elapsed time; confirm against the comparison used in is_naughty. | ||
| 506 | assert!(!tracker.is_naughty("wss://relay1.example.com")); | ||
| 507 | |||
| 508 | // Sleep briefly so that some time has definitely elapsed before the cleanup call | ||
| 509 | std::thread::sleep(std::time::Duration::from_millis(10)); | ||
| 510 | |||
| 511 | // The cleanup is expected to remove the expired entry and return its URL | ||
| 512 | let expired = tracker.expire_old_entries(); | ||
| 513 | assert_eq!(expired.len(), 1); | ||
| 514 | assert_eq!(expired[0], "wss://relay1.example.com"); | ||
| 515 | |||
| 516 | // After cleanup the entry is gone from both the lookup and the total count | ||
| 517 | assert!(!tracker.is_naughty("wss://relay1.example.com")); | ||
| 518 | assert_eq!(tracker.total_count(), 0); | ||
| 519 | } | ||
| 520 | |||
| 521 | #[test] | ||
| 522 | fn test_category_display() { /* Pins the exact snake_case text that to_string produces for each of the three categories used here. */ | ||
| 523 | assert_eq!( | ||
| 524 | NaughtyCategory::DnsLookupFailed.to_string(), | ||
| 525 | "dns_lookup_failed" | ||
| 526 | ); | ||
| 527 | assert_eq!( | ||
| 528 | NaughtyCategory::TlsCertificateInvalid.to_string(), | ||
| 529 | "tls_certificate_invalid" | ||
| 530 | ); | ||
| 531 | assert_eq!(NaughtyCategory::ProtocolError.to_string(), "protocol_error"); | ||
| 532 | } | ||
| 533 | |||
| 534 | #[test] | ||
| 535 | fn test_category_as_str() { /* Pins the exact snake_case text that as_str returns for each of the three categories used here; the expected strings are the same ones checked for to_string. */ | ||
| 536 | assert_eq!( | ||
| 537 | NaughtyCategory::DnsLookupFailed.as_str(), | ||
| 538 | "dns_lookup_failed" | ||
| 539 | ); | ||
| 540 | assert_eq!( | ||
| 541 | NaughtyCategory::TlsCertificateInvalid.as_str(), | ||
| 542 | "tls_certificate_invalid" | ||
| 543 | ); | ||
| 544 | assert_eq!(NaughtyCategory::ProtocolError.as_str(), "protocol_error"); | ||
| 545 | } | ||
| 546 | } | ||