diff options
| -rw-r--r-- | src/config.rs | 7 | ||||
| -rw-r--r-- | src/sync/health.rs | 15 | ||||
| -rw-r--r-- | src/sync/metrics.rs | 77 | ||||
| -rw-r--r-- | src/sync/mod.rs | 53 | ||||
| -rw-r--r-- | src/sync/naughty_list.rs | 546 |
5 files changed, 696 insertions, 2 deletions
diff --git a/src/config.rs b/src/config.rs index 44001d8..74327c9 100644 --- a/src/config.rs +++ b/src/config.rs | |||
| @@ -156,6 +156,12 @@ pub struct Config { | |||
| 156 | default_value_t = 604800 | 156 | default_value_t = 604800 |
| 157 | )] | 157 | )] |
| 158 | pub rejected_cold_index_expiry_secs: u64, | 158 | pub rejected_cold_index_expiry_secs: u64, |
| 159 | |||
| 160 | /// Hours before removing relay from naughty list (default: 12) | ||
| 161 | /// Relays with persistent infrastructure issues (DNS, TLS, protocol errors) are | ||
| 162 | /// tracked separately and retried after this expiration period. | ||
| 163 | #[arg(long, env = "NGIT_NAUGHTY_LIST_EXPIRATION_HOURS", default_value_t = 12)] | ||
| 164 | pub naughty_list_expiration_hours: u64, | ||
| 159 | } | 165 | } |
| 160 | 166 | ||
| 161 | impl Config { | 167 | impl Config { |
| @@ -281,6 +287,7 @@ impl Config { | |||
| 281 | sync_disable_negentropy: false, | 287 | sync_disable_negentropy: false, |
| 282 | rejected_hot_cache_duration_secs: 120, | 288 | rejected_hot_cache_duration_secs: 120, |
| 283 | rejected_cold_index_expiry_secs: 604800, | 289 | rejected_cold_index_expiry_secs: 604800, |
| 290 | naughty_list_expiration_hours: 12, | ||
| 284 | } | 291 | } |
| 285 | } | 292 | } |
| 286 | } | 293 | } |
diff --git a/src/sync/health.rs b/src/sync/health.rs index 2948707..833918b 100644 --- a/src/sync/health.rs +++ b/src/sync/health.rs | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | //! - Exponential backoff with configurable max delay | 5 | //! - Exponential backoff with configurable max delay |
| 6 | //! - Dead relay detection after 24h of continuous failures | 6 | //! - Dead relay detection after 24h of continuous failures |
| 7 | //! - Rate limit detection and fixed cooldown period | 7 | //! - Rate limit detection and fixed cooldown period |
| 8 | //! - Naughty list for persistent infrastructure issues (DNS, TLS, protocol errors) | ||
| 8 | //! | 9 | //! |
| 9 | //! ## Health States | 10 | //! ## Health States |
| 10 | //! | 11 | //! |
| @@ -18,6 +19,7 @@ use std::time::{Duration, Instant}; | |||
| 18 | 19 | ||
| 19 | use dashmap::DashMap; | 20 | use dashmap::DashMap; |
| 20 | 21 | ||
| 22 | use super::naughty_list::NaughtyListTracker; | ||
| 21 | use crate::config::Config; | 23 | use crate::config::Config; |
| 22 | 24 | ||
| 23 | /// Duration threshold before a relay is considered dead (24 hours) | 25 | /// Duration threshold before a relay is considered dead (24 hours) |
| @@ -213,15 +215,21 @@ pub struct RelayHealthTracker { | |||
| 213 | health: DashMap<String, RelayHealth>, | 215 | health: DashMap<String, RelayHealth>, |
| 214 | max_backoff_secs: u64, | 216 | max_backoff_secs: u64, |
| 215 | base_backoff_secs: u64, | 217 | base_backoff_secs: u64, |
| 218 | naughty_list: Option<Arc<NaughtyListTracker>>, | ||
| 216 | } | 219 | } |
| 217 | 220 | ||
| 218 | impl RelayHealthTracker { | 221 | impl RelayHealthTracker { |
| 219 | /// Create a new RelayHealthTracker | 222 | /// Create a new RelayHealthTracker |
| 220 | pub fn new(config: &Config) -> Self { | 223 | pub fn new(config: &Config) -> Self { |
| 224 | let naughty_list = Some(Arc::new(NaughtyListTracker::new( | ||
| 225 | config.naughty_list_expiration_hours, | ||
| 226 | ))); | ||
| 227 | |||
| 221 | Self { | 228 | Self { |
| 222 | health: DashMap::new(), | 229 | health: DashMap::new(), |
| 223 | max_backoff_secs: config.sync_max_backoff_secs, | 230 | max_backoff_secs: config.sync_max_backoff_secs, |
| 224 | base_backoff_secs: config.sync_base_backoff_secs, | 231 | base_backoff_secs: config.sync_base_backoff_secs, |
| 232 | naughty_list, | ||
| 225 | } | 233 | } |
| 226 | } | 234 | } |
| 227 | 235 | ||
| @@ -231,6 +239,7 @@ impl RelayHealthTracker { | |||
| 231 | health: DashMap::new(), | 239 | health: DashMap::new(), |
| 232 | max_backoff_secs: DEFAULT_MAX_BACKOFF_SECS, | 240 | max_backoff_secs: DEFAULT_MAX_BACKOFF_SECS, |
| 233 | base_backoff_secs: DEFAULT_BASE_BACKOFF_SECS, | 241 | base_backoff_secs: DEFAULT_BASE_BACKOFF_SECS, |
| 242 | naughty_list: Some(Arc::new(NaughtyListTracker::with_defaults())), | ||
| 234 | } | 243 | } |
| 235 | } | 244 | } |
| 236 | 245 | ||
| @@ -240,6 +249,7 @@ impl RelayHealthTracker { | |||
| 240 | health: DashMap::new(), | 249 | health: DashMap::new(), |
| 241 | max_backoff_secs, | 250 | max_backoff_secs, |
| 242 | base_backoff_secs: DEFAULT_BASE_BACKOFF_SECS, | 251 | base_backoff_secs: DEFAULT_BASE_BACKOFF_SECS, |
| 252 | naughty_list: Some(Arc::new(NaughtyListTracker::with_defaults())), | ||
| 243 | } | 253 | } |
| 244 | } | 254 | } |
| 245 | 255 | ||
| @@ -549,6 +559,11 @@ impl RelayHealthTracker { | |||
| 549 | .get(relay_url) | 559 | .get(relay_url) |
| 550 | .map(|entry| entry.value().clone()) | 560 | .map(|entry| entry.value().clone()) |
| 551 | } | 561 | } |
| 562 | |||
| 563 | /// Get a reference to the naughty list tracker | ||
| 564 | pub fn naughty_list(&self) -> Option<Arc<NaughtyListTracker>> { | ||
| 565 | self.naughty_list.clone() | ||
| 566 | } | ||
| 552 | } | 567 | } |
| 553 | 568 | ||
| 554 | /// Create a shared RelayHealthTracker wrapped in Arc | 569 | /// Create a shared RelayHealthTracker wrapped in Arc |
diff --git a/src/sync/metrics.rs b/src/sync/metrics.rs index 13211b9..8a05f57 100644 --- a/src/sync/metrics.rs +++ b/src/sync/metrics.rs | |||
| @@ -56,6 +56,12 @@ pub struct SyncMetrics { | |||
| 56 | rejected_cold_index_expired_total: IntCounterVec, | 56 | rejected_cold_index_expired_total: IntCounterVec, |
| 57 | /// Total invalidations (by event_type: announcement, state) | 57 | /// Total invalidations (by event_type: announcement, state) |
| 58 | rejected_invalidated_total: IntCounterVec, | 58 | rejected_invalidated_total: IntCounterVec, |
| 59 | |||
| 60 | // === Naughty List Metrics === | ||
| 61 | /// Number of relays on naughty list by category | ||
| 62 | naughty_relays_total: IntGaugeVec, | ||
| 63 | /// Detailed info about naughty relays (relay, category, reason) | ||
| 64 | naughty_relay_info: IntGaugeVec, | ||
| 59 | } | 65 | } |
| 60 | 66 | ||
| 61 | impl SyncMetrics { | 67 | impl SyncMetrics { |
| @@ -193,6 +199,25 @@ impl SyncMetrics { | |||
| 193 | )?; | 199 | )?; |
| 194 | registry.register(Box::new(rejected_invalidated_total.clone()))?; | 200 | registry.register(Box::new(rejected_invalidated_total.clone()))?; |
| 195 | 201 | ||
| 202 | // Naughty list metrics | ||
| 203 | let naughty_relays_total = IntGaugeVec::new( | ||
| 204 | Opts::new( | ||
| 205 | "ngit_sync_naughty_relays_total", | ||
| 206 | "Number of relays on naughty list by category", | ||
| 207 | ), | ||
| 208 | &["category"], | ||
| 209 | )?; | ||
| 210 | registry.register(Box::new(naughty_relays_total.clone()))?; | ||
| 211 | |||
| 212 | let naughty_relay_info = IntGaugeVec::new( | ||
| 213 | Opts::new( | ||
| 214 | "ngit_sync_naughty_relay_info", | ||
| 215 | "Detailed info about naughty relays (occurrence count)", | ||
| 216 | ), | ||
| 217 | &["relay", "category", "reason"], | ||
| 218 | )?; | ||
| 219 | registry.register(Box::new(naughty_relay_info.clone()))?; | ||
| 220 | |||
| 196 | Ok(Self { | 221 | Ok(Self { |
| 197 | relay_connected, | 222 | relay_connected, |
| 198 | connection_attempts_total, | 223 | connection_attempts_total, |
| @@ -209,6 +234,8 @@ impl SyncMetrics { | |||
| 209 | rejected_cold_index_current, | 234 | rejected_cold_index_current, |
| 210 | rejected_cold_index_expired_total, | 235 | rejected_cold_index_expired_total, |
| 211 | rejected_invalidated_total, | 236 | rejected_invalidated_total, |
| 237 | naughty_relays_total, | ||
| 238 | naughty_relay_info, | ||
| 212 | }) | 239 | }) |
| 213 | } | 240 | } |
| 214 | 241 | ||
| @@ -465,6 +492,56 @@ impl SyncMetrics { | |||
| 465 | .with_label_values(&[event_type]) | 492 | .with_label_values(&[event_type]) |
| 466 | .inc_by(count as u64); | 493 | .inc_by(count as u64); |
| 467 | } | 494 | } |
| 495 | |||
| 496 | // === Naughty List Recording Methods === | ||
| 497 | |||
| 498 | /// Update naughty list metrics from current naughty list state | ||
| 499 | /// | ||
| 500 | /// This method resets and rebuilds all naughty list metrics based on | ||
| 501 | /// the provided entries. Should be called periodically to keep metrics | ||
| 502 | /// in sync with the naughty list state. | ||
| 503 | /// | ||
| 504 | /// # Arguments | ||
| 505 | /// | ||
| 506 | /// * `entries` - Vector of (relay_url, naughty_entry) tuples from NaughtyListTracker::get_all() | ||
| 507 | pub fn update_naughty_list(&self, entries: Vec<(String, super::naughty_list::NaughtyEntry)>) { | ||
| 508 | use super::naughty_list::NaughtyCategory; | ||
| 509 | |||
| 510 | // Reset all naughty list metrics | ||
| 511 | self.naughty_relays_total.reset(); | ||
| 512 | self.naughty_relay_info.reset(); | ||
| 513 | |||
| 514 | // Count by category | ||
| 515 | let mut dns_count = 0; | ||
| 516 | let mut tls_count = 0; | ||
| 517 | let mut protocol_count = 0; | ||
| 518 | |||
| 519 | // Update metrics for each naughty relay | ||
| 520 | for (url, entry) in entries { | ||
| 521 | // Update category counts | ||
| 522 | match entry.category { | ||
| 523 | NaughtyCategory::DnsLookupFailed => dns_count += 1, | ||
| 524 | NaughtyCategory::TlsCertificateInvalid => tls_count += 1, | ||
| 525 | NaughtyCategory::ProtocolError => protocol_count += 1, | ||
| 526 | } | ||
| 527 | |||
| 528 | // Update detailed info (occurrence count) | ||
| 529 | self.naughty_relay_info | ||
| 530 | .with_label_values(&[&url, entry.category.as_str(), &entry.reason]) | ||
| 531 | .set(entry.occurrence_count as i64); | ||
| 532 | } | ||
| 533 | |||
| 534 | // Set category totals | ||
| 535 | self.naughty_relays_total | ||
| 536 | .with_label_values(&["dns_lookup_failed"]) | ||
| 537 | .set(dns_count); | ||
| 538 | self.naughty_relays_total | ||
| 539 | .with_label_values(&["tls_certificate_invalid"]) | ||
| 540 | .set(tls_count); | ||
| 541 | self.naughty_relays_total | ||
| 542 | .with_label_values(&["protocol_error"]) | ||
| 543 | .set(protocol_count); | ||
| 544 | } | ||
| 468 | } | 545 | } |
| 469 | 546 | ||
| 470 | #[cfg(test)] | 547 | #[cfg(test)] |
diff --git a/src/sync/mod.rs b/src/sync/mod.rs index 412cd16..8b51fac 100644 --- a/src/sync/mod.rs +++ b/src/sync/mod.rs | |||
| @@ -16,6 +16,7 @@ pub mod algorithms; | |||
| 16 | pub mod filters; | 16 | pub mod filters; |
| 17 | pub mod health; | 17 | pub mod health; |
| 18 | pub mod metrics; | 18 | pub mod metrics; |
| 19 | pub mod naughty_list; | ||
| 19 | pub mod rejected_index; | 20 | pub mod rejected_index; |
| 20 | pub mod relay_connection; | 21 | pub mod relay_connection; |
| 21 | pub mod self_subscriber; | 22 | pub mod self_subscriber; |
| @@ -483,7 +484,18 @@ async fn run_health_and_metrics_checker( | |||
| 483 | // 2. Check for rate limit recovery | 484 | // 2. Check for rate limit recovery |
| 484 | manager.check_rate_limit_recovery().await; | 485 | manager.check_rate_limit_recovery().await; |
| 485 | 486 | ||
| 486 | // 3. Update metrics with current health states | 487 | // 3. Check for naughty list expiration |
| 488 | if let Some(naughty_list) = manager.health_tracker.naughty_list() { | ||
| 489 | let recovered = naughty_list.expire_old_entries(); | ||
| 490 | for url in recovered { | ||
| 491 | tracing::info!( | ||
| 492 | relay = %url, | ||
| 493 | "Relay removed from naughty list after expiration, will retry" | ||
| 494 | ); | ||
| 495 | } | ||
| 496 | } | ||
| 497 | |||
| 498 | // 4. Update metrics with current health states and naughty list | ||
| 487 | if let Some(ref metrics) = manager.metrics { | 499 | if let Some(ref metrics) = manager.metrics { |
| 488 | // Get all tracked relay URLs | 500 | // Get all tracked relay URLs |
| 489 | let relay_urls: Vec<String> = { | 501 | let relay_urls: Vec<String> = { |
| @@ -496,6 +508,12 @@ async fn run_health_and_metrics_checker( | |||
| 496 | let state = manager.health_tracker.get_state(&relay_url); | 508 | let state = manager.health_tracker.get_state(&relay_url); |
| 497 | metrics.record_health_state(&relay_url, state); | 509 | metrics.record_health_state(&relay_url, state); |
| 498 | } | 510 | } |
| 511 | |||
| 512 | // Update naughty list metrics | ||
| 513 | if let Some(naughty_list) = manager.health_tracker.naughty_list() { | ||
| 514 | let entries = naughty_list.get_all(); | ||
| 515 | metrics.update_naughty_list(entries); | ||
| 516 | } | ||
| 499 | } | 517 | } |
| 500 | } | 518 | } |
| 501 | _ = shutdown_rx.recv() => { | 519 | _ = shutdown_rx.recv() => { |
| @@ -2018,7 +2036,38 @@ impl SyncManager { | |||
| 2018 | } | 2036 | } |
| 2019 | } | 2037 | } |
| 2020 | Err(e) => { | 2038 | Err(e) => { |
| 2021 | tracing::error!(relay = %relay_url, error = %e, "Connection failed"); | 2039 | // Classify error to determine if it's a naughty relay or transient issue |
| 2040 | let error_str = e.to_string(); | ||
| 2041 | |||
| 2042 | if let Some(category) = naughty_list::NaughtyListTracker::classify_error(&error_str) | ||
| 2043 | { | ||
| 2044 | // Persistent infrastructure issue - use naughty list | ||
| 2045 | if let Some(ref naughty_list) = self.health_tracker.naughty_list() { | ||
| 2046 | let is_new = naughty_list.record(relay_url, category, error_str.clone()); | ||
| 2047 | |||
| 2048 | if is_new { | ||
| 2049 | tracing::warn!( | ||
| 2050 | relay = %relay_url, | ||
| 2051 | category = ?category, | ||
| 2052 | error = %e, | ||
| 2053 | "Relay has persistent configuration issue, added to naughty list" | ||
| 2054 | ); | ||
| 2055 | } else { | ||
| 2056 | tracing::debug!( | ||
| 2057 | relay = %relay_url, | ||
| 2058 | category = ?category, | ||
| 2059 | "Naughty relay failure (already tracked)" | ||
| 2060 | ); | ||
| 2061 | } | ||
| 2062 | } | ||
| 2063 | } else { | ||
| 2064 | // Transient network issue - use existing backoff flow | ||
| 2065 | tracing::debug!( | ||
| 2066 | relay = %relay_url, | ||
| 2067 | error = %e, | ||
| 2068 | "Connection failed (transient issue, backoff active)" | ||
| 2069 | ); | ||
| 2070 | } | ||
| 2022 | 2071 | ||
| 2023 | // 4. Update state back to Disconnected on failure | 2072 | // 4. Update state back to Disconnected on failure |
| 2024 | { | 2073 | { |
diff --git a/src/sync/naughty_list.rs b/src/sync/naughty_list.rs new file mode 100644 index 0000000..311b9bb --- /dev/null +++ b/src/sync/naughty_list.rs | |||
| @@ -0,0 +1,546 @@ | |||
| 1 | //! Naughty List Tracker for Relays with Persistent Infrastructure Issues | ||
| 2 | //! | ||
| 3 | //! This module tracks relays with persistent configuration/infrastructure problems | ||
| 4 | //! (DNS failures, TLS certificate errors, protocol violations) separately from | ||
| 5 | //! transient network issues (timeouts, connection refused). | ||
| 6 | //! | ||
| 7 | //! ## Failure Classification | ||
| 8 | //! | ||
| 9 | //! **Naughty List (12-hour expiration, log WARN on first occurrence, DEBUG on repeat):** | ||
| 10 | //! - `DnsLookupFailed`: Domain doesn't resolve or DNS errors | ||
| 11 | //! - `TlsCertificateInvalid`: Certificate errors (expired, mismatch, self-signed) | ||
| 12 | //! - `ProtocolError`: WebSocket/Nostr protocol violations | ||
| 13 | //! | ||
| 14 | //! **NOT Naughty (use existing HealthTracker backoff):** | ||
| 15 | //! - Connection timeouts (could be network congestion) | ||
| 16 | //! - Connection refused (could be temporary maintenance) | ||
| 17 | //! | ||
| 18 | //! ## Automatic Expiration | ||
| 19 | //! | ||
| 20 | //! Entries expire 12 hours (configurable) after the first recorded failure to allow relays | ||
| 21 | //! to recover from infrastructure issues. After expiration, the relay is automatically retried. | ||
| 22 | |||
| 23 | use dashmap::DashMap; | ||
| 24 | use std::time::Instant; | ||
| 25 | |||
/// Kinds of persistent relay failure that put a relay on the naughty list.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum NaughtyCategory {
    /// The relay's domain does not resolve (DNS lookup failure).
    DnsLookupFailed,
    /// The relay presented a bad TLS certificate (expired, invalid, mismatch).
    TlsCertificateInvalid,
    /// The relay violated the WebSocket or Nostr protocol.
    ProtocolError,
}

impl NaughtyCategory {
    /// Snake-case name of the category, suitable for use as a metrics label value.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::DnsLookupFailed => "dns_lookup_failed",
            Self::TlsCertificateInvalid => "tls_certificate_invalid",
            Self::ProtocolError => "protocol_error",
        }
    }
}

impl std::fmt::Display for NaughtyCategory {
    /// Writes the same text as [`NaughtyCategory::as_str`].
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
| 53 | |||
| 54 | /// Naughty list entry for a relay with persistent issues | ||
| 55 | #[derive(Debug, Clone)] | ||
| 56 | pub struct NaughtyEntry { | ||
| 57 | /// Category of the persistent failure | ||
| 58 | pub category: NaughtyCategory, | ||
| 59 | /// Full error message | ||
| 60 | pub reason: String, | ||
| 61 | /// When this relay was first added to the naughty list | ||
| 62 | pub first_seen: Instant, | ||
| 63 | /// Most recent occurrence of the issue | ||
| 64 | pub last_seen: Instant, | ||
| 65 | /// Number of times we've seen this issue | ||
| 66 | pub occurrence_count: u32, | ||
| 67 | } | ||
| 68 | |||
| 69 | /// Tracks relays with persistent infrastructure/configuration issues | ||
| 70 | /// | ||
| 71 | /// Separate from HealthTracker's backoff logic - this is specifically for | ||
| 72 | /// relays with configuration problems that are unlikely to be fixed quickly. | ||
| 73 | #[derive(Debug)] | ||
| 74 | pub struct NaughtyListTracker { | ||
| 75 | /// Map of relay URL to naughty entry | ||
| 76 | entries: DashMap<String, NaughtyEntry>, | ||
| 77 | /// How many hours before removing a relay from the naughty list | ||
| 78 | expiration_hours: u64, | ||
| 79 | } | ||
| 80 | |||
| 81 | impl NaughtyListTracker { | ||
| 82 | /// Create a new NaughtyListTracker with the specified expiration time | ||
| 83 | /// | ||
| 84 | /// # Arguments | ||
| 85 | /// | ||
| 86 | /// * `expiration_hours` - Hours before a naughty entry expires (default: 12) | ||
| 87 | pub fn new(expiration_hours: u64) -> Self { | ||
| 88 | Self { | ||
| 89 | entries: DashMap::new(), | ||
| 90 | expiration_hours, | ||
| 91 | } | ||
| 92 | } | ||
| 93 | |||
| 94 | /// Create a new NaughtyListTracker with default 12-hour expiration | ||
| 95 | pub fn with_defaults() -> Self { | ||
| 96 | Self::new(12) | ||
| 97 | } | ||
| 98 | |||
| 99 | /// Classify an error string into a naughty category or return None for transient errors | ||
| 100 | /// | ||
| 101 | /// # Arguments | ||
| 102 | /// | ||
| 103 | /// * `error` - The error message string to classify | ||
| 104 | /// | ||
| 105 | /// # Returns | ||
| 106 | /// | ||
| 107 | /// - `Some(NaughtyCategory)` if the error indicates a persistent infrastructure issue | ||
| 108 | /// - `None` if the error is a transient network issue (use HealthTracker backoff) | ||
| 109 | pub fn classify_error(error: &str) -> Option<NaughtyCategory> { | ||
| 110 | let error_lower = error.to_lowercase(); | ||
| 111 | |||
| 112 | // DNS lookup failures | ||
| 113 | if error_lower.contains("failed to lookup address") | ||
| 114 | || error_lower.contains("name or service not known") | ||
| 115 | || error_lower.contains("nodename nor servname provided") | ||
| 116 | || (error_lower.contains("dns") && !error_lower.contains("timeout")) | ||
| 117 | { | ||
| 118 | return Some(NaughtyCategory::DnsLookupFailed); | ||
| 119 | } | ||
| 120 | |||
| 121 | // TLS certificate errors | ||
| 122 | if error_lower.contains("certificate") | ||
| 123 | || error_lower.contains("ssl") | ||
| 124 | || error_lower.contains("tls") | ||
| 125 | { | ||
| 126 | // Exclude timeout errors that mention TLS | ||
| 127 | if !error_lower.contains("timeout") && !error_lower.contains("timed out") { | ||
| 128 | return Some(NaughtyCategory::TlsCertificateInvalid); | ||
| 129 | } | ||
| 130 | } | ||
| 131 | |||
| 132 | // Protocol errors | ||
| 133 | if error_lower.contains("websocket") | ||
| 134 | || error_lower.contains("protocol") | ||
| 135 | || error_lower.contains("invalid frame") | ||
| 136 | { | ||
| 137 | // Exclude connection errors | ||
| 138 | if !error_lower.contains("connection") | ||
| 139 | && !error_lower.contains("timeout") | ||
| 140 | && !error_lower.contains("refused") | ||
| 141 | { | ||
| 142 | return Some(NaughtyCategory::ProtocolError); | ||
| 143 | } | ||
| 144 | } | ||
| 145 | |||
| 146 | // Everything else is transient (timeouts, refused, etc.) | ||
| 147 | None | ||
| 148 | } | ||
| 149 | |||
| 150 | /// Record a naughty relay (adds new entry or updates existing) | ||
| 151 | /// | ||
| 152 | /// # Arguments | ||
| 153 | /// | ||
| 154 | /// * `relay_url` - The relay URL | ||
| 155 | /// * `category` - The naughty category | ||
| 156 | /// * `reason` - The full error message | ||
| 157 | /// | ||
| 158 | /// # Returns | ||
| 159 | /// | ||
| 160 | /// `true` if this is a new naughty entry (first occurrence), `false` if updating existing | ||
| 161 | pub fn record(&self, relay_url: &str, category: NaughtyCategory, reason: String) -> bool { | ||
| 162 | let now = Instant::now(); | ||
| 163 | |||
| 164 | if let Some(mut entry) = self.entries.get_mut(relay_url) { | ||
| 165 | // Update existing entry | ||
| 166 | entry.last_seen = now; | ||
| 167 | entry.occurrence_count = entry.occurrence_count.saturating_add(1); | ||
| 168 | entry.reason = reason; // Update with latest error message | ||
| 169 | false | ||
| 170 | } else { | ||
| 171 | // Create new entry | ||
| 172 | self.entries.insert( | ||
| 173 | relay_url.to_string(), | ||
| 174 | NaughtyEntry { | ||
| 175 | category, | ||
| 176 | reason, | ||
| 177 | first_seen: now, | ||
| 178 | last_seen: now, | ||
| 179 | occurrence_count: 1, | ||
| 180 | }, | ||
| 181 | ); | ||
| 182 | true | ||
| 183 | } | ||
| 184 | } | ||
| 185 | |||
| 186 | /// Check if a relay is on the naughty list (not expired) | ||
| 187 | /// | ||
| 188 | /// # Arguments | ||
| 189 | /// | ||
| 190 | /// * `relay_url` - The relay URL to check | ||
| 191 | /// | ||
| 192 | /// # Returns | ||
| 193 | /// | ||
| 194 | /// `true` if the relay is currently on the naughty list | ||
| 195 | pub fn is_naughty(&self, relay_url: &str) -> bool { | ||
| 196 | if let Some(entry) = self.entries.get(relay_url) { | ||
| 197 | let age = Instant::now().duration_since(entry.first_seen); | ||
| 198 | let expiration = std::time::Duration::from_secs(self.expiration_hours * 3600); | ||
| 199 | age < expiration | ||
| 200 | } else { | ||
| 201 | false | ||
| 202 | } | ||
| 203 | } | ||
| 204 | |||
| 205 | /// Get a naughty entry if it exists and hasn't expired | ||
| 206 | /// | ||
| 207 | /// # Arguments | ||
| 208 | /// | ||
| 209 | /// * `relay_url` - The relay URL to look up | ||
| 210 | /// | ||
| 211 | /// # Returns | ||
| 212 | /// | ||
| 213 | /// A cloned `NaughtyEntry` if the relay is on the naughty list and not expired | ||
| 214 | pub fn get_entry(&self, relay_url: &str) -> Option<NaughtyEntry> { | ||
| 215 | self.entries.get(relay_url).map(|e| e.clone()) | ||
| 216 | } | ||
| 217 | |||
| 218 | /// Remove expired entries from the naughty list | ||
| 219 | /// | ||
| 220 | /// Entries older than `expiration_hours` are removed to allow relays | ||
| 221 | /// to be retried after infrastructure issues are potentially fixed. | ||
| 222 | /// | ||
| 223 | /// # Returns | ||
| 224 | /// | ||
| 225 | /// Vector of relay URLs that were removed from the naughty list | ||
| 226 | pub fn expire_old_entries(&self) -> Vec<String> { | ||
| 227 | let now = Instant::now(); | ||
| 228 | let expiration = std::time::Duration::from_secs(self.expiration_hours * 3600); | ||
| 229 | let mut expired = Vec::new(); | ||
| 230 | |||
| 231 | // Collect expired relay URLs | ||
| 232 | self.entries.retain(|url, entry| { | ||
| 233 | let age = now.duration_since(entry.first_seen); | ||
| 234 | if age >= expiration { | ||
| 235 | expired.push(url.clone()); | ||
| 236 | false // Remove this entry | ||
| 237 | } else { | ||
| 238 | true // Keep this entry | ||
| 239 | } | ||
| 240 | }); | ||
| 241 | |||
| 242 | expired | ||
| 243 | } | ||
| 244 | |||
| 245 | /// Get all naughty relays (for metrics and monitoring) | ||
| 246 | /// | ||
| 247 | /// # Returns | ||
| 248 | /// | ||
| 249 | /// Vector of (relay_url, entry) tuples for all relays currently on the naughty list | ||
| 250 | pub fn get_all(&self) -> Vec<(String, NaughtyEntry)> { | ||
| 251 | self.entries | ||
| 252 | .iter() | ||
| 253 | .map(|entry| (entry.key().clone(), entry.value().clone())) | ||
| 254 | .collect() | ||
| 255 | } | ||
| 256 | |||
| 257 | /// Get the count of relays in a specific category | ||
| 258 | /// | ||
| 259 | /// # Arguments | ||
| 260 | /// | ||
| 261 | /// * `category` - The category to count | ||
| 262 | /// | ||
| 263 | /// # Returns | ||
| 264 | /// | ||
| 265 | /// Number of relays in the specified category | ||
| 266 | pub fn count_by_category(&self, category: NaughtyCategory) -> usize { | ||
| 267 | self.entries | ||
| 268 | .iter() | ||
| 269 | .filter(|entry| entry.value().category == category) | ||
| 270 | .count() | ||
| 271 | } | ||
| 272 | |||
| 273 | /// Get total number of relays on the naughty list | ||
| 274 | pub fn total_count(&self) -> usize { | ||
| 275 | self.entries.len() | ||
| 276 | } | ||
| 277 | } | ||
| 278 | |||
#[cfg(test)]
mod tests {
    use super::*;

    /// Relay URL shared by the single-relay tests.
    const BAD_RELAY: &str = "wss://bad-relay.example.com";

    /// Assert that every message in `messages` is classified as `expected`.
    fn assert_classified(messages: &[&str], expected: Option<NaughtyCategory>) {
        for message in messages {
            assert_eq!(
                NaughtyListTracker::classify_error(message),
                expected,
                "message: {}",
                message
            );
        }
    }

    #[test]
    fn test_classify_dns_errors() {
        assert_classified(
            &[
                "failed to lookup address information",
                "Name or service not known",
                "nodename nor servname provided",
                "dns error: NXDOMAIN",
            ],
            Some(NaughtyCategory::DnsLookupFailed),
        );
    }

    #[test]
    fn test_classify_tls_errors() {
        assert_classified(
            &[
                "certificate not valid for 'example.com'",
                "SSL certificate problem",
                "TLS handshake failed",
            ],
            Some(NaughtyCategory::TlsCertificateInvalid),
        );

        // TLS timeout should NOT be classified as naughty
        assert_classified(&["TLS connection timed out"], None);
    }

    #[test]
    fn test_classify_protocol_errors() {
        assert_classified(
            &["websocket protocol error", "invalid frame header"],
            Some(NaughtyCategory::ProtocolError),
        );

        // WebSocket connection errors should NOT be classified as naughty
        assert_classified(&["websocket connection refused"], None);
    }

    #[test]
    fn test_classify_transient_errors() {
        // Timeouts, refused connections and generic network errors are all transient
        assert_classified(
            &[
                "connection timed out",
                "operation timed out",
                "connection refused",
                "network unreachable",
            ],
            None,
        );
    }

    #[test]
    fn test_record_new_entry() {
        let tracker = NaughtyListTracker::with_defaults();

        let first_time = tracker.record(
            BAD_RELAY,
            NaughtyCategory::DnsLookupFailed,
            "failed to lookup address".to_string(),
        );
        assert!(first_time);
        assert!(tracker.is_naughty(BAD_RELAY));

        let stored = tracker.get_entry(BAD_RELAY).unwrap();
        assert_eq!(stored.category, NaughtyCategory::DnsLookupFailed);
        assert_eq!(stored.occurrence_count, 1);
    }

    #[test]
    fn test_record_updates_existing() {
        let tracker = NaughtyListTracker::with_defaults();

        // First occurrence creates the entry, the second one updates it
        assert!(tracker.record(
            BAD_RELAY,
            NaughtyCategory::DnsLookupFailed,
            "error 1".to_string()
        ));
        assert!(!tracker.record(
            BAD_RELAY,
            NaughtyCategory::DnsLookupFailed,
            "error 2".to_string()
        ));

        let stored = tracker.get_entry(BAD_RELAY).unwrap();
        assert_eq!(stored.occurrence_count, 2);
        assert_eq!(stored.reason, "error 2"); // Updated to latest
    }

    #[test]
    fn test_is_naughty() {
        let tracker = NaughtyListTracker::with_defaults();
        assert!(!tracker.is_naughty(BAD_RELAY));

        tracker.record(
            BAD_RELAY,
            NaughtyCategory::TlsCertificateInvalid,
            "cert error".to_string(),
        );
        assert!(tracker.is_naughty(BAD_RELAY));
    }

    #[test]
    fn test_get_all() {
        let tracker = NaughtyListTracker::with_defaults();
        let fixtures = [
            (
                "wss://relay1.example.com",
                NaughtyCategory::DnsLookupFailed,
                "dns error",
            ),
            (
                "wss://relay2.example.com",
                NaughtyCategory::TlsCertificateInvalid,
                "tls error",
            ),
        ];
        for (url, category, reason) in fixtures.iter() {
            tracker.record(url, *category, reason.to_string());
        }

        assert_eq!(tracker.get_all().len(), 2);
    }

    #[test]
    fn test_count_by_category() {
        let tracker = NaughtyListTracker::with_defaults();
        let fixtures = [
            ("wss://relay1.example.com", NaughtyCategory::DnsLookupFailed),
            ("wss://relay2.example.com", NaughtyCategory::DnsLookupFailed),
            (
                "wss://relay3.example.com",
                NaughtyCategory::TlsCertificateInvalid,
            ),
        ];
        for (url, category) in fixtures.iter() {
            tracker.record(url, *category, "error".to_string());
        }

        assert_eq!(
            tracker.count_by_category(NaughtyCategory::DnsLookupFailed),
            2
        );
        assert_eq!(
            tracker.count_by_category(NaughtyCategory::TlsCertificateInvalid),
            1
        );
        assert_eq!(tracker.count_by_category(NaughtyCategory::ProtocolError), 0);
    }

    #[test]
    fn test_total_count() {
        let tracker = NaughtyListTracker::with_defaults();
        assert_eq!(tracker.total_count(), 0);

        tracker.record(
            "wss://relay1.example.com",
            NaughtyCategory::DnsLookupFailed,
            "error".to_string(),
        );
        assert_eq!(tracker.total_count(), 1);

        tracker.record(
            "wss://relay2.example.com",
            NaughtyCategory::TlsCertificateInvalid,
            "error".to_string(),
        );
        assert_eq!(tracker.total_count(), 2);
    }

    #[test]
    fn test_expire_old_entries() {
        let relay = "wss://relay1.example.com";

        // A 0-hour window makes every entry expired as soon as it is recorded
        let tracker = NaughtyListTracker::new(0);
        tracker.record(relay, NaughtyCategory::DnsLookupFailed, "error".to_string());

        // The entry is stored in the map...
        assert_eq!(tracker.total_count(), 1);
        // ...but is already reported as not naughty because it has expired
        assert!(!tracker.is_naughty(relay));

        // Let a little time pass before purging
        std::thread::sleep(std::time::Duration::from_millis(10));

        // Purging removes the expired entry and reports its URL
        let purged = tracker.expire_old_entries();
        assert_eq!(purged.len(), 1);
        assert_eq!(purged[0], relay);

        // Nothing is left afterwards
        assert!(!tracker.is_naughty(relay));
        assert_eq!(tracker.total_count(), 0);
    }

    #[test]
    fn test_category_display() {
        let expected = [
            (NaughtyCategory::DnsLookupFailed, "dns_lookup_failed"),
            (
                NaughtyCategory::TlsCertificateInvalid,
                "tls_certificate_invalid",
            ),
            (NaughtyCategory::ProtocolError, "protocol_error"),
        ];
        for (category, text) in expected.iter() {
            assert_eq!(category.to_string(), *text);
        }
    }

    #[test]
    fn test_category_as_str() {
        let expected = [
            (NaughtyCategory::DnsLookupFailed, "dns_lookup_failed"),
            (
                NaughtyCategory::TlsCertificateInvalid,
                "tls_certificate_invalid",
            ),
            (NaughtyCategory::ProtocolError, "protocol_error"),
        ];
        for (category, text) in expected.iter() {
            assert_eq!(category.as_str(), *text);
        }
    }
}