diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-10 01:24:52 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-10 01:24:52 +0000 |
| commit | c9b3b3bd8a04de139bcb0d0b83bf819c367ee8c8 (patch) | |
| tree | 80cc2c1ec92d71637408a7588d8cb908f03fc4b6 /src/sync/mod.rs | |
| parent | 9369a2885f5a3f9e38c0a3f9fa3af6260513c8e4 (diff) | |
Implement relay naughty list feature
Add naughty list tracking for relays with persistent infrastructure issues
(DNS failures, TLS certificate errors, protocol violations) to reduce log
noise and provide better visibility via metrics.
Key features:
- Classify errors as naughty (persistent) or transient (temporary)
- Track naughty relays with category, reason, and occurrence count
- Log WARN on first naughty occurrence, DEBUG on repeats
- Automatic expiration after 12 hours (configurable)
- Prometheus metrics for monitoring naughty relays by category
- Periodic cleanup task integrated with health checker
Components added:
- src/sync/naughty_list.rs: Core naughty list tracker with error classification
- NaughtyListTracker integration in RelayHealthTracker
- Connection error handling updates in sync manager
- Naughty list metrics (total by category, detailed info per relay)
- Config option for naughty_list_expiration_hours (default: 12)
Closes the tracking issues for DNS lookup failures and TLS certificate errors.
Diffstat (limited to 'src/sync/mod.rs')
| -rw-r--r-- | src/sync/mod.rs | 53 |
1 files changed, 51 insertions, 2 deletions
diff --git a/src/sync/mod.rs b/src/sync/mod.rs index 412cd16..8b51fac 100644 --- a/src/sync/mod.rs +++ b/src/sync/mod.rs | |||
| @@ -16,6 +16,7 @@ pub mod algorithms; | |||
| 16 | pub mod filters; | 16 | pub mod filters; |
| 17 | pub mod health; | 17 | pub mod health; |
| 18 | pub mod metrics; | 18 | pub mod metrics; |
| 19 | pub mod naughty_list; | ||
| 19 | pub mod rejected_index; | 20 | pub mod rejected_index; |
| 20 | pub mod relay_connection; | 21 | pub mod relay_connection; |
| 21 | pub mod self_subscriber; | 22 | pub mod self_subscriber; |
| @@ -483,7 +484,18 @@ async fn run_health_and_metrics_checker( | |||
| 483 | // 2. Check for rate limit recovery | 484 | // 2. Check for rate limit recovery |
| 484 | manager.check_rate_limit_recovery().await; | 485 | manager.check_rate_limit_recovery().await; |
| 485 | 486 | ||
| 486 | // 3. Update metrics with current health states | 487 | // 3. Check for naughty list expiration |
| 488 | if let Some(naughty_list) = manager.health_tracker.naughty_list() { | ||
| 489 | let recovered = naughty_list.expire_old_entries(); | ||
| 490 | for url in recovered { | ||
| 491 | tracing::info!( | ||
| 492 | relay = %url, | ||
| 493 | "Relay removed from naughty list after expiration, will retry" | ||
| 494 | ); | ||
| 495 | } | ||
| 496 | } | ||
| 497 | |||
| 498 | // 4. Update metrics with current health states and naughty list | ||
| 487 | if let Some(ref metrics) = manager.metrics { | 499 | if let Some(ref metrics) = manager.metrics { |
| 488 | // Get all tracked relay URLs | 500 | // Get all tracked relay URLs |
| 489 | let relay_urls: Vec<String> = { | 501 | let relay_urls: Vec<String> = { |
| @@ -496,6 +508,12 @@ async fn run_health_and_metrics_checker( | |||
| 496 | let state = manager.health_tracker.get_state(&relay_url); | 508 | let state = manager.health_tracker.get_state(&relay_url); |
| 497 | metrics.record_health_state(&relay_url, state); | 509 | metrics.record_health_state(&relay_url, state); |
| 498 | } | 510 | } |
| 511 | |||
| 512 | // Update naughty list metrics | ||
| 513 | if let Some(naughty_list) = manager.health_tracker.naughty_list() { | ||
| 514 | let entries = naughty_list.get_all(); | ||
| 515 | metrics.update_naughty_list(entries); | ||
| 516 | } | ||
| 499 | } | 517 | } |
| 500 | } | 518 | } |
| 501 | _ = shutdown_rx.recv() => { | 519 | _ = shutdown_rx.recv() => { |
| @@ -2018,7 +2036,38 @@ impl SyncManager { | |||
| 2018 | } | 2036 | } |
| 2019 | } | 2037 | } |
| 2020 | Err(e) => { | 2038 | Err(e) => { |
| 2021 | tracing::error!(relay = %relay_url, error = %e, "Connection failed"); | 2039 | // Classify error to determine if it's a naughty relay or transient issue |
| 2040 | let error_str = e.to_string(); | ||
| 2041 | |||
| 2042 | if let Some(category) = naughty_list::NaughtyListTracker::classify_error(&error_str) | ||
| 2043 | { | ||
| 2044 | // Persistent infrastructure issue - use naughty list | ||
| 2045 | if let Some(ref naughty_list) = self.health_tracker.naughty_list() { | ||
| 2046 | let is_new = naughty_list.record(relay_url, category, error_str.clone()); | ||
| 2047 | |||
| 2048 | if is_new { | ||
| 2049 | tracing::warn!( | ||
| 2050 | relay = %relay_url, | ||
| 2051 | category = ?category, | ||
| 2052 | error = %e, | ||
| 2053 | "Relay has persistent configuration issue, added to naughty list" | ||
| 2054 | ); | ||
| 2055 | } else { | ||
| 2056 | tracing::debug!( | ||
| 2057 | relay = %relay_url, | ||
| 2058 | category = ?category, | ||
| 2059 | "Naughty relay failure (already tracked)" | ||
| 2060 | ); | ||
| 2061 | } | ||
| 2062 | } | ||
| 2063 | } else { | ||
| 2064 | // Transient network issue - use existing backoff flow | ||
| 2065 | tracing::debug!( | ||
| 2066 | relay = %relay_url, | ||
| 2067 | error = %e, | ||
| 2068 | "Connection failed (transient issue, backoff active)" | ||
| 2069 | ); | ||
| 2070 | } | ||
| 2022 | 2071 | ||
| 2023 | // 4. Update state back to Disconnected on failure | 2072 | // 4. Update state back to Disconnected on failure |
| 2024 | { | 2073 | { |