diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-10 01:24:52 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-10 01:24:52 +0000 |
| commit | c9b3b3bd8a04de139bcb0d0b83bf819c367ee8c8 (patch) | |
| tree | 80cc2c1ec92d71637408a7588d8cb908f03fc4b6 /src/sync/health.rs | |
| parent | 9369a2885f5a3f9e38c0a3f9fa3af6260513c8e4 (diff) | |
Implement relay naughty list feature
Add naughty list tracking for relays with persistent infrastructure issues
(DNS failures, TLS certificate errors, protocol violations) to reduce log
noise and provide better visibility via metrics.
Key features:
- Classify errors into naughty (persistent) vs transient (temporary)
- Track naughty relays with category, reason, and occurrence count
- Log WARN on first naughty occurrence, DEBUG on repeats
- Automatic expiration after 12 hours (configurable)
- Prometheus metrics for monitoring naughty relays by category
- Periodic cleanup task integrated with health checker
Components added:
- src/sync/naughty_list.rs: Core naughty list tracker with error classification
- NaughtyListTracker integration in RelayHealthTracker
- Connection error handling updates in sync manager
- Naughty list metrics (total by category, detailed info per relay)
- Config option for naughty_list_expiration_hours (default: 12)
Closes the tracking issues for DNS lookup failures and TLS certificate errors.
Diffstat (limited to 'src/sync/health.rs')
| -rw-r--r-- | src/sync/health.rs | 15 |
1 file changed, 15 insertions, 0 deletions
diff --git a/src/sync/health.rs b/src/sync/health.rs index 2948707..833918b 100644 --- a/src/sync/health.rs +++ b/src/sync/health.rs | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | //! - Exponential backoff with configurable max delay | 5 | //! - Exponential backoff with configurable max delay |
| 6 | //! - Dead relay detection after 24h of continuous failures | 6 | //! - Dead relay detection after 24h of continuous failures |
| 7 | //! - Rate limit detection and fixed cooldown period | 7 | //! - Rate limit detection and fixed cooldown period |
| 8 | //! - Naughty list for persistent infrastructure issues (DNS, TLS, protocol errors) | ||
| 8 | //! | 9 | //! |
| 9 | //! ## Health States | 10 | //! ## Health States |
| 10 | //! | 11 | //! |
| @@ -18,6 +19,7 @@ use std::time::{Duration, Instant}; | |||
| 18 | 19 | ||
| 19 | use dashmap::DashMap; | 20 | use dashmap::DashMap; |
| 20 | 21 | ||
| 22 | use super::naughty_list::NaughtyListTracker; | ||
| 21 | use crate::config::Config; | 23 | use crate::config::Config; |
| 22 | 24 | ||
| 23 | /// Duration threshold before a relay is considered dead (24 hours) | 25 | /// Duration threshold before a relay is considered dead (24 hours) |
| @@ -213,15 +215,21 @@ pub struct RelayHealthTracker { | |||
| 213 | health: DashMap<String, RelayHealth>, | 215 | health: DashMap<String, RelayHealth>, |
| 214 | max_backoff_secs: u64, | 216 | max_backoff_secs: u64, |
| 215 | base_backoff_secs: u64, | 217 | base_backoff_secs: u64, |
| 218 | naughty_list: Option<Arc<NaughtyListTracker>>, | ||
| 216 | } | 219 | } |
| 217 | 220 | ||
| 218 | impl RelayHealthTracker { | 221 | impl RelayHealthTracker { |
| 219 | /// Create a new RelayHealthTracker | 222 | /// Create a new RelayHealthTracker |
| 220 | pub fn new(config: &Config) -> Self { | 223 | pub fn new(config: &Config) -> Self { |
| 224 | let naughty_list = Some(Arc::new(NaughtyListTracker::new( | ||
| 225 | config.naughty_list_expiration_hours, | ||
| 226 | ))); | ||
| 227 | |||
| 221 | Self { | 228 | Self { |
| 222 | health: DashMap::new(), | 229 | health: DashMap::new(), |
| 223 | max_backoff_secs: config.sync_max_backoff_secs, | 230 | max_backoff_secs: config.sync_max_backoff_secs, |
| 224 | base_backoff_secs: config.sync_base_backoff_secs, | 231 | base_backoff_secs: config.sync_base_backoff_secs, |
| 232 | naughty_list, | ||
| 225 | } | 233 | } |
| 226 | } | 234 | } |
| 227 | 235 | ||
| @@ -231,6 +239,7 @@ impl RelayHealthTracker { | |||
| 231 | health: DashMap::new(), | 239 | health: DashMap::new(), |
| 232 | max_backoff_secs: DEFAULT_MAX_BACKOFF_SECS, | 240 | max_backoff_secs: DEFAULT_MAX_BACKOFF_SECS, |
| 233 | base_backoff_secs: DEFAULT_BASE_BACKOFF_SECS, | 241 | base_backoff_secs: DEFAULT_BASE_BACKOFF_SECS, |
| 242 | naughty_list: Some(Arc::new(NaughtyListTracker::with_defaults())), | ||
| 234 | } | 243 | } |
| 235 | } | 244 | } |
| 236 | 245 | ||
| @@ -240,6 +249,7 @@ impl RelayHealthTracker { | |||
| 240 | health: DashMap::new(), | 249 | health: DashMap::new(), |
| 241 | max_backoff_secs, | 250 | max_backoff_secs, |
| 242 | base_backoff_secs: DEFAULT_BASE_BACKOFF_SECS, | 251 | base_backoff_secs: DEFAULT_BASE_BACKOFF_SECS, |
| 252 | naughty_list: Some(Arc::new(NaughtyListTracker::with_defaults())), | ||
| 243 | } | 253 | } |
| 244 | } | 254 | } |
| 245 | 255 | ||
| @@ -549,6 +559,11 @@ impl RelayHealthTracker { | |||
| 549 | .get(relay_url) | 559 | .get(relay_url) |
| 550 | .map(|entry| entry.value().clone()) | 560 | .map(|entry| entry.value().clone()) |
| 551 | } | 561 | } |
| 562 | |||
| 563 | /// Get a reference to the naughty list tracker | ||
| 564 | pub fn naughty_list(&self) -> Option<Arc<NaughtyListTracker>> { | ||
| 565 | self.naughty_list.clone() | ||
| 566 | } | ||
| 552 | } | 567 | } |
| 553 | 568 | ||
| 554 | /// Create a shared RelayHealthTracker wrapped in Arc | 569 | /// Create a shared RelayHealthTracker wrapped in Arc |