upleb.uk

Public git repos — served from a NIP-34 GRASP relay at git.upleb.uk

summary | refs | log | tree | commit | diff
path: root/src/sync/naughty_list.rs
diff options
context:
space:
mode:
author: DanConwayDev <DanConwayDev@protonmail.com> 2026-01-10 01:24:52 +0000
committer: DanConwayDev <DanConwayDev@protonmail.com> 2026-01-10 01:24:52 +0000
commit: c9b3b3bd8a04de139bcb0d0b83bf819c367ee8c8 (patch)
tree: 80cc2c1ec92d71637408a7588d8cb908f03fc4b6 /src/sync/naughty_list.rs
parent: 9369a2885f5a3f9e38c0a3f9fa3af6260513c8e4 (diff)
Implement relay naughty list feature
Add naughty list tracking for relays with persistent infrastructure issues (DNS failures, TLS certificate errors, protocol violations) to reduce log noise and provide better visibility via metrics.

Key features:
- Classify errors into naughty (persistent) vs transient (temporary)
- Track naughty relays with category, reason, and occurrence count
- Log WARN on first naughty occurrence, DEBUG on repeats
- Automatic expiration after 12 hours (configurable)
- Prometheus metrics for monitoring naughty relays by category
- Periodic cleanup task integrated with health checker

Components added:
- src/sync/naughty_list.rs: Core naughty list tracker with error classification
- NaughtyListTracker integration in RelayHealthTracker
- Connection error handling updates in sync manager
- Naughty list metrics (total by category, detailed info per relay)
- Config option for naughty_list_expiration_hours (default: 12)

Closes the tracking issues for DNS lookup failures and TLS certificate errors.
Diffstat (limited to 'src/sync/naughty_list.rs')
-rw-r--r--src/sync/naughty_list.rs546
1 files changed, 546 insertions, 0 deletions
diff --git a/src/sync/naughty_list.rs b/src/sync/naughty_list.rs
new file mode 100644
index 0000000..311b9bb
--- /dev/null
+++ b/src/sync/naughty_list.rs
@@ -0,0 +1,546 @@
1//! Naughty List Tracker for Relays with Persistent Infrastructure Issues
2//!
3//! This module tracks relays with persistent configuration/infrastructure problems
4//! (DNS failures, TLS certificate errors, protocol violations) separately from
5//! transient network issues (timeouts, connection refused).
6//!
7//! ## Failure Classification
8//!
9//! **Naughty List (12-hour expiration, log WARN on first occurrence, DEBUG on repeat):**
10//! - `DnsLookupFailed`: Domain doesn't resolve or DNS errors
11//! - `TlsCertificateInvalid`: Certificate errors (expired, mismatch, self-signed)
12//! - `ProtocolError`: WebSocket/Nostr protocol violations
13//!
14//! **NOT Naughty (use existing HealthTracker backoff):**
15//! - Connection timeouts (could be network congestion)
16//! - Connection refused (could be temporary maintenance)
17//!
18//! ## Automatic Expiration
19//!
20//! Entries expire after 12 hours (configurable) to allow relays to recover from
21//! infrastructure issues. After expiration, the relay is automatically retried.
22
23use dashmap::DashMap;
24use std::time::Instant;
25
/// Category of persistent relay failure that qualifies for the naughty list.
///
/// Each variant maps to a stable snake_case label (see [`NaughtyCategory::as_str`])
/// that is also what the `Display` implementation prints.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NaughtyCategory {
    /// DNS lookup failures (domain doesn't resolve)
    DnsLookupFailed,
    /// TLS certificate errors (expired, invalid, mismatch)
    TlsCertificateInvalid,
    /// WebSocket or Nostr protocol violations
    ProtocolError,
}

impl NaughtyCategory {
    /// Get the string representation used for metrics labels.
    ///
    /// The match is exhaustive on purpose: adding a variant without giving it
    /// a label is a compile error rather than a silently missing metric.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match self {
            NaughtyCategory::DnsLookupFailed => "dns_lookup_failed",
            NaughtyCategory::TlsCertificateInvalid => "tls_certificate_invalid",
            NaughtyCategory::ProtocolError => "protocol_error",
        }
    }
}

impl std::fmt::Display for NaughtyCategory {
    /// Writes the same label as [`NaughtyCategory::as_str`].
    ///
    /// Uses `Formatter::pad` so that width, fill, alignment and precision
    /// requested by the caller (e.g. `{:<24}` in tabular log output) are
    /// honoured; with a plain `{}` the output is exactly the label.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.pad(self.as_str())
    }
}
53
54/// Naughty list entry for a relay with persistent issues
55#[derive(Debug, Clone)]
56pub struct NaughtyEntry {
57 /// Category of the persistent failure
58 pub category: NaughtyCategory,
59 /// Full error message
60 pub reason: String,
61 /// When this relay was first added to the naughty list
62 pub first_seen: Instant,
63 /// Most recent occurrence of the issue
64 pub last_seen: Instant,
65 /// Number of times we've seen this issue
66 pub occurrence_count: u32,
67}
68
69/// Tracks relays with persistent infrastructure/configuration issues
70///
71/// Separate from HealthTracker's backoff logic - this is specifically for
72/// relays with configuration problems that are unlikely to be fixed quickly.
73#[derive(Debug)]
74pub struct NaughtyListTracker {
75 /// Map of relay URL to naughty entry
76 entries: DashMap<String, NaughtyEntry>,
77 /// How many hours before removing a relay from the naughty list
78 expiration_hours: u64,
79}
80
81impl NaughtyListTracker {
82 /// Create a new NaughtyListTracker with the specified expiration time
83 ///
84 /// # Arguments
85 ///
86 /// * `expiration_hours` - Hours before a naughty entry expires (default: 12)
87 pub fn new(expiration_hours: u64) -> Self {
88 Self {
89 entries: DashMap::new(),
90 expiration_hours,
91 }
92 }
93
94 /// Create a new NaughtyListTracker with default 12-hour expiration
95 pub fn with_defaults() -> Self {
96 Self::new(12)
97 }
98
99 /// Classify an error string into a naughty category or return None for transient errors
100 ///
101 /// # Arguments
102 ///
103 /// * `error` - The error message string to classify
104 ///
105 /// # Returns
106 ///
107 /// - `Some(NaughtyCategory)` if the error indicates a persistent infrastructure issue
108 /// - `None` if the error is a transient network issue (use HealthTracker backoff)
109 pub fn classify_error(error: &str) -> Option<NaughtyCategory> {
110 let error_lower = error.to_lowercase();
111
112 // DNS lookup failures
113 if error_lower.contains("failed to lookup address")
114 || error_lower.contains("name or service not known")
115 || error_lower.contains("nodename nor servname provided")
116 || (error_lower.contains("dns") && !error_lower.contains("timeout"))
117 {
118 return Some(NaughtyCategory::DnsLookupFailed);
119 }
120
121 // TLS certificate errors
122 if error_lower.contains("certificate")
123 || error_lower.contains("ssl")
124 || error_lower.contains("tls")
125 {
126 // Exclude timeout errors that mention TLS
127 if !error_lower.contains("timeout") && !error_lower.contains("timed out") {
128 return Some(NaughtyCategory::TlsCertificateInvalid);
129 }
130 }
131
132 // Protocol errors
133 if error_lower.contains("websocket")
134 || error_lower.contains("protocol")
135 || error_lower.contains("invalid frame")
136 {
137 // Exclude connection errors
138 if !error_lower.contains("connection")
139 && !error_lower.contains("timeout")
140 && !error_lower.contains("refused")
141 {
142 return Some(NaughtyCategory::ProtocolError);
143 }
144 }
145
146 // Everything else is transient (timeouts, refused, etc.)
147 None
148 }
149
150 /// Record a naughty relay (adds new entry or updates existing)
151 ///
152 /// # Arguments
153 ///
154 /// * `relay_url` - The relay URL
155 /// * `category` - The naughty category
156 /// * `reason` - The full error message
157 ///
158 /// # Returns
159 ///
160 /// `true` if this is a new naughty entry (first occurrence), `false` if updating existing
161 pub fn record(&self, relay_url: &str, category: NaughtyCategory, reason: String) -> bool {
162 let now = Instant::now();
163
164 if let Some(mut entry) = self.entries.get_mut(relay_url) {
165 // Update existing entry
166 entry.last_seen = now;
167 entry.occurrence_count = entry.occurrence_count.saturating_add(1);
168 entry.reason = reason; // Update with latest error message
169 false
170 } else {
171 // Create new entry
172 self.entries.insert(
173 relay_url.to_string(),
174 NaughtyEntry {
175 category,
176 reason,
177 first_seen: now,
178 last_seen: now,
179 occurrence_count: 1,
180 },
181 );
182 true
183 }
184 }
185
186 /// Check if a relay is on the naughty list (not expired)
187 ///
188 /// # Arguments
189 ///
190 /// * `relay_url` - The relay URL to check
191 ///
192 /// # Returns
193 ///
194 /// `true` if the relay is currently on the naughty list
195 pub fn is_naughty(&self, relay_url: &str) -> bool {
196 if let Some(entry) = self.entries.get(relay_url) {
197 let age = Instant::now().duration_since(entry.first_seen);
198 let expiration = std::time::Duration::from_secs(self.expiration_hours * 3600);
199 age < expiration
200 } else {
201 false
202 }
203 }
204
205 /// Get a naughty entry if it exists and hasn't expired
206 ///
207 /// # Arguments
208 ///
209 /// * `relay_url` - The relay URL to look up
210 ///
211 /// # Returns
212 ///
213 /// A cloned `NaughtyEntry` if the relay is on the naughty list and not expired
214 pub fn get_entry(&self, relay_url: &str) -> Option<NaughtyEntry> {
215 self.entries.get(relay_url).map(|e| e.clone())
216 }
217
218 /// Remove expired entries from the naughty list
219 ///
220 /// Entries older than `expiration_hours` are removed to allow relays
221 /// to be retried after infrastructure issues are potentially fixed.
222 ///
223 /// # Returns
224 ///
225 /// Vector of relay URLs that were removed from the naughty list
226 pub fn expire_old_entries(&self) -> Vec<String> {
227 let now = Instant::now();
228 let expiration = std::time::Duration::from_secs(self.expiration_hours * 3600);
229 let mut expired = Vec::new();
230
231 // Collect expired relay URLs
232 self.entries.retain(|url, entry| {
233 let age = now.duration_since(entry.first_seen);
234 if age >= expiration {
235 expired.push(url.clone());
236 false // Remove this entry
237 } else {
238 true // Keep this entry
239 }
240 });
241
242 expired
243 }
244
245 /// Get all naughty relays (for metrics and monitoring)
246 ///
247 /// # Returns
248 ///
249 /// Vector of (relay_url, entry) tuples for all relays currently on the naughty list
250 pub fn get_all(&self) -> Vec<(String, NaughtyEntry)> {
251 self.entries
252 .iter()
253 .map(|entry| (entry.key().clone(), entry.value().clone()))
254 .collect()
255 }
256
257 /// Get the count of relays in a specific category
258 ///
259 /// # Arguments
260 ///
261 /// * `category` - The category to count
262 ///
263 /// # Returns
264 ///
265 /// Number of relays in the specified category
266 pub fn count_by_category(&self, category: NaughtyCategory) -> usize {
267 self.entries
268 .iter()
269 .filter(|entry| entry.value().category == category)
270 .count()
271 }
272
273 /// Get total number of relays on the naughty list
274 pub fn total_count(&self) -> usize {
275 self.entries.len()
276 }
277}
278
#[cfg(test)]
mod tests {
    //! Unit tests for error classification and naughty-list bookkeeping.

    use super::*;

    const RELAY: &str = "wss://bad-relay.example.com";

    /// Asserts that every message in `messages` classifies as `expected`.
    fn assert_classified(messages: &[&str], expected: Option<NaughtyCategory>) {
        for message in messages {
            assert_eq!(
                NaughtyListTracker::classify_error(message),
                expected,
                "unexpected classification for {:?}",
                message
            );
        }
    }

    #[test]
    fn test_classify_dns_errors() {
        assert_classified(
            &[
                "failed to lookup address information",
                "Name or service not known",
                "nodename nor servname provided",
                "dns error: NXDOMAIN",
            ],
            Some(NaughtyCategory::DnsLookupFailed),
        );
    }

    #[test]
    fn test_classify_tls_errors() {
        assert_classified(
            &[
                "certificate not valid for 'example.com'",
                "SSL certificate problem",
                "TLS handshake failed",
            ],
            Some(NaughtyCategory::TlsCertificateInvalid),
        );

        // TLS timeout should NOT be classified as naughty
        assert_classified(&["TLS connection timed out"], None);
    }

    #[test]
    fn test_classify_protocol_errors() {
        assert_classified(
            &["websocket protocol error", "invalid frame header"],
            Some(NaughtyCategory::ProtocolError),
        );

        // WebSocket connection errors should NOT be classified as naughty
        assert_classified(&["websocket connection refused"], None);
    }

    #[test]
    fn test_classify_transient_errors() {
        // Timeouts, refused connections and generic network errors are transient
        assert_classified(
            &[
                "connection timed out",
                "operation timed out",
                "connection refused",
                "network unreachable",
            ],
            None,
        );
    }

    #[test]
    fn test_record_new_entry() {
        let tracker = NaughtyListTracker::with_defaults();

        let is_new = tracker.record(
            RELAY,
            NaughtyCategory::DnsLookupFailed,
            "failed to lookup address".to_string(),
        );

        assert!(is_new);
        assert!(tracker.is_naughty(RELAY));

        let entry = tracker.get_entry(RELAY).expect("entry was just recorded");
        assert_eq!(entry.category, NaughtyCategory::DnsLookupFailed);
        assert_eq!(entry.reason, "failed to lookup address");
        assert_eq!(entry.occurrence_count, 1);
    }

    #[test]
    fn test_record_updates_existing() {
        let tracker = NaughtyListTracker::with_defaults();

        // First occurrence creates the entry, second one updates it
        assert!(tracker.record(RELAY, NaughtyCategory::DnsLookupFailed, "error 1".to_string()));
        assert!(!tracker.record(RELAY, NaughtyCategory::DnsLookupFailed, "error 2".to_string()));

        let entry = tracker.get_entry(RELAY).expect("entry was just recorded");
        assert_eq!(entry.occurrence_count, 2);
        assert_eq!(entry.reason, "error 2"); // Updated to latest
        assert!(entry.last_seen >= entry.first_seen);
        assert_eq!(tracker.total_count(), 1);
    }

    #[test]
    fn test_is_naughty() {
        let tracker = NaughtyListTracker::with_defaults();

        assert!(!tracker.is_naughty(RELAY));

        tracker.record(
            RELAY,
            NaughtyCategory::TlsCertificateInvalid,
            "cert error".to_string(),
        );

        assert!(tracker.is_naughty(RELAY));
    }

    #[test]
    fn test_get_entry_unknown_relay() {
        let tracker = NaughtyListTracker::with_defaults();
        assert!(tracker.get_entry(RELAY).is_none());
    }

    #[test]
    fn test_get_all() {
        let tracker = NaughtyListTracker::with_defaults();

        tracker.record(
            "wss://relay1.example.com",
            NaughtyCategory::DnsLookupFailed,
            "dns error".to_string(),
        );
        tracker.record(
            "wss://relay2.example.com",
            NaughtyCategory::TlsCertificateInvalid,
            "tls error".to_string(),
        );

        // Map iteration order is unspecified, so sort before comparing contents
        let mut all = tracker.get_all();
        all.sort_by(|a, b| a.0.cmp(&b.0));

        assert_eq!(all.len(), 2);
        assert_eq!(all[0].0, "wss://relay1.example.com");
        assert_eq!(all[0].1.category, NaughtyCategory::DnsLookupFailed);
        assert_eq!(all[0].1.reason, "dns error");
        assert_eq!(all[1].0, "wss://relay2.example.com");
        assert_eq!(all[1].1.category, NaughtyCategory::TlsCertificateInvalid);
        assert_eq!(all[1].1.reason, "tls error");
    }

    #[test]
    fn test_count_by_category() {
        let tracker = NaughtyListTracker::with_defaults();

        let failures = [
            ("wss://relay1.example.com", NaughtyCategory::DnsLookupFailed),
            ("wss://relay2.example.com", NaughtyCategory::DnsLookupFailed),
            ("wss://relay3.example.com", NaughtyCategory::TlsCertificateInvalid),
        ];
        for (url, category) in failures {
            tracker.record(url, category, "error".to_string());
        }

        assert_eq!(
            tracker.count_by_category(NaughtyCategory::DnsLookupFailed),
            2
        );
        assert_eq!(
            tracker.count_by_category(NaughtyCategory::TlsCertificateInvalid),
            1
        );
        assert_eq!(tracker.count_by_category(NaughtyCategory::ProtocolError), 0);
    }

    #[test]
    fn test_total_count() {
        let tracker = NaughtyListTracker::with_defaults();
        assert_eq!(tracker.total_count(), 0);

        tracker.record(
            "wss://relay1.example.com",
            NaughtyCategory::DnsLookupFailed,
            "error".to_string(),
        );
        assert_eq!(tracker.total_count(), 1);

        tracker.record(
            "wss://relay2.example.com",
            NaughtyCategory::TlsCertificateInvalid,
            "error".to_string(),
        );
        assert_eq!(tracker.total_count(), 2);
    }

    #[test]
    fn test_expire_old_entries() {
        // A 0-hour window means the expiration threshold is a zero duration,
        // so every entry is expired as soon as it is recorded - no sleeping
        // is needed to make the test deterministic.
        let tracker = NaughtyListTracker::new(0);
        let url = "wss://relay1.example.com";

        tracker.record(url, NaughtyCategory::DnsLookupFailed, "error".to_string());

        // Entry is stored in the map, but already counts as expired
        assert_eq!(tracker.total_count(), 1);
        assert!(!tracker.is_naughty(url));

        // Cleanup removes it and reports which relay was dropped
        let expired = tracker.expire_old_entries();
        assert_eq!(expired, vec![url.to_string()]);

        // Entry should be gone
        assert!(!tracker.is_naughty(url));
        assert_eq!(tracker.total_count(), 0);

        // A second cleanup pass has nothing left to remove
        assert!(tracker.expire_old_entries().is_empty());
    }

    #[test]
    fn test_expire_keeps_fresh_entries() {
        let tracker = NaughtyListTracker::with_defaults();

        tracker.record(RELAY, NaughtyCategory::ProtocolError, "error".to_string());

        assert!(tracker.expire_old_entries().is_empty());
        assert!(tracker.is_naughty(RELAY));
        assert_eq!(tracker.total_count(), 1);
    }

    #[test]
    fn test_category_display() {
        assert_eq!(
            NaughtyCategory::DnsLookupFailed.to_string(),
            "dns_lookup_failed"
        );
        assert_eq!(
            NaughtyCategory::TlsCertificateInvalid.to_string(),
            "tls_certificate_invalid"
        );
        assert_eq!(NaughtyCategory::ProtocolError.to_string(), "protocol_error");
    }

    #[test]
    fn test_category_as_str() {
        assert_eq!(
            NaughtyCategory::DnsLookupFailed.as_str(),
            "dns_lookup_failed"
        );
        assert_eq!(
            NaughtyCategory::TlsCertificateInvalid.as_str(),
            "tls_certificate_invalid"
        );
        assert_eq!(NaughtyCategory::ProtocolError.as_str(), "protocol_error");
    }
}