upleb.uk

Public git repos — served from a NIP-34 GRASP relay at git.upleb.uk

summary | refs | log | tree | commit | diff
path: root/src/sync/naughty_list.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/sync/naughty_list.rs')
-rw-r--r-- src/sync/naughty_list.rs 546
1 files changed, 546 insertions, 0 deletions
diff --git a/src/sync/naughty_list.rs b/src/sync/naughty_list.rs
new file mode 100644
index 0000000..311b9bb
--- /dev/null
+++ b/src/sync/naughty_list.rs
@@ -0,0 +1,546 @@
1//! Naughty List Tracker for Relays with Persistent Infrastructure Issues
2//!
3//! This module tracks relays with persistent configuration/infrastructure problems
4//! (DNS failures, TLS certificate errors, protocol violations) separately from
5//! transient network issues (timeouts, connection refused).
6//!
7//! ## Failure Classification
8//!
9//! **Naughty List (12-hour expiration, log WARN on first occurrence, DEBUG on repeat):**
10//! - `DnsLookupFailed`: Domain doesn't resolve or DNS errors
11//! - `TlsCertificateInvalid`: Certificate errors (expired, mismatch, self-signed)
12//! - `ProtocolError`: WebSocket/Nostr protocol violations
13//!
14//! **NOT Naughty (use existing HealthTracker backoff):**
15//! - Connection timeouts (could be network congestion)
16//! - Connection refused (could be temporary maintenance)
17//!
18//! ## Automatic Expiration
19//!
20//! Entries expire after 12 hours (configurable) to allow relays to recover from
21//! infrastructure issues. After expiration, the relay is automatically retried.
22
23use dashmap::DashMap;
24use std::time::Instant;
25
/// Kinds of persistent relay failure that put a relay on the naughty list.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NaughtyCategory {
    /// The relay's domain could not be resolved (DNS lookup failure).
    DnsLookupFailed,
    /// The relay presented an unusable TLS certificate (expired, invalid, mismatch).
    TlsCertificateInvalid,
    /// The relay violated the WebSocket or Nostr protocol.
    ProtocolError,
}

impl NaughtyCategory {
    /// Returns the snake_case label for this category, suitable for metrics labels.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::DnsLookupFailed => "dns_lookup_failed",
            Self::TlsCertificateInvalid => "tls_certificate_invalid",
            Self::ProtocolError => "protocol_error",
        }
    }
}

impl std::fmt::Display for NaughtyCategory {
    /// Writes the same label that [`NaughtyCategory::as_str`] returns.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
53
54/// Naughty list entry for a relay with persistent issues
55#[derive(Debug, Clone)]
56pub struct NaughtyEntry {
57 /// Category of the persistent failure
58 pub category: NaughtyCategory,
59 /// Full error message
60 pub reason: String,
61 /// When this relay was first added to the naughty list
62 pub first_seen: Instant,
63 /// Most recent occurrence of the issue
64 pub last_seen: Instant,
65 /// Number of times we've seen this issue
66 pub occurrence_count: u32,
67}
68
69/// Tracks relays with persistent infrastructure/configuration issues
70///
71/// Separate from HealthTracker's backoff logic - this is specifically for
72/// relays with configuration problems that are unlikely to be fixed quickly.
73#[derive(Debug)]
74pub struct NaughtyListTracker {
75 /// Map of relay URL to naughty entry
76 entries: DashMap<String, NaughtyEntry>,
77 /// How many hours before removing a relay from the naughty list
78 expiration_hours: u64,
79}
80
81impl NaughtyListTracker {
82 /// Create a new NaughtyListTracker with the specified expiration time
83 ///
84 /// # Arguments
85 ///
86 /// * `expiration_hours` - Hours before a naughty entry expires (default: 12)
87 pub fn new(expiration_hours: u64) -> Self {
88 Self {
89 entries: DashMap::new(),
90 expiration_hours,
91 }
92 }
93
94 /// Create a new NaughtyListTracker with default 12-hour expiration
95 pub fn with_defaults() -> Self {
96 Self::new(12)
97 }
98
99 /// Classify an error string into a naughty category or return None for transient errors
100 ///
101 /// # Arguments
102 ///
103 /// * `error` - The error message string to classify
104 ///
105 /// # Returns
106 ///
107 /// - `Some(NaughtyCategory)` if the error indicates a persistent infrastructure issue
108 /// - `None` if the error is a transient network issue (use HealthTracker backoff)
109 pub fn classify_error(error: &str) -> Option<NaughtyCategory> {
110 let error_lower = error.to_lowercase();
111
112 // DNS lookup failures
113 if error_lower.contains("failed to lookup address")
114 || error_lower.contains("name or service not known")
115 || error_lower.contains("nodename nor servname provided")
116 || (error_lower.contains("dns") && !error_lower.contains("timeout"))
117 {
118 return Some(NaughtyCategory::DnsLookupFailed);
119 }
120
121 // TLS certificate errors
122 if error_lower.contains("certificate")
123 || error_lower.contains("ssl")
124 || error_lower.contains("tls")
125 {
126 // Exclude timeout errors that mention TLS
127 if !error_lower.contains("timeout") && !error_lower.contains("timed out") {
128 return Some(NaughtyCategory::TlsCertificateInvalid);
129 }
130 }
131
132 // Protocol errors
133 if error_lower.contains("websocket")
134 || error_lower.contains("protocol")
135 || error_lower.contains("invalid frame")
136 {
137 // Exclude connection errors
138 if !error_lower.contains("connection")
139 && !error_lower.contains("timeout")
140 && !error_lower.contains("refused")
141 {
142 return Some(NaughtyCategory::ProtocolError);
143 }
144 }
145
146 // Everything else is transient (timeouts, refused, etc.)
147 None
148 }
149
150 /// Record a naughty relay (adds new entry or updates existing)
151 ///
152 /// # Arguments
153 ///
154 /// * `relay_url` - The relay URL
155 /// * `category` - The naughty category
156 /// * `reason` - The full error message
157 ///
158 /// # Returns
159 ///
160 /// `true` if this is a new naughty entry (first occurrence), `false` if updating existing
161 pub fn record(&self, relay_url: &str, category: NaughtyCategory, reason: String) -> bool {
162 let now = Instant::now();
163
164 if let Some(mut entry) = self.entries.get_mut(relay_url) {
165 // Update existing entry
166 entry.last_seen = now;
167 entry.occurrence_count = entry.occurrence_count.saturating_add(1);
168 entry.reason = reason; // Update with latest error message
169 false
170 } else {
171 // Create new entry
172 self.entries.insert(
173 relay_url.to_string(),
174 NaughtyEntry {
175 category,
176 reason,
177 first_seen: now,
178 last_seen: now,
179 occurrence_count: 1,
180 },
181 );
182 true
183 }
184 }
185
186 /// Check if a relay is on the naughty list (not expired)
187 ///
188 /// # Arguments
189 ///
190 /// * `relay_url` - The relay URL to check
191 ///
192 /// # Returns
193 ///
194 /// `true` if the relay is currently on the naughty list
195 pub fn is_naughty(&self, relay_url: &str) -> bool {
196 if let Some(entry) = self.entries.get(relay_url) {
197 let age = Instant::now().duration_since(entry.first_seen);
198 let expiration = std::time::Duration::from_secs(self.expiration_hours * 3600);
199 age < expiration
200 } else {
201 false
202 }
203 }
204
205 /// Get a naughty entry if it exists and hasn't expired
206 ///
207 /// # Arguments
208 ///
209 /// * `relay_url` - The relay URL to look up
210 ///
211 /// # Returns
212 ///
213 /// A cloned `NaughtyEntry` if the relay is on the naughty list and not expired
214 pub fn get_entry(&self, relay_url: &str) -> Option<NaughtyEntry> {
215 self.entries.get(relay_url).map(|e| e.clone())
216 }
217
218 /// Remove expired entries from the naughty list
219 ///
220 /// Entries older than `expiration_hours` are removed to allow relays
221 /// to be retried after infrastructure issues are potentially fixed.
222 ///
223 /// # Returns
224 ///
225 /// Vector of relay URLs that were removed from the naughty list
226 pub fn expire_old_entries(&self) -> Vec<String> {
227 let now = Instant::now();
228 let expiration = std::time::Duration::from_secs(self.expiration_hours * 3600);
229 let mut expired = Vec::new();
230
231 // Collect expired relay URLs
232 self.entries.retain(|url, entry| {
233 let age = now.duration_since(entry.first_seen);
234 if age >= expiration {
235 expired.push(url.clone());
236 false // Remove this entry
237 } else {
238 true // Keep this entry
239 }
240 });
241
242 expired
243 }
244
245 /// Get all naughty relays (for metrics and monitoring)
246 ///
247 /// # Returns
248 ///
249 /// Vector of (relay_url, entry) tuples for all relays currently on the naughty list
250 pub fn get_all(&self) -> Vec<(String, NaughtyEntry)> {
251 self.entries
252 .iter()
253 .map(|entry| (entry.key().clone(), entry.value().clone()))
254 .collect()
255 }
256
257 /// Get the count of relays in a specific category
258 ///
259 /// # Arguments
260 ///
261 /// * `category` - The category to count
262 ///
263 /// # Returns
264 ///
265 /// Number of relays in the specified category
266 pub fn count_by_category(&self, category: NaughtyCategory) -> usize {
267 self.entries
268 .iter()
269 .filter(|entry| entry.value().category == category)
270 .count()
271 }
272
273 /// Get total number of relays on the naughty list
274 pub fn total_count(&self) -> usize {
275 self.entries.len()
276 }
277}
278
#[cfg(test)]
mod tests {
    use super::*;

    const RELAY_1: &str = "wss://relay1.example.com";
    const RELAY_2: &str = "wss://relay2.example.com";
    const RELAY_3: &str = "wss://relay3.example.com";
    const BAD_RELAY: &str = "wss://bad-relay.example.com";

    /// Checks that each message in `cases` classifies to its paired expectation.
    fn assert_classified(cases: &[(&str, Option<NaughtyCategory>)]) {
        for (message, expected) in cases {
            assert_eq!(
                NaughtyListTracker::classify_error(message),
                *expected,
                "input: {:?}",
                message
            );
        }
    }

    /// Records `url` under `category` with the generic reason "error".
    fn record_generic(tracker: &NaughtyListTracker, url: &str, category: NaughtyCategory) -> bool {
        tracker.record(url, category, String::from("error"))
    }

    #[test]
    fn test_classify_dns_errors() {
        let dns = Some(NaughtyCategory::DnsLookupFailed);
        assert_classified(&[
            ("failed to lookup address information", dns),
            ("Name or service not known", dns),
            ("nodename nor servname provided", dns),
            ("dns error: NXDOMAIN", dns),
        ]);
    }

    #[test]
    fn test_classify_tls_errors() {
        let tls = Some(NaughtyCategory::TlsCertificateInvalid);
        assert_classified(&[
            ("certificate not valid for 'example.com'", tls),
            ("SSL certificate problem", tls),
            ("TLS handshake failed", tls),
            // A timeout that merely mentions TLS must stay off the naughty list
            ("TLS connection timed out", None),
        ]);
    }

    #[test]
    fn test_classify_protocol_errors() {
        let protocol = Some(NaughtyCategory::ProtocolError);
        assert_classified(&[
            ("websocket protocol error", protocol),
            ("invalid frame header", protocol),
            // Connection-level WebSocket failures must stay off the naughty list
            ("websocket connection refused", None),
        ]);
    }

    #[test]
    fn test_classify_transient_errors() {
        assert_classified(&[
            // Timeouts
            ("connection timed out", None),
            ("operation timed out", None),
            // Refused connections
            ("connection refused", None),
            // Generic network trouble
            ("network unreachable", None),
        ]);
    }

    #[test]
    fn test_record_new_entry() {
        let tracker = NaughtyListTracker::with_defaults();

        let first_time = tracker.record(
            BAD_RELAY,
            NaughtyCategory::DnsLookupFailed,
            String::from("failed to lookup address"),
        );
        assert!(first_time);
        assert!(tracker.is_naughty(BAD_RELAY));

        let stored = tracker.get_entry(BAD_RELAY).unwrap();
        assert_eq!(stored.category, NaughtyCategory::DnsLookupFailed);
        assert_eq!(stored.occurrence_count, 1);
    }

    #[test]
    fn test_record_updates_existing() {
        let tracker = NaughtyListTracker::with_defaults();
        let category = NaughtyCategory::DnsLookupFailed;

        // The first report creates the entry, the second one only updates it
        assert!(tracker.record(BAD_RELAY, category, String::from("error 1")));
        assert!(!tracker.record(BAD_RELAY, category, String::from("error 2")));

        let stored = tracker.get_entry(BAD_RELAY).unwrap();
        assert_eq!(stored.occurrence_count, 2);
        // The stored reason follows the most recent report
        assert_eq!(stored.reason, "error 2");
    }

    #[test]
    fn test_is_naughty() {
        let tracker = NaughtyListTracker::with_defaults();
        assert!(!tracker.is_naughty(BAD_RELAY));

        tracker.record(
            BAD_RELAY,
            NaughtyCategory::TlsCertificateInvalid,
            String::from("cert error"),
        );
        assert!(tracker.is_naughty(BAD_RELAY));
    }

    #[test]
    fn test_get_all() {
        let tracker = NaughtyListTracker::with_defaults();
        let reports = [
            (RELAY_1, NaughtyCategory::DnsLookupFailed, "dns error"),
            (RELAY_2, NaughtyCategory::TlsCertificateInvalid, "tls error"),
        ];
        for (url, category, reason) in reports.iter() {
            tracker.record(url, *category, String::from(*reason));
        }

        assert_eq!(tracker.get_all().len(), 2);
    }

    #[test]
    fn test_count_by_category() {
        let tracker = NaughtyListTracker::with_defaults();
        let reports = [
            (RELAY_1, NaughtyCategory::DnsLookupFailed),
            (RELAY_2, NaughtyCategory::DnsLookupFailed),
            (RELAY_3, NaughtyCategory::TlsCertificateInvalid),
        ];
        for (url, category) in reports.iter() {
            record_generic(&tracker, url, *category);
        }

        let expected_counts = [
            (NaughtyCategory::DnsLookupFailed, 2),
            (NaughtyCategory::TlsCertificateInvalid, 1),
            (NaughtyCategory::ProtocolError, 0),
        ];
        for (category, expected) in expected_counts.iter() {
            assert_eq!(tracker.count_by_category(*category), *expected);
        }
    }

    #[test]
    fn test_total_count() {
        let tracker = NaughtyListTracker::with_defaults();
        assert_eq!(tracker.total_count(), 0);

        record_generic(&tracker, RELAY_1, NaughtyCategory::DnsLookupFailed);
        assert_eq!(tracker.total_count(), 1);

        record_generic(&tracker, RELAY_2, NaughtyCategory::TlsCertificateInvalid);
        assert_eq!(tracker.total_count(), 2);
    }

    #[test]
    fn test_expire_old_entries() {
        // Zero hours: entries are expired from the moment they are recorded
        let tracker = NaughtyListTracker::new(0);
        record_generic(&tracker, RELAY_1, NaughtyCategory::DnsLookupFailed);

        // The entry is stored in the map...
        assert_eq!(tracker.total_count(), 1);
        // ...but it is already expired, so the relay is not reported as naughty
        assert!(!tracker.is_naughty(RELAY_1));

        // Let a little time pass before purging
        std::thread::sleep(std::time::Duration::from_millis(10));

        // Purging removes the expired entry and reports its URL
        let removed = tracker.expire_old_entries();
        assert_eq!(removed.len(), 1);
        assert_eq!(removed[0], RELAY_1);

        // Nothing is left afterwards
        assert!(!tracker.is_naughty(RELAY_1));
        assert_eq!(tracker.total_count(), 0);
    }

    #[test]
    fn test_category_display() {
        let expected = [
            (NaughtyCategory::DnsLookupFailed, "dns_lookup_failed"),
            (NaughtyCategory::TlsCertificateInvalid, "tls_certificate_invalid"),
            (NaughtyCategory::ProtocolError, "protocol_error"),
        ];
        for (category, label) in expected.iter() {
            assert_eq!(category.to_string(), *label);
        }
    }

    #[test]
    fn test_category_as_str() {
        let expected = [
            (NaughtyCategory::DnsLookupFailed, "dns_lookup_failed"),
            (NaughtyCategory::TlsCertificateInvalid, "tls_certificate_invalid"),
            (NaughtyCategory::ProtocolError, "protocol_error"),
        ];
        for (category, label) in expected.iter() {
            assert_eq!(category.as_str(), *label);
        }
    }
}