diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2025-12-04 17:58:31 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2025-12-04 17:58:31 +0000 |
| commit | f639ecfac6687c9e8de4e3f305e168b2e4e1bb87 (patch) | |
| tree | cfcbf16a937a59048930ccaf8557f78ed5576bde /tests | |
| parent | bf558b0dc17e14f96eea624ea5591315a2909154 (diff) | |
feat(sync): Phase 3 - resilience and health tracking
- Add RelayHealthTracker with DashMap
- Implement exponential backoff (5s -> 1h max)
- Handle dead relays (24h failures -> daily retry)
- Add startup jitter to prevent thundering herd
- Add NGIT_SYNC_MAX_BACKOFF_SECS config
Diffstat (limited to 'tests')
| -rw-r--r-- | tests/proactive_sync_resilience.rs | 476 |
1 files changed, 476 insertions, 0 deletions
diff --git a/tests/proactive_sync_resilience.rs b/tests/proactive_sync_resilience.rs new file mode 100644 index 0000000..60b18dd --- /dev/null +++ b/tests/proactive_sync_resilience.rs | |||
| @@ -0,0 +1,476 @@ | |||
| 1 | //! Integration tests for GRASP-02 Phase 3: Resilience & Health Tracking | ||
| 2 | //! | ||
| 3 | //! Tests verify: | ||
| 4 | //! - Exponential backoff on connection failures (5s → 1h max) | ||
| 5 | //! - Relays stay Degraded (not Dead) while less than 24h of failures has elapsed | ||
| 6 | //! - Successful connection resets to Healthy | ||
| 7 | //! - (Not exercised in this file: Dead relays retry minimally, once per day) | ||
| 8 | //! - Health state tracking is thread-safe | ||
| 9 | |||
| 10 | use std::time::{Duration, Instant}; | ||
| 11 | |||
| 12 | use ngit_grasp::sync::health::{HealthState, RelayHealthTracker}; | ||
| 13 | |||
/// A lone failure is enough to move a relay into the Degraded state.
#[test]
fn test_single_failure_causes_degraded_state() {
    let relay = "wss://test-relay.example.com";
    let health = RelayHealthTracker::with_defaults();

    // Before anything has been recorded, connecting is permitted.
    assert!(health.should_attempt_connection(relay));

    health.record_failure(relay);

    // Exactly one failure on record, and the relay is now degraded.
    assert_eq!(health.get_state(relay), HealthState::Degraded);
    assert_eq!(health.get_failure_count(relay), 1);
}
| 30 | |||
/// A success after a run of failures clears the streak and restores Healthy.
#[test]
fn test_success_resets_to_healthy() {
    let relay = "wss://test-relay.example.com";
    let health = RelayHealthTracker::with_defaults();

    // Three back-to-back failures.
    for _ in 0..3 {
        health.record_failure(relay);
    }

    assert_eq!(health.get_state(relay), HealthState::Degraded);
    assert_eq!(health.get_failure_count(relay), 3);

    // One success must undo all of the above.
    health.record_success(relay);

    assert_eq!(health.get_state(relay), HealthState::Healthy);
    assert_eq!(health.get_failure_count(relay), 0);
    assert!(health.should_attempt_connection(relay));
}
| 52 | |||
/// The backoff doubles with every consecutive failure, starting from 5s.
#[test]
fn test_exponential_backoff_calculation() {
    let cap_secs = 3600u64; // 1 hour

    // (consecutive failures, expected delay in seconds) = 5 * 2^(failures - 1)
    let schedule = [
        (1, 5),     // 5 * 2^0
        (2, 10),    // 5 * 2^1
        (3, 20),    // 5 * 2^2
        (4, 40),    // 5 * 2^3
        (5, 80),    // 5 * 2^4
        (6, 160),   // 5 * 2^5
        (7, 320),   // 5 * 2^6
        (8, 640),   // 5 * 2^7
        (9, 1280),  // 5 * 2^8
        (10, 2560), // 5 * 2^9
    ];

    for (failures, secs) in schedule {
        assert_eq!(
            RelayHealthTracker::get_backoff_duration(failures, cap_secs),
            Duration::from_secs(secs)
        );
    }
}
| 118 | |||
/// Once the doubling would exceed the ceiling, the ceiling itself is returned.
#[test]
fn test_backoff_capped_at_maximum() {
    let cap_secs = 3600u64; // 1 hour
    let ceiling = Duration::from_secs(cap_secs);

    // Each of these failure counts is checked against the ceiling.
    for failures in [15, 20, 100] {
        assert_eq!(
            RelayHealthTracker::get_backoff_duration(failures, cap_secs),
            ceiling
        );
    }
}
| 140 | |||
/// Test that a custom max_backoff is respected, both by the static backoff
/// calculation and by a tracker constructed with that maximum.
#[test]
fn test_custom_max_backoff() {
    let custom_max = 60u64; // 1 minute max

    // After several failures, the static calculation should cap at custom max
    assert_eq!(
        RelayHealthTracker::get_backoff_duration(10, custom_max),
        Duration::from_secs(custom_max)
    );

    // Tracker with custom max should use it
    let tracker = RelayHealthTracker::with_max_backoff(custom_max);
    let url = "wss://test-relay.example.com";

    // Simulate many failures
    for _ in 0..20 {
        tracker.record_failure(url);
    }

    // Should still be degraded (not dead without 24h)
    assert_eq!(tracker.get_state(url), HealthState::Degraded);

    // The state check above passes for any maximum, so also check the wait
    // the tracker actually imposes: after 20 failures it must not exceed the
    // custom cap.
    let remaining = tracker
        .get_remaining_backoff(url)
        .expect("a relay that just failed should be in backoff");
    assert!(
        remaining <= Duration::from_secs(custom_max),
        "remaining backoff {:?} exceeds custom max of {}s",
        remaining,
        custom_max
    );
}
| 164 | |||
/// Right after a failure the relay must not be redialled: backoff is in force.
#[test]
fn test_backoff_blocks_immediate_reconnection() {
    let relay = "wss://test-relay.example.com";
    let health = RelayHealthTracker::with_defaults();

    // Nothing recorded yet, so the first attempt is allowed.
    assert!(health.should_attempt_connection(relay));

    health.record_failure(relay);

    // The failure was just recorded; the tracker must refuse a new attempt.
    assert!(!health.should_attempt_connection(relay));

    // And it must report a strictly positive wait.
    let wait = health.get_remaining_backoff(relay);
    assert!(wait.is_some());
    assert!(wait.unwrap() > Duration::ZERO);
}
| 185 | |||
/// Health is kept per relay URL: activity on one relay never leaks into another.
#[test]
fn test_multiple_relays_independent() {
    let health = RelayHealthTracker::with_defaults();
    let failing = "wss://relay1.example.com";
    let working = "wss://relay2.example.com";
    let flaky = "wss://relay3.example.com";

    // relay1 fails three times, relay2 succeeds once, relay3 fails once.
    for _ in 0..3 {
        health.record_failure(failing);
    }
    health.record_success(working);
    health.record_failure(flaky);

    // (relay, expected state, expected consecutive failures)
    let expectations = [
        (failing, HealthState::Degraded, 3),
        (working, HealthState::Healthy, 0),
        (flaky, HealthState::Degraded, 1),
    ];

    for (relay, state, failures) in expectations {
        assert_eq!(health.get_state(relay), state);
        assert_eq!(health.get_failure_count(relay), failures);
    }
}
| 215 | |||
/// `is_dead` must stay false for a relay that is merely Degraded.
#[test]
fn test_is_dead_false_for_degraded() {
    let relay = "wss://test-relay.example.com";
    let health = RelayHealthTracker::with_defaults();

    // Ten failures in quick succession.
    let mut recorded = 0;
    while recorded < 10 {
        health.record_failure(relay);
        recorded += 1;
    }

    // Degraded, but not dead: 24h of failures have not elapsed.
    assert_eq!(health.get_state(relay), HealthState::Degraded);
    assert!(!health.is_dead(relay));
}
| 231 | |||
/// `get_tracked_relays` lists every URL that has had an outcome recorded.
#[test]
fn test_get_tracked_relays() {
    let health = RelayHealthTracker::with_defaults();
    let [first, second, third] = [
        "wss://relay1.example.com",
        "wss://relay2.example.com",
        "wss://relay3.example.com",
    ];

    // Both successes and failures should register a relay as tracked.
    health.record_success(first);
    health.record_failure(second);
    health.record_success(third);

    let tracked = health.get_tracked_relays();
    assert_eq!(tracked.len(), 3);
    for relay in [first, second, third] {
        assert!(tracked.contains(&relay.to_string()));
    }
}
| 248 | |||
/// `get_health` exposes the recorded health info for a known relay.
#[test]
fn test_get_health_returns_clone() {
    let relay = "wss://test-relay.example.com";
    let tracker = RelayHealthTracker::with_defaults();

    tracker.record_success(relay);

    // The relay has been seen, so a snapshot must exist.
    let lookup = tracker.get_health(relay);
    assert!(lookup.is_some());

    // After a single success: healthy, success timestamp set, no failures.
    let snapshot = lookup.unwrap();
    assert_eq!(snapshot.state, HealthState::Healthy);
    assert!(snapshot.last_success_time.is_some());
    assert_eq!(snapshot.consecutive_failures, 0);
}
| 267 | |||
/// Looking up a relay that was never recorded yields `None`.
#[test]
fn test_get_health_nonexistent() {
    let lookup = RelayHealthTracker::with_defaults().get_health("wss://nonexistent.example.com");

    assert!(lookup.is_none());
}
| 276 | |||
/// A relay with no recorded history may be connected to immediately.
#[test]
fn test_new_relay_allows_connection() {
    let unseen = "wss://brand-new-relay.example.com";
    let health = RelayHealthTracker::with_defaults();

    assert!(health.should_attempt_connection(unseen));
}
| 285 | |||
/// Each `HealthState` variant renders as its lowercase name.
#[test]
fn test_health_state_display() {
    let renderings = [
        (HealthState::Healthy, "healthy"),
        (HealthState::Degraded, "degraded"),
        (HealthState::Dead, "dead"),
    ];

    for (state, text) in renderings {
        assert_eq!(state.to_string(), text);
    }
}
| 293 | |||
/// Test thread safety with genuinely concurrent access.
///
/// Real OS threads are used rather than `tokio::spawn` under `#[tokio::test]`:
/// that attribute defaults to a single-threaded runtime, and the spawned work
/// contains no `.await`, so the tasks would run one after another and the
/// tracker would never see overlapping calls.
#[test]
fn test_concurrent_health_tracking() {
    use std::sync::Arc;

    let tracker = Arc::new(RelayHealthTracker::with_defaults());
    let url = "wss://concurrent-test-relay.example.com";

    // Ten workers hammer the same relay entry: even workers record a failure,
    // odd workers record a success, and each then reads the state back.
    let handles: Vec<_> = (0..10)
        .map(|i| {
            let tracker = Arc::clone(&tracker);
            std::thread::spawn(move || {
                if i % 2 == 0 {
                    tracker.record_failure(url);
                } else {
                    tracker.record_success(url);
                }
                tracker.get_state(url);
                tracker.should_attempt_connection(url);
            })
        })
        .collect();

    // A panic inside any worker must fail the test.
    for handle in handles {
        handle.join().expect("worker thread panicked");
    }

    // Tracker should still be usable
    let health = tracker.get_health(url);
    assert!(health.is_some());

    // Only five failures were recorded in total, so whatever the
    // interleaving, the consecutive-failure streak cannot exceed five.
    assert!(tracker.get_failure_count(url) <= 5);
}
| 329 | |||
/// The consecutive-failure counter grows by one per failure and restarts after a success.
#[test]
fn test_failure_streak_tracking() {
    let relay = "wss://test-relay.example.com";
    let health = RelayHealthTracker::with_defaults();

    // Five failures in a row; the counter must track each one.
    let mut streak = 0;
    while streak < 5 {
        health.record_failure(relay);
        streak += 1;
        assert_eq!(health.get_failure_count(relay), streak);
    }

    // A success wipes the streak.
    health.record_success(relay);
    assert_eq!(health.get_failure_count(relay), 0);

    // The next failure starts counting from one again.
    health.record_failure(relay);
    assert_eq!(health.get_failure_count(relay), 1);
}
| 350 | |||
/// A degraded relay returns to full health on its next success.
#[test]
fn test_recovery_from_degraded() {
    let relay = "wss://test-relay.example.com";
    let health = RelayHealthTracker::with_defaults();

    // One failure puts the relay into Degraded.
    health.record_failure(relay);
    assert_eq!(health.get_state(relay), HealthState::Degraded);

    // One success brings it back: healthy, connectable, no pending backoff.
    health.record_success(relay);
    assert_eq!(health.get_state(relay), HealthState::Healthy);
    assert!(health.should_attempt_connection(relay));
    assert!(health.get_remaining_backoff(relay).is_none());
}
| 367 | |||
/// A success clears any backoff left over from an earlier failure.
#[test]
fn test_no_remaining_backoff_after_success() {
    let relay = "wss://test-relay.example.com";
    let health = RelayHealthTracker::with_defaults();

    // A failure leaves a backoff pending.
    health.record_failure(relay);
    let pending = health.get_remaining_backoff(relay);
    assert!(pending.is_some());

    // A success removes it.
    health.record_success(relay);
    let pending = health.get_remaining_backoff(relay);
    assert!(pending.is_none());
}
| 382 | |||
/// End-to-end walk through connect, drop, repeated failed retries and reconnect.
#[test]
fn test_realistic_connection_lifecycle() {
    let relay = "wss://production-relay.example.com";
    let health = RelayHealthTracker::with_max_backoff(60); // 1 minute max for test

    // The relay starts out reachable.
    health.record_success(relay);
    assert_eq!(health.get_state(relay), HealthState::Healthy);

    // The connection drops: the first failure degrades the relay.
    health.record_failure(relay);
    assert_eq!(health.get_state(relay), HealthState::Degraded);
    assert_eq!(health.get_failure_count(relay), 1);

    // Two further retries fail; the streak keeps growing.
    for streak in 2..=3 {
        health.record_failure(relay);
        assert_eq!(health.get_failure_count(relay), streak);
    }

    // The relay comes back and everything resets.
    health.record_success(relay);
    assert_eq!(health.get_state(relay), HealthState::Healthy);
    assert_eq!(health.get_failure_count(relay), 0);
    assert!(health.should_attempt_connection(relay));
}
| 412 | |||
/// Test the full backoff timing sequence, including the point where the
/// exponential growth first hits the cap.
#[test]
fn test_backoff_timing_sequence() {
    // With default max of 3600s (1 hour), verify the progression
    let max = 3600u64;

    // (consecutive failures, expected backoff in seconds).
    // A fixed-size array is enough here; no heap-allocated Vec is needed for
    // a literal table that is only iterated once.
    let expected = [
        (1, 5),     // 5s
        (2, 10),    // 10s
        (3, 20),    // 20s
        (4, 40),    // 40s
        (5, 80),    // 80s
        (6, 160),   // 160s (~2.7 min)
        (7, 320),   // 320s (~5.3 min)
        (8, 640),   // 640s (~10.7 min)
        (9, 1280),  // 1280s (~21.3 min)
        (10, 2560), // 2560s (~42.7 min)
        (11, 3600), // capped at 3600s (1 hour)
        (12, 3600), // still capped
    ];

    for (failures, expected_secs) in expected {
        assert_eq!(
            RelayHealthTracker::get_backoff_duration(failures, max),
            Duration::from_secs(expected_secs),
            "Failed for {} failures",
            failures
        );
    }
}
| 443 | |||
/// Success and failure timestamps must fall inside the window in which they were recorded.
#[test]
fn test_timestamp_tracking() {
    let relay = "wss://test-relay.example.com";
    let tracker = RelayHealthTracker::with_defaults();

    // Bracket the success with two clock readings.
    let success_window = {
        let start = Instant::now();
        tracker.record_success(relay);
        start..=Instant::now()
    };

    let snapshot = tracker.get_health(relay).unwrap();
    let succeeded_at = snapshot.last_success_time.unwrap();

    // The stored success time lies within the bracket.
    assert!(success_window.contains(&succeeded_at));

    // Same again for a failure.
    let failure_window = {
        let start = Instant::now();
        tracker.record_failure(relay);
        start..=Instant::now()
    };

    let snapshot = tracker.get_health(relay).unwrap();
    let failed_at = snapshot.last_failure_time.unwrap();
    let streak_began_at = snapshot.first_failure_time.unwrap();

    // This is the only failure so far, so both the latest-failure and
    // first-failure timestamps must lie within the bracket.
    assert!(failure_window.contains(&failed_at));
    assert!(failure_window.contains(&streak_began_at));
}