From eb10e85f199266affd3bca0a3d4cd934f74f3e7f Mon Sep 17 00:00:00 2001 From: DanConwayDev Date: Fri, 9 Jan 2026 09:24:17 +0000 Subject: feat(sync): prevent infinite retry loop in negentropy validation Add retry protection to negentropy event validation: - Track retry_count in PendingBatch (incremented on each retry attempt) - Detect when retry makes zero progress (relay returns no requested events) - Abort retry and complete batch with partial results when stuck - Log error with full details when retry protection triggers This prevents infinite loops when: - Relay has bugs and returns wrong events for ID queries - Relay is malicious and returns unrelated events - Relay has eventual consistency issues - Network corruption causes incorrect responses The protection triggers when received_count == 0 on a retry (relay returned nothing we asked for), indicating the relay will never provide the missing events. Future work: Track failed batches in Prometheus metrics (sync_failed_batches_total) for monitoring and alerting. --- src/sync/algorithms.rs | 2 ++ src/sync/mod.rs | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'src/sync') diff --git a/src/sync/algorithms.rs b/src/sync/algorithms.rs index 4679986..e083dc8 100644 --- a/src/sync/algorithms.rs +++ b/src/sync/algorithms.rs @@ -404,6 +404,7 @@ mod tests { pagination_state: HashMap::new(), requested_event_ids: None, received_event_ids: None, + retry_count: 0, }], ); @@ -518,6 +519,7 @@ mod tests { pagination_state: HashMap::new(), requested_event_ids: None, received_event_ids: None, + retry_count: 0, }], ); diff --git a/src/sync/mod.rs b/src/sync/mod.rs index d33364f..00668ac 100644 --- a/src/sync/mod.rs +++ b/src/sync/mod.rs @@ -202,6 +202,9 @@ pub struct PendingBatch { /// Event IDs actually received for this batch (None for REQ+EOSE) /// Compared against requested_event_ids to detect missing events pub received_event_ids: Option>, + /// Number of retry attempts for missing events (Negentropy only) + /// Used to prevent infinite retry loops when relay consistently fails + pub retry_count: usize, } /// Items included in a pending batch @@ -651,10 +654,38 @@ impl SyncManager { if !missing.is_empty() { let requested_count = requested.len(); let received_count = received.len(); + let retry_count = batch.retry_count; + + // Check if we made any progress (received ANY events we requested) + // If received_count is 0, relay returned nothing useful - abort retry + if retry_count > 0 && received_count == 0 { + tracing::error!( + relay = %relay_url, + batch_id = batch.batch_id, + retry_count = retry_count, + requested_count = requested_count, + missing_count = missing.len(), + missing_ids = ?missing.iter().map(|id| id.to_hex()).collect::>(), + "Negentropy retry made no progress - relay returned zero requested events. \ + Aborting retry to prevent infinite loop. Completing batch with partial results." + // TODO: Track this failure in Prometheus metrics (sync_failed_batches_total) + ); + + // Extract and complete batch with partial results + let batch_idx_for_completion = batch_idx; + let completed_batch = batches.remove(batch_idx_for_completion); + if batches.is_empty() { + pending.remove(relay_url); + } + drop(pending); + self.confirm_batch(relay_url, completed_batch).await; + return; + } tracing::warn!( relay = %relay_url, batch_id = batch.batch_id, + retry_count = retry_count, requested_count = requested_count, received_count = received_count, missing_count = missing.len(), @@ -710,12 +741,15 @@ impl SyncManager { Some(missing.iter().cloned().collect()); // Clear received_event_ids for fresh tracking batch.received_event_ids = Some(HashSet::new()); + // Increment retry counter + batch.retry_count += 1; tracing::info!( relay = %relay_url_for_retry, batch_id = batch_id, retry_subs = new_sub_ids.len(), missing_events = missing.len(), + retry_attempt = batch.retry_count, "Created retry subscriptions for missing negentropy events" ); } @@ -2373,6 +2407,7 @@ impl SyncManager { pagination_state: HashMap::new(), // Negentropy doesn't use pagination requested_event_ids: None, // Will be set after negentropy diff received_event_ids: None, // Will be set after negentropy diff + retry_count: 0, }; // Add to pending_sync_index @@ -2610,6 +2645,7 @@ impl SyncManager { pagination_state, requested_event_ids: None, // Not used for REQ+EOSE received_event_ids: None, // Not used for REQ+EOSE + retry_count: 0, // Not used for REQ+EOSE }; // Add to pending_sync_index @@ -2822,11 +2858,13 @@ mod tests { pagination_state: HashMap::new(), requested_event_ids: Some(HashSet::new()), received_event_ids: Some(HashSet::new()), + retry_count: 0, }; assert!(batch.requested_event_ids.is_some()); assert!(batch.received_event_ids.is_some()); assert_eq!(batch.sync_method, SyncMethod::Negentropy); + assert_eq!(batch.retry_count, 0); } #[test] @@ -2840,6 +2878,7 @@ mod tests { pagination_state: HashMap::new(), requested_event_ids: None, received_event_ids: None, + retry_count: 0, }; assert!(batch.requested_event_ids.is_none()); -- cgit v1.2.3