diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-09 14:12:24 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-09 14:12:24 +0000 |
| commit | 93a1684f068603b354ba3c05957a25459c73de05 (patch) | |
| tree | 324e6d0e2a6a34fd4804ef94133cd35233081bb9 /src/sync/metrics.rs | |
| parent | c34492069abacae67482af4c8356241958a524f7 (diff) | |
feat(sync): add ConnectedDegraded status for failed historic sync
- Add ConnectionStatus::ConnectedDegraded (status=4 in metrics)
- Track batch failures via PendingBatch.failed field
- Track relay-level failures via RelayState.historic_sync_had_failures
- Transition to ConnectedDegraded when any batch fails during historic sync
- Add is_live_sync_active() helper for cleaner match patterns
- Update state machine diagram with ConnectedDegraded transitions
- Update metrics docs with status=4 and example queries
Fixes an issue where relays with failed negentropy retries would
incorrectly transition to Connected status despite missing data.
Operators can now distinguish 'fully synced' from 'degraded (partial data)'.
Diffstat (limited to 'src/sync/metrics.rs')
| -rw-r--r-- | src/sync/metrics.rs | 4 |
1 file changed, 3 insertions, 1 deletion
diff --git a/src/sync/metrics.rs b/src/sync/metrics.rs index db7dd20..0f56911 100644 --- a/src/sync/metrics.rs +++ b/src/sync/metrics.rs | |||
| @@ -53,7 +53,7 @@ impl SyncMetrics { | |||
| 53 | let relay_connected = IntGaugeVec::new( | 53 | let relay_connected = IntGaugeVec::new( |
| 54 | Opts::new( | 54 | Opts::new( |
| 55 | "ngit_sync_relay_connected", | 55 | "ngit_sync_relay_connected", |
| 56 | "Relay connection status (0=disconnected, 1=connecting, 2=syncing, 3=connected)", | 56 | "Relay connection status (0=disconnected, 1=connecting, 2=syncing, 3=connected, 4=connected_degraded)", |
| 57 | ), | 57 | ), |
| 58 | &["relay"], | 58 | &["relay"], |
| 59 | )?; | 59 | )?; |
| @@ -208,6 +208,7 @@ impl SyncMetrics { | |||
| 208 | /// - Connecting = 1 (connection attempt in progress) | 208 | /// - Connecting = 1 (connection attempt in progress) |
| 209 | /// - Syncing = 2 (connected, historic sync in progress) | 209 | /// - Syncing = 2 (connected, historic sync in progress) |
| 210 | /// - Connected = 3 (connected, historic sync complete) | 210 | /// - Connected = 3 (connected, historic sync complete) |
| 211 | /// - ConnectedDegraded = 4 (connected, historic sync failed but live sync active) | ||
| 211 | /// | 212 | /// |
| 212 | /// This is separate from health state and provides more granular connection lifecycle tracking. | 213 | /// This is separate from health state and provides more granular connection lifecycle tracking. |
| 213 | /// | 214 | /// |
| @@ -222,6 +223,7 @@ impl SyncMetrics { | |||
| 222 | ConnectionStatus::Connecting => 1, | 223 | ConnectionStatus::Connecting => 1, |
| 223 | ConnectionStatus::Syncing => 2, | 224 | ConnectionStatus::Syncing => 2, |
| 224 | ConnectionStatus::Connected => 3, | 225 | ConnectionStatus::Connected => 3, |
| 226 | ConnectionStatus::ConnectedDegraded => 4, | ||
| 225 | }; | 227 | }; |
| 226 | self.relay_connected | 228 | self.relay_connected |
| 227 | .with_label_values(&[relay]) | 229 | .with_label_values(&[relay]) |