diff options
| -rw-r--r-- | docs/explanation/grasp-02-proactive-sync.md | 20 | ||||
| -rw-r--r-- | docs/explanation/monitoring.md | 10 | ||||
| -rw-r--r-- | src/sync/metrics.rs | 6 | ||||
| -rw-r--r-- | src/sync/mod.rs | 12 |
4 files changed, 24 insertions, 24 deletions
diff --git a/docs/explanation/grasp-02-proactive-sync.md b/docs/explanation/grasp-02-proactive-sync.md index b17b8bf..e983316 100644 --- a/docs/explanation/grasp-02-proactive-sync.md +++ b/docs/explanation/grasp-02-proactive-sync.md | |||
| @@ -79,8 +79,8 @@ pub enum ConnectionStatus { | |||
| 79 | Syncing, | 79 | Syncing, |
| 80 | /// Successfully connected, historic sync completed | 80 | /// Successfully connected, historic sync completed |
| 81 | Connected, | 81 | Connected, |
| 82 | /// Successfully connected, historic sync failed but live sync active | 82 | /// Successfully connected, historic sync had failures but live sync active |
| 83 | ConnectedDegraded, | 83 | ConnectedHistoricSyncFailures, |
| 84 | } | 84 | } |
| 85 | 85 | ||
| 86 | /// Complete state for a single relay - combines sync needs with connection lifecycle | 86 | /// Complete state for a single relay - combines sync needs with connection lifecycle |
| @@ -210,18 +210,18 @@ stateDiagram-v2 | |||
| 210 | Connecting --> Syncing: success → handle_connect_or_reconnect | 210 | Connecting --> Syncing: success → handle_connect_or_reconnect |
| 211 | Connecting --> Disconnected: failure + record in health tracker | 211 | Connecting --> Disconnected: failure + record in health tracker |
| 212 | Syncing --> Connected: all batches succeed → check_and_complete_historic_sync | 212 | Syncing --> Connected: all batches succeed → check_and_complete_historic_sync |
| 213 | Syncing --> ConnectedDegraded: any batch failed → check_and_complete_historic_sync | 213 | Syncing --> ConnectedHistoricSyncFailures: any batch failed → check_and_complete_historic_sync |
| 214 | Syncing --> Disconnected: connection lost → handle_disconnect | 214 | Syncing --> Disconnected: connection lost → handle_disconnect |
| 215 | Connected --> Disconnected: connection lost → handle_disconnect | 215 | Connected --> Disconnected: connection lost → handle_disconnect |
| 216 | ConnectedDegraded --> Disconnected: connection lost → handle_disconnect | 216 | ConnectedHistoricSyncFailures --> Disconnected: connection lost → handle_disconnect |
| 217 | Connected --> [*]: intentional disconnect via check_disconnects | 217 | Connected --> [*]: intentional disconnect via check_disconnects |
| 218 | ConnectedDegraded --> [*]: intentional disconnect via check_disconnects | 218 | ConnectedHistoricSyncFailures --> [*]: intentional disconnect via check_disconnects |
| 219 | 219 | ||
| 220 | note right of Disconnected: disconnected_at set for 15min rule<br/>RelayConnection kept in HashMap | 220 | note right of Disconnected: disconnected_at set for 15min rule<br/>RelayConnection kept in HashMap |
| 221 | note right of Connecting: connection attempt with timeout | 221 | note right of Connecting: connection attempt with timeout |
| 222 | note right of Syncing: historic sync in progress<br/>event loop spawned here | 222 | note right of Syncing: historic sync in progress<br/>event loop spawned here |
| 223 | note right of Connected: historic sync complete<br/>last_connected tracked for since filter | 223 | note right of Connected: historic sync complete<br/>last_connected tracked for since filter |
| 224 | note right of ConnectedDegraded: historic sync failed (missing events)<br/>live sync active, partial data | 224 | note right of ConnectedHistoricSyncFailures: historic sync had failures (missing events)<br/>live sync active, partial data |
| 225 | ``` | 225 | ``` |
| 226 | 226 | ||
| 227 | ### Connection Flow Methods | 227 | ### Connection Flow Methods |
| @@ -252,22 +252,22 @@ Each layer creates one or more `PendingBatch` entries tracked in `PendingSyncInd | |||
| 252 | 2. Wait 6 seconds (batch window + buffer) for self-subscriber to process in-flight events | 252 | 2. Wait 6 seconds (batch window + buffer) for self-subscriber to process in-flight events |
| 253 | 3. Second check: Are there still no pending batches? If yes, return early | 253 | 3. Second check: Are there still no pending batches? If yes, return early |
| 254 | 4. If no pending batches after wait: | 254 | 4. If no pending batches after wait: |
| 255 | - If any batch failed: transition `Syncing` → `ConnectedDegraded` | 255 | - If any batch failed: transition `Syncing` → `ConnectedHistoricSyncFailures` |
| 256 | - If all batches succeeded: transition `Syncing` → `Connected` | 256 | - If all batches succeeded: transition `Syncing` → `Connected` |
| 257 | - Set `historic_sync_completed = true` | 257 | - Set `historic_sync_completed = true` |
| 258 | 258 | ||
| 259 | **Why the double-check?** There's an async gap between receiving EOSE and the self-subscriber processing events to create Layer 2/3 filters. The 6-second wait (5s batch window + 1s buffer) ensures we don't prematurely mark sync complete while Layer 2/3 batches are being created. | 259 | **Why the double-check?** There's an async gap between receiving EOSE and the self-subscriber processing events to create Layer 2/3 filters. The 6-second wait (5s batch window + 1s buffer) ensures we don't prematurely mark sync complete while Layer 2/3 batches are being created. |
| 260 | 260 | ||
| 261 | **Batch Failure Tracking**: When negentropy retry protection triggers (relay returns zero requested events on retry), the batch is marked as `failed = true`. This causes the relay to transition to `ConnectedDegraded` instead of `Connected`, signaling that live sync is active but historic sync is incomplete. | 261 | **Batch Failure Tracking**: When negentropy retry protection triggers (relay returns zero requested events on retry), the batch is marked as `failed = true`. This causes the relay to transition to `ConnectedHistoricSyncFailures` instead of `Connected`, signaling that live sync is active but historic sync is incomplete. |
| 262 | 262 | ||
| 263 | **Metrics tracking**: The `ngit_sync_relay_connected` metric shows: | 263 | **Metrics tracking**: The `ngit_sync_relay_connected` metric shows: |
| 264 | - `0` = Disconnected | 264 | - `0` = Disconnected |
| 265 | - `1` = Connecting | 265 | - `1` = Connecting |
| 266 | - `2` = Syncing (historic sync in progress) | 266 | - `2` = Syncing (historic sync in progress) |
| 267 | - `3` = Connected (historic sync complete, live sync active) | 267 | - `3` = Connected (historic sync complete, live sync active) |
| 268 | - `4` = ConnectedDegraded (historic sync failed, live sync active, partial data) | 268 | - `4` = ConnectedHistoricSyncFailures (historic sync had failures, live sync active, partial data) |
| 269 | 269 | ||
| 270 | This allows operators to monitor sync progress and distinguish between "connected but still catching up" vs "fully synced and live" vs "degraded (missing historic data)". | 270 | This allows operators to monitor sync progress and distinguish between "connected but still catching up" vs "fully synced and live" vs "historic sync failures (missing historic data)". |
| 271 | 271 | ||
| 272 | ### Event Loop Lifecycle | 272 | ### Event Loop Lifecycle |
| 273 | 273 | ||
diff --git a/docs/explanation/monitoring.md b/docs/explanation/monitoring.md index cc164ab..7520813 100644 --- a/docs/explanation/monitoring.md +++ b/docs/explanation/monitoring.md | |||
| @@ -98,7 +98,7 @@ When GRASP-02 proactive sync is implemented, the following metrics will be added | |||
| 98 | 98 | ||
| 99 | | Metric | Type | Labels | Description | | 99 | | Metric | Type | Labels | Description | |
| 100 | |--------|------|--------|-------------| | 100 | |--------|------|--------|-------------| |
| 101 | | `ngit_sync_relay_connected` | Gauge | relay | Connection status (0=disconnected, 1=connecting, 2=syncing, 3=connected, 4=connected_degraded) | | 101 | | `ngit_sync_relay_connected` | Gauge | relay | Connection status (0=disconnected, 1=connecting, 2=syncing, 3=connected, 4=connected_historic_sync_failures) | |
| 102 | | `ngit_sync_connection_attempts_total` | Counter | relay, result | Connection attempt outcomes | | 102 | | `ngit_sync_connection_attempts_total` | Counter | relay, result | Connection attempt outcomes | |
| 103 | | `ngit_sync_relay_status` | Gauge | relay | Health status (1=healthy, 2=disconnected, 3=degraded, 4=dead, 5=rate_limited) | | 103 | | `ngit_sync_relay_status` | Gauge | relay | Health status (1=healthy, 2=disconnected, 3=degraded, 4=dead, 5=rate_limited) | |
| 104 | | `ngit_sync_relay_failures` | Gauge | relay | Current consecutive failure count | | 104 | | `ngit_sync_relay_failures` | Gauge | relay | Current consecutive failure count | |
| @@ -115,9 +115,9 @@ The `ngit_sync_relay_connected` metric tracks the connection lifecycle: | |||
| 115 | - `1` = **Connecting** - Connection attempt in progress | 115 | - `1` = **Connecting** - Connection attempt in progress |
| 116 | - `2` = **Syncing** - Connected, historic sync in progress | 116 | - `2` = **Syncing** - Connected, historic sync in progress |
| 117 | - `3` = **Connected** - Connected, historic sync complete, live sync active | 117 | - `3` = **Connected** - Connected, historic sync complete, live sync active |
| 118 | - `4` = **ConnectedDegraded** - Connected, historic sync failed, live sync active, partial data | 118 | - `4` = **ConnectedHistoricSyncFailures** - Connected, historic sync had failures, live sync active, partial data |
| 119 | 119 | ||
| 120 | This allows operators to distinguish between "connected but still catching up" (Syncing) vs "fully synced and live" (Connected) vs "degraded - missing historic data" (ConnectedDegraded). | 120 | This allows operators to distinguish between "connected but still catching up" (Syncing) vs "fully synced and live" (Connected) vs "historic sync failures - missing historic data" (ConnectedHistoricSyncFailures). |
| 121 | 121 | ||
| 122 | ### Relay Health States | 122 | ### Relay Health States |
| 123 | 123 | ||
| @@ -137,12 +137,12 @@ sum by (relay) (ngit_sync_relay_connected == 0) # Disconnected | |||
| 137 | sum by (relay) (ngit_sync_relay_connected == 1) # Connecting | 137 | sum by (relay) (ngit_sync_relay_connected == 1) # Connecting |
| 138 | sum by (relay) (ngit_sync_relay_connected == 2) # Syncing | 138 | sum by (relay) (ngit_sync_relay_connected == 2) # Syncing |
| 139 | sum by (relay) (ngit_sync_relay_connected == 3) # Connected | 139 | sum by (relay) (ngit_sync_relay_connected == 3) # Connected |
| 140 | sum by (relay) (ngit_sync_relay_connected == 4) # ConnectedDegraded | 140 | sum by (relay) (ngit_sync_relay_connected == 4) # ConnectedHistoricSyncFailures |
| 141 | 141 | ||
| 142 | # Relays still syncing (not yet fully caught up) | 142 | # Relays still syncing (not yet fully caught up) |
| 143 | count(ngit_sync_relay_connected == 2) | 143 | count(ngit_sync_relay_connected == 2) |
| 144 | 144 | ||
| 145 | # Relays with degraded sync (missing historic data) | 145 | # Relays with historic sync failures (missing historic data) |
| 146 | count(ngit_sync_relay_connected == 4) | 146 | count(ngit_sync_relay_connected == 4) |
| 147 | 147 | ||
| 148 | # Connection success rate over last hour | 148 | # Connection success rate over last hour |
diff --git a/src/sync/metrics.rs b/src/sync/metrics.rs index 0f56911..7907d8e 100644 --- a/src/sync/metrics.rs +++ b/src/sync/metrics.rs | |||
| @@ -53,7 +53,7 @@ impl SyncMetrics { | |||
| 53 | let relay_connected = IntGaugeVec::new( | 53 | let relay_connected = IntGaugeVec::new( |
| 54 | Opts::new( | 54 | Opts::new( |
| 55 | "ngit_sync_relay_connected", | 55 | "ngit_sync_relay_connected", |
| 56 | "Relay connection status (0=disconnected, 1=connecting, 2=syncing, 3=connected, 4=connected_degraded)", | 56 | "Relay connection status (0=disconnected, 1=connecting, 2=syncing, 3=connected, 4=connected_historic_sync_failures)", |
| 57 | ), | 57 | ), |
| 58 | &["relay"], | 58 | &["relay"], |
| 59 | )?; | 59 | )?; |
| @@ -208,7 +208,7 @@ impl SyncMetrics { | |||
| 208 | /// - Connecting = 1 (connection attempt in progress) | 208 | /// - Connecting = 1 (connection attempt in progress) |
| 209 | /// - Syncing = 2 (connected, historic sync in progress) | 209 | /// - Syncing = 2 (connected, historic sync in progress) |
| 210 | /// - Connected = 3 (connected, historic sync complete) | 210 | /// - Connected = 3 (connected, historic sync complete) |
| 211 | /// - ConnectedDegraded = 4 (connected, historic sync failed but live sync active) | 211 | /// - ConnectedHistoricSyncFailures = 4 (connected, historic sync had failures but live sync active) |
| 212 | /// | 212 | /// |
| 213 | /// This is separate from health state and provides more granular connection lifecycle tracking. | 213 | /// This is separate from health state and provides more granular connection lifecycle tracking. |
| 214 | /// | 214 | /// |
| @@ -223,7 +223,7 @@ impl SyncMetrics { | |||
| 223 | ConnectionStatus::Connecting => 1, | 223 | ConnectionStatus::Connecting => 1, |
| 224 | ConnectionStatus::Syncing => 2, | 224 | ConnectionStatus::Syncing => 2, |
| 225 | ConnectionStatus::Connected => 3, | 225 | ConnectionStatus::Connected => 3, |
| 226 | ConnectionStatus::ConnectedDegraded => 4, | 226 | ConnectionStatus::ConnectedHistoricSyncFailures => 4, |
| 227 | }; | 227 | }; |
| 228 | self.relay_connected | 228 | self.relay_connected |
| 229 | .with_label_values(&[relay]) | 229 | .with_label_values(&[relay]) |
diff --git a/src/sync/mod.rs b/src/sync/mod.rs index 2031ef4..0e5b9bb 100644 --- a/src/sync/mod.rs +++ b/src/sync/mod.rs | |||
| @@ -94,8 +94,8 @@ pub enum ConnectionStatus { | |||
| 94 | Syncing, | 94 | Syncing, |
| 95 | /// Successfully connected, historic sync completed | 95 | /// Successfully connected, historic sync completed |
| 96 | Connected, | 96 | Connected, |
| 97 | /// Successfully connected, historic sync failed but live sync active | 97 | /// Successfully connected, historic sync had failures but live sync active |
| 98 | ConnectedDegraded, | 98 | ConnectedHistoricSyncFailures, |
| 99 | } | 99 | } |
| 100 | 100 | ||
| 101 | impl ConnectionStatus { | 101 | impl ConnectionStatus { |
| @@ -103,7 +103,7 @@ impl ConnectionStatus { | |||
| 103 | pub fn is_live_sync_active(&self) -> bool { | 103 | pub fn is_live_sync_active(&self) -> bool { |
| 104 | matches!( | 104 | matches!( |
| 105 | self, | 105 | self, |
| 106 | ConnectionStatus::Syncing | ConnectionStatus::Connected | ConnectionStatus::ConnectedDegraded | 106 | ConnectionStatus::Syncing | ConnectionStatus::Connected | ConnectionStatus::ConnectedHistoricSyncFailures |
| 107 | ) | 107 | ) |
| 108 | } | 108 | } |
| 109 | } | 109 | } |
| @@ -877,7 +877,7 @@ impl SyncManager { | |||
| 877 | tracing::warn!( | 877 | tracing::warn!( |
| 878 | relay = %relay_url, | 878 | relay = %relay_url, |
| 879 | batch_id = batch_id, | 879 | batch_id = batch_id, |
| 880 | "Batch failed - will transition to ConnectedDegraded instead of Connected" | 880 | "Batch failed - will transition to ConnectedHistoricSyncFailures instead of Connected" |
| 881 | ); | 881 | ); |
| 882 | } | 882 | } |
| 883 | 883 | ||
| @@ -963,7 +963,7 @@ impl SyncManager { | |||
| 963 | if state.connection_status == ConnectionStatus::Syncing { | 963 | if state.connection_status == ConnectionStatus::Syncing { |
| 964 | // Check if any batches failed during historic sync | 964 | // Check if any batches failed during historic sync |
| 965 | let new_status = if state.historic_sync_had_failures { | 965 | let new_status = if state.historic_sync_had_failures { |
| 966 | ConnectionStatus::ConnectedDegraded | 966 | ConnectionStatus::ConnectedHistoricSyncFailures |
| 967 | } else { | 967 | } else { |
| 968 | ConnectionStatus::Connected | 968 | ConnectionStatus::Connected |
| 969 | }; | 969 | }; |
| @@ -979,7 +979,7 @@ impl SyncManager { | |||
| 979 | had_failures = state.historic_sync_had_failures, | 979 | had_failures = state.historic_sync_had_failures, |
| 980 | status = ?new_status, | 980 | status = ?new_status, |
| 981 | "Historic sync complete - transitioned to {} status", | 981 | "Historic sync complete - transitioned to {} status", |
| 982 | if state.historic_sync_had_failures { "ConnectedDegraded" } else { "Connected" } | 982 | if state.historic_sync_had_failures { "ConnectedHistoricSyncFailures" } else { "Connected" } |
| 983 | ); | 983 | ); |
| 984 | 984 | ||
| 985 | // Update metrics | 985 | // Update metrics |