diff options
Diffstat (limited to 'docs')
| -rw-r--r-- | docs/explanation/grasp-02-proactive-sync.md | 98 |
1 files changed, 85 insertions, 13 deletions
diff --git a/docs/explanation/grasp-02-proactive-sync.md b/docs/explanation/grasp-02-proactive-sync.md index 34b7bb6..0607610 100644 --- a/docs/explanation/grasp-02-proactive-sync.md +++ b/docs/explanation/grasp-02-proactive-sync.md | |||
| @@ -64,6 +64,9 @@ pub struct RelayState { | |||
| 64 | pub last_connected: Option<Timestamp>, | 64 | pub last_connected: Option<Timestamp>, |
| 65 | /// When we disconnected - for 15-minute state retention rule | 65 | /// When we disconnected - for 15-minute state retention rule |
| 66 | pub disconnected_at: Option<Timestamp>, | 66 | pub disconnected_at: Option<Timestamp>, |
| 67 | /// Whether announcement filter historic sync has completed for this relay | ||
| 68 | /// Used to determine if we can use `since` filter on reconnect for Layer 1 | ||
| 69 | pub announcements_synced: bool, | ||
| 67 | } | 70 | } |
| 68 | 71 | ||
| 69 | impl RelayState { | 72 | impl RelayState { |
| @@ -109,7 +112,7 @@ pub struct PendingBatch { | |||
| 109 | /// The items this batch is syncing | 112 | /// The items this batch is syncing |
| 110 | pub items: PendingItems, | 113 | pub items: PendingItems, |
| 111 | /// Subscription IDs that must ALL receive EOSE before confirming (for ReqEose) | 114 | /// Subscription IDs that must ALL receive EOSE before confirming (for ReqEose) |
| 112 | /// Empty for Negentropy sync method | 115 | /// Empty for Negentropy sync method until missing event IDs are identified |
| 113 | pub outstanding_subs: HashSet<SubscriptionId>, | 116 | pub outstanding_subs: HashSet<SubscriptionId>, |
| 114 | /// The sync method used for this batch | 117 | /// The sync method used for this batch |
| 115 | pub sync_method: SyncMethod, | 118 | pub sync_method: SyncMethod, |
| @@ -124,21 +127,63 @@ pub struct PendingItems { | |||
| 124 | 127 | ||
| 125 | --- | 128 | --- |
| 126 | 129 | ||
| 127 | ## Connection Lifecycle State Machine | 130 | ## Connection Lifecycle |
| 131 | |||
| 132 | ### Object vs Connection Lifecycle | ||
| 133 | |||
| 134 | **Key Principle**: RelayConnection objects persist forever; WebSocket connections are transient. | ||
| 135 | |||
| 136 | - **RelayConnection object**: Created once via `register_relay()`, stored in HashMap permanently | ||
| 137 | - **WebSocket connection**: Transient, established via `try_connect_relay()`, dies on disconnect | ||
| 138 | - **Event loop**: Spawned by `handle_connect_or_reconnect()`, must be respawned after every reconnection | ||
| 139 | |||
| 140 | ### Connection State Machine | ||
| 128 | 141 | ||
| 129 | ```mermaid | 142 | ```mermaid |
| 130 | stateDiagram-v2 | 143 | stateDiagram-v2 |
| 131 | [*] --> Disconnected: discover relay via RepoSyncIndex | 144 | [*] --> Disconnected: discover relay → register_relay() |
| 132 | Disconnected --> Connecting: AddFilters triggers spawn_connection | 145 | Disconnected --> Connecting: retry_disconnected_relays → try_connect_relay |
| 133 | Connecting --> Connected: success | 146 | Connecting --> Connected: success → handle_connect_or_reconnect |
| 134 | Connecting --> Disconnected: failure + record in health tracker | 147 | Connecting --> Disconnected: failure + record in health tracker |
| 135 | Connected --> Disconnected: connection lost | 148 | Connected --> Disconnected: connection lost → handle_disconnect |
| 136 | Connected --> [*]: intentional disconnect via check_disconnects | 149 | Connected --> [*]: intentional disconnect via check_disconnects |
| 137 | 150 | ||
| 138 | note right of Disconnected: disconnected_at set for 15min rule | 151 | note right of Disconnected: disconnected_at set for 15min rule<br/>RelayConnection kept in HashMap |
| 139 | note right of Connected: last_connected tracked for since filter | 152 | note right of Connected: last_connected tracked for since filter<br/>Event loop spawned here |
| 153 | note right of Connecting: connection attempt with timeout | ||
| 154 | ``` | ||
| 155 | |||
| 156 | ### Connection Flow Methods | ||
| 157 | |||
| 158 | | Method | Purpose | When Called | Actions | | ||
| 159 | |--------|---------|-------------|---------| | ||
| 160 | | `register_relay()` | Initialize relay tracking | Discovery via RepoSyncIndex | Creates RelayConnection, stores in HashMap, returns immediately | | ||
| 161 | | `try_connect_relay()` | Attempt connection | Periodic retry (500ms) | Calls connect_and_subscribe, sends notification on success | | ||
| 162 | | `handle_connect_or_reconnect()` | Setup after connection | ConnectNotification received | Spawns event loop, updates state, decides sync strategy | | ||
| 163 | | `handle_disconnect()` | Cleanup after disconnect | DisconnectNotification received | Updates state, clears pending batches, KEEPS RelayConnection | | ||
| 164 | | `retry_disconnected_relays()` | Periodic reconnection | Every 500ms | For each ready relay: try_connect_relay() | | ||
| 165 | |||
| 166 | ### Event Loop Lifecycle | ||
| 167 | |||
| 168 | **Critical**: Event loops die on disconnect and cannot be reused. | ||
| 169 | |||
| 170 | ```mermaid | ||
| 171 | flowchart LR | ||
| 172 | CONN[Connection Success] --> SPAWN[handle_connect_or_reconnect<br/>spawns event loop] | ||
| 173 | SPAWN --> RUN[run_event_loop active] | ||
| 174 | RUN --> DISC[Disconnect detected] | ||
| 175 | DISC --> EXIT[Event loop breaks + task exits] | ||
| 176 | EXIT --> RETRY[retry_disconnected_relays] | ||
| 177 | RETRY --> RECONN[try_connect_relay] | ||
| 178 | RECONN --> |success| SPAWN | ||
| 140 | ``` | 179 | ``` |
| 141 | 180 | ||
| 181 | **Why respawn is required**: | ||
| 182 | - `run_event_loop()` breaks on `RelayStatus::Disconnected` | ||
| 183 | - The spawned task completely exits | ||
| 184 | - A terminated task cannot be resumed - a fresh one must be spawned | ||
| 185 | - Happens for both initial connection AND every reconnect | ||
| 186 | |||
| 142 | --- | 187 | --- |
| 143 | 188 | ||
| 144 | ## Core Architecture: Live vs Historic Sync | 189 | ## Core Architecture: Live vs Historic Sync |
| @@ -212,7 +257,19 @@ flowchart TB | |||
| 212 | 257 | ||
| 213 | ```mermaid | 258 | ```mermaid |
| 214 | flowchart TB | 259 | flowchart TB |
| 215 | START[fresh_start called] --> CLEAR_PSI[Clear PendingSyncIndex] | 260 | DISC[Relay discovered via RepoSyncIndex] --> REG[register_relay] |
| 261 | REG --> CREATE[Create RelayConnection, store in HashMap] | ||
| 262 | CREATE --> RET[Returns immediately] | ||
| 263 | RET --> LOOP[retry_disconnected_relays - 500ms periodic] | ||
| 264 | LOOP --> CHECK[health_tracker.should_attempt_connection?] | ||
| 265 | CHECK --> |ready| TRY[try_connect_relay] | ||
| 266 | TRY --> CONN[connection.connect_and_subscribe] | ||
| 267 | CONN --> |success| NOTIFY[Send ConnectNotification] | ||
| 268 | NOTIFY --> HANDLE[handle_connect_or_reconnect called] | ||
| 269 | HANDLE --> UPD[Update state to Connected] | ||
| 270 | UPD --> SPAWN[Spawn event loop + processor] | ||
| 271 | SPAWN --> STRAT[Decide strategy: fresh_start] | ||
| 272 | STRAT --> CLEAR_PSI[Clear PendingSyncIndex] | ||
| 216 | CLEAR_PSI --> CLEAR_RSI[Clear RelaySyncIndex] | 273 | CLEAR_PSI --> CLEAR_RSI[Clear RelaySyncIndex] |
| 217 | CLEAR_RSI --> L1_LIVE[L1: sync_live - announcements] | 274 | CLEAR_RSI --> L1_LIVE[L1: sync_live - announcements] |
| 218 | L1_LIVE --> L1_HIST[L1: historic_sync - no since] | 275 | L1_LIVE --> L1_HIST[L1: historic_sync - no since] |
| @@ -241,10 +298,25 @@ flowchart TB | |||
| 241 | 298 | ||
| 242 | ```mermaid | 299 | ```mermaid |
| 243 | flowchart TB | 300 | flowchart TB |
| 244 | DISC[Connection lost] --> MARK[Set disconnected_at = now] | 301 | DISC[Connection lost detected] --> LOOP_EXIT[Event loop breaks] |
| 245 | MARK --> WAIT[Wait for reconnection < 15min] | 302 | LOOP_EXIT --> TASK_EXIT[Event processor task exits] |
| 246 | WAIT --> RECONN[Connection restored] | 303 | TASK_EXIT --> NOTIFY_DISC[Send DisconnectNotification] |
| 247 | RECONN --> CLEAR_PSI[Clear PendingSyncIndex] | 304 | NOTIFY_DISC --> HANDLE_DISC[handle_disconnect called] |
| 305 | HANDLE_DISC --> UPD_STATE[Update state to Disconnected] | ||
| 306 | UPD_STATE --> MARK[Set disconnected_at = now] | ||
| 307 | MARK --> CLEAR[Clear pending batches] | ||
| 308 | CLEAR --> KEEP[Keep RelayConnection in HashMap] | ||
| 309 | KEEP --> WAIT[Wait < 15min] | ||
| 310 | WAIT --> RETRY[retry_disconnected_relays - 500ms] | ||
| 311 | RETRY --> CHECK[health_tracker checks backoff] | ||
| 312 | CHECK --> |ready| TRY[try_connect_relay] | ||
| 313 | TRY --> CONN[connection.connect_and_subscribe] | ||
| 314 | CONN --> |success| NOTIFY[Send ConnectNotification] | ||
| 315 | NOTIFY --> RECONN[handle_connect_or_reconnect] | ||
| 316 | RECONN --> UPD_CONN[Update state to Connected] | ||
| 317 | UPD_CONN --> SPAWN[Spawn NEW event loop + processor] | ||
| 318 | SPAWN --> STRAT[Decide strategy: quick_reconnect] | ||
| 319 | STRAT --> CLEAR_PSI[Clear PendingSyncIndex] | ||
| 248 | CLEAR_PSI --> L1_LIVE[L1: sync_live - announcements] | 320 | CLEAR_PSI --> L1_LIVE[L1: sync_live - announcements] |
| 249 | L1_LIVE --> L1_HIST[L1: historic_sync WITH since] | 321 | L1_LIVE --> L1_HIST[L1: historic_sync WITH since] |
| 250 | L1_HIST --> RECON[reconstruct_filters from RelaySyncIndex] | 322 | L1_HIST --> RECON[reconstruct_filters from RelaySyncIndex] |