diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2025-12-04 16:54:38 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2025-12-04 16:54:38 +0000 |
| commit | fdbc8895e1e9e712882bd854908295a95e7afcb9 (patch) | |
| tree | 01a22f9d4b412d0099702afdd9272af2b7be3de5 | |
| parent | 8c129a4aeab3288f8193ccb820adf00860c50d74 (diff) | |
docs: update GRASP-02 proactive sync event sync approach
| -rw-r--r-- | docs/explanation/grasp-02-proactive-sync.md | 336 | ||||
| -rw-r--r-- | docs/explanation/monitoring.md | 96 |
2 files changed, 350 insertions, 82 deletions
diff --git a/docs/explanation/grasp-02-proactive-sync.md b/docs/explanation/grasp-02-proactive-sync.md index 250aece..a8af3f4 100644 --- a/docs/explanation/grasp-02-proactive-sync.md +++ b/docs/explanation/grasp-02-proactive-sync.md | |||
| @@ -23,6 +23,7 @@ flowchart TB | |||
| 23 | RH[RelayHealthTracker] | 23 | RH[RelayHealthTracker] |
| 24 | DB[(Database)] | 24 | DB[(Database)] |
| 25 | AP[AcceptancePolicy] | 25 | AP[AcceptancePolicy] |
| 26 | MET[Prometheus Metrics] | ||
| 26 | end | 27 | end |
| 27 | 28 | ||
| 28 | subgraph External Relays | 29 | subgraph External Relays |
| @@ -40,7 +41,7 @@ flowchart TB | |||
| 40 | SM <-->|WebSocket + NEG| R2 | 41 | SM <-->|WebSocket + NEG| R2 |
| 41 | SM <-->|WebSocket + NEG| R3 | 42 | SM <-->|WebSocket + NEG| R3 |
| 42 | 43 | ||
| 43 | RH -->|persists state| DB | 44 | RH -->|exposes state| MET |
| 44 | ``` | 45 | ``` |
| 45 | 46 | ||
| 46 | ## Connection Management | 47 | ## Connection Management |
| @@ -87,28 +88,79 @@ stateDiagram-v2 | |||
| 87 | | State | Behavior | | 88 | | State | Behavior | |
| 88 | | ----------- | --------------------------------------------------- | | 89 | | ----------- | --------------------------------------------------- | |
| 89 | | **Healthy** | Normal operation, immediate reconnect on disconnect | | 90 | | **Healthy** | Normal operation, immediate reconnect on disconnect | |
| 90 | | **Backoff** | Exponential backoff: 1s → 2s → 4s → ... → 1h max | | 91 | | **Backoff** | Exponential backoff: 5s → 10s → 20s → ... → 1h max | |
| 91 | | **Dead** | 24h of continuous failures, retry once per day | | 92 | | **Dead** | 24h of continuous failures, retry once per day | |
| 92 | 93 | ||
| 93 | Health state is **persisted to database** to survive restarts: | 94 | Health state is **kept in-memory** using a `DashMap` for lock-free concurrent access: |
| 94 | 95 | ||
| 95 | ```rust | 96 | ```rust |
| 97 | /// In-memory relay health tracking (NOT persisted to database) | ||
| 98 | /// | ||
| 99 | /// Design rationale: For <100 relays, persistence adds complexity without | ||
| 100 | /// significant benefit. Conservative initial backoff on restart avoids | ||
| 101 | /// thundering herd issues. | ||
| 102 | struct RelayHealthTracker { | ||
| 103 | health: DashMap<RelayUrl, RelayHealth>, | ||
| 104 | metrics: SyncMetrics, // Prometheus metrics for operator visibility | ||
| 105 | } | ||
| 106 | |||
| 96 | struct RelayHealth { | 107 | struct RelayHealth { |
| 97 | url: RelayUrl, | 108 | url: RelayUrl, |
| 98 | status: RelayStatus, // Healthy, Backoff, Dead | 109 | status: RelayStatus, // Healthy, Backoff, Dead |
| 99 | consecutive_failures: u32, | 110 | consecutive_failures: u32, |
| 100 | last_failure_at: Option<Timestamp>, | 111 | last_failure_at: Option<Instant>, |
| 101 | last_success_at: Option<Timestamp>, | 112 | last_success_at: Option<Instant>, |
| 102 | next_retry_at: Timestamp, | 113 | next_retry_at: Instant, |
| 103 | } | 114 | } |
| 104 | 115 | ||
| 105 | enum RelayStatus { | 116 | enum RelayStatus { |
| 106 | Healthy, | 117 | Healthy, |
| 107 | Backoff { attempt: u32 }, // backoff = min(2^attempt, 3600) seconds | 118 | Backoff { attempt: u32 }, // backoff = min(5 * 2^attempt, 3600) seconds |
| 108 | Dead, // retry in 24h | 119 | Dead, // retry in 24h |
| 109 | } | 120 | } |
| 110 | ``` | 121 | ``` |
| 111 | 122 | ||
| 123 | ### Restart Behavior (Graceful Degradation) | ||
| 124 | |||
| 125 | On restart, all relay health state is reset. To avoid thundering herd: | ||
| 126 | |||
| 127 | 1. **Conservative initial backoff**: Start with 5s delay (not immediate) for all relays | ||
| 128 | 2. **Staggered connection attempts**: Add random jitter (0-2s) per relay | ||
| 129 | 3. **Health rebuilds organically**: Relays prove themselves healthy through successful connections | ||
| 130 | |||
| 131 | ```rust | ||
| 132 | impl RelayHealthTracker { | ||
| 133 | fn new(metrics: SyncMetrics) -> Self { | ||
| 134 | Self { | ||
| 135 | health: DashMap::new(), | ||
| 136 | metrics, | ||
| 137 | } | ||
| 138 | } | ||
| 139 | |||
| 140 | /// Called on startup for each discovered relay | ||
| 141 | fn initialize_relay(&self, url: RelayUrl) { | ||
| 142 | self.health.insert(url.clone(), RelayHealth { | ||
| 143 | url, | ||
| 144 | status: RelayStatus::Backoff { attempt: 0 }, // Start conservative | ||
| 145 | consecutive_failures: 0, | ||
| 146 | last_failure_at: None, | ||
| 147 | last_success_at: None, | ||
| 148 | next_retry_at: Instant::now() + Self::initial_backoff_with_jitter(), | ||
| 149 | }); | ||
| 150 | } | ||
| 151 | |||
| 152 | fn initial_backoff_with_jitter() -> Duration { | ||
| 153 | Duration::from_secs(5) + Duration::from_millis(rand::random::<u64>() % 2000) | ||
| 154 | } | ||
| 155 | } | ||
| 156 | ``` | ||
| 157 | |||
| 158 | **Trade-off**: We lose knowledge of chronically failing relays across restarts. This is acceptable because: | ||
| 159 | |||
| 160 | - Scale is small (<100 relays) | ||
| 161 | - Conservative initial backoff prevents hammering bad relays | ||
| 162 | - Prometheus metrics preserve historical health data for operators | ||
| 163 | |||
| 112 | ## Filter Strategy | 164 | ## Filter Strategy |
| 113 | 165 | ||
| 114 | ### Unified Filters for Live Sync and Negentropy | 166 | ### Unified Filters for Live Sync and Negentropy |
| @@ -193,6 +245,23 @@ fn build_filters(tag_values: Vec<String>) -> Vec<Filter> { | |||
| 193 | 245 | ||
| 194 | **Consolidation**: When total filter count exceeds ~150 across a connection, consolidate by rebuilding from scratch. | 246 | **Consolidation**: When total filter count exceeds ~150 across a connection, consolidate by rebuilding from scratch. |
| 195 | 247 | ||
| 248 | ### Filter Generation vs. Policy Validation | ||
| 249 | |||
| 250 | The filter strategy and acceptance policies serve **different purposes** even though they share conceptual knowledge: | ||
| 251 | |||
| 252 | | Concern | Filters | Policies | | ||
| 253 | |---------|---------|----------| | ||
| 254 | | **Direction** | What to request FROM remote relays | What to accept INTO local database | | ||
| 255 | | **Input** | Stored events (announcements, PRs, etc.) | Single incoming event | | ||
| 256 | | **Output** | Filter specification | Accept/Reject decision | | ||
| 257 | |||
| 258 | The modular sub-policies ([`AnnouncementPolicy`](../../src/nostr/policy/announcement.rs:24), [`RelatedEventPolicy`](../../src/nostr/policy/related.rs:25), etc.) encode knowledge about event kinds and tag types, but this knowledge is applied differently: | ||
| 259 | |||
| 260 | - **In filters**: We enumerate **all** addressable refs (`30617:pubkey:id`) from stored announcements | ||
| 261 | - **In policies**: [`RelatedEventPolicy::check_references()`](../../src/nostr/policy/related.rs:39) checks if incoming event references **any** accepted event | ||
| 262 | |||
| 263 | Because of this fundamental difference, filter generation logic stays in `src/sync/filter.rs` rather than being delegated to policy modules. Both share the understanding of NIP-34 event relationships, but they answer different questions. | ||
| 264 | |||
| 196 | ## Subscription Updates | 265 | ## Subscription Updates |
| 197 | 266 | ||
| 198 | ### Dynamic Subscription Management | 267 | ### Dynamic Subscription Management |
| @@ -334,25 +403,59 @@ async fn schedule_daily_catchup(&self) { | |||
| 334 | 403 | ||
| 335 | ### Acceptance Policy | 404 | ### Acceptance Policy |
| 336 | 405 | ||
| 337 | All synced events go through our acceptance policy (same as direct submissions), but **excluding nostr-sdk defaults** like rate limiting: | 406 | All synced events go through our acceptance policy, reusing the same [`Nip34WritePolicy`](../../src/nostr/builder.rs:36) validation logic used for direct client submissions. |
| 407 | |||
| 408 | #### Design: Reusing admit_event() | ||
| 409 | |||
| 410 | The [`WritePolicy::admit_event()`](../../src/nostr/builder.rs:256-269) trait method takes a `SocketAddr` parameter designed for client connections: | ||
| 338 | 411 | ||
| 339 | ```rust | 412 | ```rust |
| 340 | async fn process_synced_event(&self, event: Event, source_relay: &RelayUrl) -> Result<()> { | 413 | // From nostr-relay-builder WritePolicy trait |
| 341 | // Skip nostr-sdk's built-in rate limiting - we trust our filter strategy | 414 | fn admit_event<'a>( |
| 342 | // and don't want to reject valid events just because they arrived quickly | 415 | &'a self, |
| 416 | event: &'a Event, | ||
| 417 | _addr: &'a SocketAddr, // Unused in our implementation | ||
| 418 | ) -> BoxedFuture<'a, PolicyResult>; | ||
| 419 | ``` | ||
| 420 | |||
| 421 | For synced events from remote relays, we pass a **synthetic localhost address** since: | ||
| 422 | 1. The `_addr` parameter is currently unused in our [`Nip34WritePolicy`](../../src/nostr/builder.rs:259) | ||
| 423 | 2. All meaningful validation is done by the modular sub-policies (see below) | ||
| 424 | 3. This allows reusing 100% of the existing validation logic | ||
| 425 | |||
| 426 | ```rust | ||
| 427 | use std::net::{IpAddr, Ipv4Addr, SocketAddr}; | ||
| 428 | |||
| 429 | /// Synthetic address for synced events (not from a direct client connection) | ||
| 430 | const SYNC_SOURCE_ADDR: SocketAddr = SocketAddr::new( | ||
| 431 | IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), | ||
| 432 | 0 | ||
| 433 | ); | ||
| 343 | 434 | ||
| 344 | // Apply our custom Nip34WritePolicy | 435 | async fn process_synced_event(&self, event: Event, source_relay: &RelayUrl) -> Result<()> { |
| 436 | // Apply our Nip34WritePolicy using synthetic address | ||
| 437 | // The SocketAddr is unused - all validation is by the modular sub-policies | ||
| 345 | let result = self.acceptance_policy | 438 | let result = self.acceptance_policy |
| 346 | .admit_event(&event, source_relay) | 439 | .admit_event(&event, &SYNC_SOURCE_ADDR) |
| 347 | .await; | 440 | .await; |
| 348 | 441 | ||
| 349 | match result { | 442 | match result { |
| 350 | PolicyResult::Accept => { | 443 | PolicyResult::Accept => { |
| 351 | self.database.save_event(&event).await?; | 444 | self.database.save_event(&event).await?; |
| 445 | tracing::debug!( | ||
| 446 | "Accepted synced event {} from {}", | ||
| 447 | event.id.to_hex(), | ||
| 448 | source_relay | ||
| 449 | ); | ||
| 352 | self.trigger_subscription_updates(&event).await; | 450 | self.trigger_subscription_updates(&event).await; |
| 353 | } | 451 | } |
| 354 | PolicyResult::Reject(reason) => { | 452 | PolicyResult::Reject(reason) => { |
| 355 | tracing::debug!("Rejected synced event {}: {}", event.id.to_hex(), reason); | 453 | tracing::debug!( |
| 454 | "Rejected synced event {} from {}: {}", | ||
| 455 | event.id.to_hex(), | ||
| 456 | source_relay, | ||
| 457 | reason | ||
| 458 | ); | ||
| 356 | } | 459 | } |
| 357 | } | 460 | } |
| 358 | 461 | ||
| @@ -360,6 +463,41 @@ async fn process_synced_event(&self, event: Event, source_relay: &RelayUrl) -> R | |||
| 360 | } | 463 | } |
| 361 | ``` | 464 | ``` |
| 362 | 465 | ||
| 466 | #### Modular Sub-Policies | ||
| 467 | |||
| 468 | The [`Nip34WritePolicy`](../../src/nostr/builder.rs:36-42) delegates to specialized sub-policies in [`src/nostr/policy/`](../../src/nostr/policy/mod.rs:1-41): | ||
| 469 | |||
| 470 | | Sub-Policy | Kinds | Responsibility | | ||
| 471 | |------------|-------|----------------| | ||
| 472 | | [`AnnouncementPolicy`](../../src/nostr/policy/announcement.rs:24-27) | 30617 | Validates service listing, maintainer exception, creates bare repos | | ||
| 473 | | [`StatePolicy`](../../src/nostr/policy/state.rs:43-46) | 30618 | Validates state structure, aligns git refs with authorized state | | ||
| 474 | | [`PrEventPolicy`](../../src/nostr/policy/pr_event.rs) | 1618, 1619 | Validates PR/PR Update events, manages refs/nostr/* | | ||
| 475 | | [`RelatedEventPolicy`](../../src/nostr/policy/related.rs:25-29) | All others | Checks forward/backward references to accepted repos/events | | ||
| 476 | |||
| 477 | All sub-policies share a common [`PolicyContext`](../../src/nostr/policy/mod.rs:22-27) containing: | ||
| 478 | - `domain`: Our service domain for validation | ||
| 479 | - `database`: For querying existing events | ||
| 480 | - `git_data_path`: For git operations | ||
| 481 | |||
| 482 | #### Why Not Call Sub-Policies Directly? | ||
| 483 | |||
| 484 | While we could bypass `admit_event()` and call sub-policies directly: | ||
| 485 | |||
| 486 | ```rust | ||
| 487 | // Alternative: Direct sub-policy calls (NOT recommended) | ||
| 488 | match event.kind.as_u16() { | ||
| 489 | 30617 => self.announcement_policy.validate(&event).await, | ||
| 490 | 30618 => self.state_policy.validate(&event), | ||
| 491 | 1618 | 1619 => self.pr_event_policy.validate_nostr_ref(&event).await, | ||
| 492 | _ => self.related_event_policy.check_references(&event).await, | ||
| 493 | } | ||
| 494 | ``` | ||
| 495 | |||
| 496 | This is **not recommended** because: | ||
| 497 | 1. Duplicates the kind-routing logic from [`admit_event()`](../../src/nostr/builder.rs:261-268) | ||
| 498 | 2. Misses important post-validation steps (e.g., `handle_announcement()` also calls `ensure_bare_repository()`) | ||
| 499 | 3. Creates maintenance burden when policy logic changes | ||
| 500 | |||
| 363 | ## Module Structure | 501 | ## Module Structure |
| 364 | 502 | ||
| 365 | ### New `src/sync/` Module | 503 | ### New `src/sync/` Module |
| @@ -402,77 +540,78 @@ async fn main() -> Result<()> { | |||
| 402 | 540 | ||
| 403 | ## Metrics & Observability | 541 | ## Metrics & Observability |
| 404 | 542 | ||
| 405 | ### Event Source Tracking | 543 | All sync metrics are exposed via Prometheus at `/metrics`. For <100 relays, per-relay labels are acceptable cardinality. |
| 406 | 544 | ||
| 407 | Track provenance of every event to measure live sync effectiveness: | 545 | ### Prometheus Metrics |
| 408 | 546 | ||
| 409 | ```rust | 547 | ```rust |
| 410 | enum EventSource { | 548 | /// Sync module metrics registered with the global Prometheus registry |
| 411 | DirectSubmission, // Sent directly to our relay by a user | 549 | pub struct SyncMetrics { |
| 412 | LiveSync(RelayUrl), // Received via live subscription | 550 | // === Connection Metrics (per relay) === |
| 413 | Catchup(RelayUrl), // Discovered during negentropy catchup | 551 | /// Active outbound connections: ngit_sync_relay_connected{relay="wss://..."} |
| 414 | DailyCatchup(RelayUrl), // Found during daily reconciliation | 552 | relay_connected: IntGaugeVec, // labels: [relay] |
| 415 | } | ||
| 416 | 553 | ||
| 417 | struct SyncMetrics { | 554 | /// Connection attempts: ngit_sync_connection_attempts_total{relay="wss://...", result="success|failure"} |
| 418 | /// Events by source | 555 | connection_attempts: CounterVec, // labels: [relay, result] |
| 419 | events_from_direct: Counter, | ||
| 420 | events_from_live_sync: Counter, | ||
| 421 | events_from_catchup: Counter, // Indicates live sync failure | ||
| 422 | events_from_daily_catchup: Counter, // Indicates sustained sync gap | ||
| 423 | 556 | ||
| 424 | /// Catchup gap tracking - events found that should have been live synced | 557 | // === Relay Health Status === |
| 425 | catchup_gap_total: Counter, | 558 | /// Current status: ngit_sync_relay_status{relay="wss://...", status="healthy|backoff|dead"} |
| 426 | catchup_gap_by_relay: HashMap<RelayUrl, Counter>, | 559 | relay_status: IntGaugeVec, // labels: [relay, status] |
| 427 | } | ||
| 428 | ``` | ||
| 429 | 560 | ||
| 430 | **Key insight**: Events discovered during catchup or daily reconciliation represent **live sync failures** - we should have received them in real-time. | 561 | /// Consecutive failures: ngit_sync_relay_failures{relay="wss://..."} |
| 562 | relay_failures: IntGaugeVec, // labels: [relay] | ||
| 431 | 563 | ||
| 432 | ### Peer Reliability Tracking | 564 | // === Event Source Tracking === |
| 565 | /// Events received by source: ngit_sync_events_total{source="direct|live_sync|catchup|daily_catchup"} | ||
| 566 | events_total: CounterVec, // labels: [source] | ||
| 433 | 567 | ||
| 434 | Track relay health and data completeness: | 568 | /// Sync gap events (should have been live synced): ngit_sync_gap_events_total{relay="wss://..."} |
| 569 | sync_gap_events: CounterVec, // labels: [relay] | ||
| 435 | 570 | ||
| 436 | ```rust | 571 | // === Aggregate Metrics === |
| 437 | struct PeerReliability { | 572 | /// Total relays being tracked |
| 438 | relay_url: RelayUrl, | 573 | relays_tracked_total: IntGauge, |
| 439 | |||
| 440 | // Connection metrics | ||
| 441 | connection_attempts: u64, | ||
| 442 | connection_failures: u64, | ||
| 443 | total_uptime_seconds: u64, | ||
| 444 | total_downtime_seconds: u64, | ||
| 445 | 574 | ||
| 446 | // Event coverage metrics (per repo we track) | 575 | /// Relays currently connected |
| 447 | repos_tracked: HashSet<RepoRef>, | 576 | relays_connected_total: IntGauge, |
| 448 | missing_events_detected: HashMap<RepoRef, u64>, // Events we have that they dont | ||
| 449 | events_received_from: u64, | ||
| 450 | 577 | ||
| 451 | // Calculated scores | 578 | /// Relays in dead state |
| 452 | uptime_percentage: f64, // uptime / (uptime + downtime) | 579 | relays_dead_total: IntGauge, |
| 453 | event_coverage_score: f64, // ratio of events we have vs what we expect from them | ||
| 454 | } | 580 | } |
| 455 | ``` | 581 | ``` |
| 456 | 582 | ||
| 583 | ### Metric Definitions | ||
| 584 | |||
| 585 | | Metric | Type | Labels | Description | | ||
| 586 | | ------------------------------------- | ------- | ------------- | ------------------------------------------------------ | | ||
| 587 | | `ngit_sync_relay_connected` | Gauge | relay | 1 if connected, 0 if not | | ||
| 588 | | `ngit_sync_connection_attempts_total` | Counter | relay, result | Connection attempt outcomes | | ||
| 589 | | `ngit_sync_relay_status` | Gauge | relay, status | 1 for current status, 0 otherwise | | ||
| 590 | | `ngit_sync_relay_failures` | Gauge | relay | Current consecutive failure count | | ||
| 591 | | `ngit_sync_events_total` | Counter | source | Events received by source type | | ||
| 592 | | `ngit_sync_gap_events_total` | Counter | relay | Events found during catchup that should have been live | | ||
| 593 | | `ngit_sync_relays_tracked_total` | Gauge | - | Total relays discovered from announcements | | ||
| 594 | | `ngit_sync_relays_connected_total` | Gauge | - | Currently connected relay count | | ||
| 595 | | `ngit_sync_relays_dead_total` | Gauge | - | Relays marked as dead | | ||
| 596 | |||
| 597 | **Key insight**: Events discovered during catchup or daily reconciliation represent **live sync failures** - we should have received them in real-time. The `ngit_sync_gap_events_total` metric tracks this per relay. | ||
| 598 | |||
| 457 | ### Observability Integration | 599 | ### Observability Integration |
| 458 | 600 | ||
| 459 | ```rust | 601 | ```rust |
| 460 | // Prometheus-style metrics | ||
| 461 | impl SyncManager { | 602 | impl SyncManager { |
| 462 | fn record_event_received(&self, event: &Event, source: EventSource) { | 603 | fn record_event_received(&self, event: &Event, source: EventSource) { |
| 463 | match source { | 604 | match source { |
| 464 | EventSource::DirectSubmission => { | 605 | EventSource::DirectSubmission => { |
| 465 | self.metrics.events_from_direct.inc(); | 606 | self.metrics.events_total.with_label_values(&["direct"]).inc(); |
| 466 | } | 607 | } |
| 467 | EventSource::LiveSync(relay) => { | 608 | EventSource::LiveSync(relay) => { |
| 468 | self.metrics.events_from_live_sync.inc(); | 609 | self.metrics.events_total.with_label_values(&["live_sync"]).inc(); |
| 469 | self.peer_metrics.get(&relay).events_received_from += 1; | ||
| 470 | } | 610 | } |
| 471 | EventSource::Catchup(relay) => { | 611 | EventSource::Catchup(relay) => { |
| 472 | // This is a sync gap - we should have gotten it via live sync | 612 | // This is a sync gap - we should have gotten it via live sync |
| 473 | self.metrics.events_from_catchup.inc(); | 613 | self.metrics.events_total.with_label_values(&["catchup"]).inc(); |
| 474 | self.metrics.catchup_gap_total.inc(); | 614 | self.metrics.sync_gap_events.with_label_values(&[relay.as_str()]).inc(); |
| 475 | self.metrics.catchup_gap_by_relay.entry(relay.clone()).or_default().inc(); | ||
| 476 | tracing::warn!( | 615 | tracing::warn!( |
| 477 | relay = %relay, | 616 | relay = %relay, |
| 478 | event_id = %event.id.to_hex(), | 617 | event_id = %event.id.to_hex(), |
| @@ -481,7 +620,8 @@ impl SyncManager { | |||
| 481 | } | 620 | } |
| 482 | EventSource::DailyCatchup(relay) => { | 621 | EventSource::DailyCatchup(relay) => { |
| 483 | // Sustained sync gap - missed by both live sync and initial catchup | 622 | // Sustained sync gap - missed by both live sync and initial catchup |
| 484 | self.metrics.events_from_daily_catchup.inc(); | 623 | self.metrics.events_total.with_label_values(&["daily_catchup"]).inc(); |
| 624 | self.metrics.sync_gap_events.with_label_values(&[relay.as_str()]).inc(); | ||
| 485 | tracing::error!( | 625 | tracing::error!( |
| 486 | relay = %relay, | 626 | relay = %relay, |
| 487 | event_id = %event.id.to_hex(), | 627 | event_id = %event.id.to_hex(), |
| @@ -492,15 +632,56 @@ impl SyncManager { | |||
| 492 | } | 632 | } |
| 493 | 633 | ||
| 494 | fn record_connection_attempt(&self, relay: &RelayUrl, success: bool) { | 634 | fn record_connection_attempt(&self, relay: &RelayUrl, success: bool) { |
| 495 | let peer = self.peer_metrics.entry(relay.clone()).or_default(); | 635 | let result = if success { "success" } else { "failure" }; |
| 496 | peer.connection_attempts += 1; | 636 | self.metrics.connection_attempts |
| 497 | if !success { | 637 | .with_label_values(&[relay.as_str(), result]) |
| 498 | peer.connection_failures += 1; | 638 | .inc(); |
| 639 | } | ||
| 640 | |||
| 641 | fn update_relay_status(&self, relay: &RelayUrl, status: &RelayStatus) { | ||
| 642 | // Reset all status labels for this relay | ||
| 643 | for s in ["healthy", "backoff", "dead"] { | ||
| 644 | self.metrics.relay_status | ||
| 645 | .with_label_values(&[relay.as_str(), s]) | ||
| 646 | .set(0); | ||
| 499 | } | 647 | } |
| 648 | // Set current status | ||
| 649 | let status_label = match status { | ||
| 650 | RelayStatus::Healthy => "healthy", | ||
| 651 | RelayStatus::Backoff { .. } => "backoff", | ||
| 652 | RelayStatus::Dead => "dead", | ||
| 653 | }; | ||
| 654 | self.metrics.relay_status | ||
| 655 | .with_label_values(&[relay.as_str(), status_label]) | ||
| 656 | .set(1); | ||
| 500 | } | 657 | } |
| 501 | } | 658 | } |
| 502 | ``` | 659 | ``` |
| 503 | 660 | ||
| 661 | ### Example Grafana Queries | ||
| 662 | |||
| 663 | ```promql | ||
| 664 | # Relay health overview - count by status | ||
| 665 | sum by (status) (ngit_sync_relay_status == 1) | ||
| 666 | |||
| 667 | # Connection success rate over last hour | ||
| 668 | sum(rate(ngit_sync_connection_attempts_total{result="success"}[1h])) | ||
| 669 | / sum(rate(ngit_sync_connection_attempts_total[1h])) | ||
| 670 | |||
| 671 | # Sync gap detection - events that should have been live synced | ||
| 672 | sum(rate(ngit_sync_gap_events_total[1h])) by (relay) | ||
| 673 | |||
| 674 | # Live sync effectiveness (lower is better - fewer gaps) | ||
| 675 | sum(rate(ngit_sync_events_total{source=~"catchup|daily_catchup"}[1h])) | ||
| 676 | / sum(rate(ngit_sync_events_total[1h])) | ||
| 677 | |||
| 678 | # Relays with high failure counts (potential issues) | ||
| 679 | topk(10, ngit_sync_relay_failures) | ||
| 680 | |||
| 681 | # Alert: relay stuck in dead state | ||
| 682 | ngit_sync_relay_status{status="dead"} == 1 | ||
| 683 | ``` | ||
| 684 | |||
| 504 | ### Log Levels for Sync Events | 685 | ### Log Levels for Sync Events |
| 505 | 686 | ||
| 506 | | Event | Level | Context | | 687 | | Event | Level | Context | |
| @@ -544,22 +725,23 @@ pub struct SyncConfig { | |||
| 544 | 725 | ||
| 545 | ## Summary | 726 | ## Summary |
| 546 | 727 | ||
| 547 | | Component | Responsibility | | 728 | | Component | Responsibility | |
| 548 | | ---------------------- | ------------------------------------------------------------ | | 729 | | ---------------------- | -------------------------------------------------------------- | |
| 549 | | **SyncManager** | Orchestrates connections, triggers catchup, processes events | | 730 | | **SyncManager** | Orchestrates connections, triggers catchup, processes events | |
| 550 | | **FilterService** | Builds unified filters from database state | | 731 | | **FilterService** | Builds unified filters from database state | |
| 551 | | **RelayHealthTracker** | Manages backoff, dead relay detection, persistence | | 732 | | **RelayHealthTracker** | Manages backoff, dead relay detection (in-memory + Prometheus) | |
| 552 | | **ConnectionState** | Per-relay WebSocket + subscription management | | 733 | | **ConnectionState** | Per-relay WebSocket + subscription management | |
| 734 | | **SyncMetrics** | Prometheus metrics for operator visibility | | ||
| 553 | 735 | ||
| 554 | ### Key Design Decisions | 736 | ### Key Design Decisions |
| 555 | 737 | ||
| 556 | 1. **Unified filters** for live sync and negentropy - same criteria, different delivery mechanism | 738 | 1. **Unified filters** for live sync and negentropy - same criteria, different delivery mechanism |
| 557 | 2. **Exclude ourselves** from relay list to prevent loops | 739 | 2. **Exclude ourselves** from relay list to prevent loops |
| 558 | 3. **One connection per relay** with combined filters for efficiency | 740 | 3. **One connection per relay** with combined filters for efficiency |
| 559 | 4. **Persisted health state** survives restarts | 741 | 4. **In-memory health state** with Prometheus metrics for visibility (no database persistence needed for <100 relays) |
| 560 | 5. **Staggered catchup** to avoid overwhelming relays - runs immediately at startup after warm-up | 742 | 5. **Graceful degradation on restart** - conservative initial backoff with jitter avoids thundering herd |
| 561 | 6. **Client-side filtering** for 30617/30618, server-side for Layer 2/3 | 743 | 6. **Staggered catchup** to avoid overwhelming relays - runs immediately at startup after warm-up |
| 562 | 7. **Dynamic subscription addition** with periodic consolidation | 744 | 7. **Client-side filtering** for 30617/30618, server-side for Layer 2/3 |
| 563 | 8. **Custom acceptance policy** excluding rate limiting defaults | 745 | 8. **Dynamic subscription addition** with periodic consolidation |
| 564 | 9. **Catchup as failure signal** - events found during catchup/daily indicate live sync gaps | 746 | 9. **Custom acceptance policy** excluding rate limiting defaults |
| 565 | 10. **Peer reliability tracking** - monitor uptime and event coverage per relay | 747 | 10. **Catchup as failure signal** - events found during catchup/daily indicate live sync gaps, tracked in Prometheus |
diff --git a/docs/explanation/monitoring.md b/docs/explanation/monitoring.md index 3b1b1ac..9368bf4 100644 --- a/docs/explanation/monitoring.md +++ b/docs/explanation/monitoring.md | |||
| @@ -90,10 +90,96 @@ For detailed per-repository investigation at scale, consider adding **Loki** (lo | |||
| 90 | - Loki queries enable ad-hoc deep dives (e.g., find all transfers > 10MB) | 90 | - Loki queries enable ad-hoc deep dives (e.g., find all transfers > 10MB) |
| 91 | - Pairs with Prometheus for long-term trends | 91 | - Pairs with Prometheus for long-term trends |
| 92 | 92 | ||
| 93 | ## Future: Sync Metrics (GRASP-02) | 93 | ## Sync Metrics (GRASP-02) |
| 94 | 94 | ||
| 95 | When GRASP-02 proactive sync is implemented, additional metrics will track: | 95 | When GRASP-02 proactive sync is implemented, the following metrics will be added to track relay synchronization health. These metrics use in-memory tracking with Prometheus for operator visibility (no database persistence needed for <100 relays). |
| 96 | 96 | ||
| 97 | - Events received from sync (live vs catchup) | 97 | ### Sync Metrics Overview |
| 98 | - Active outbound relay connections | 98 | |
| 99 | - Catchup gap (events found during catchup indicating sync failures) \ No newline at end of file | 99 | | Metric | Type | Labels | Description | |
| 100 | |--------|------|--------|-------------| | ||
| 101 | | `ngit_sync_relay_connected` | Gauge | relay | 1 if connected, 0 if not | | ||
| 102 | | `ngit_sync_connection_attempts_total` | Counter | relay, result | Connection attempt outcomes | | ||
| 103 | | `ngit_sync_relay_status` | Gauge | relay, status | 1 for current status, 0 otherwise | | ||
| 104 | | `ngit_sync_relay_failures` | Gauge | relay | Current consecutive failure count | | ||
| 105 | | `ngit_sync_events_total` | Counter | source | Events received by source type | | ||
| 106 | | `ngit_sync_gap_events_total` | Counter | relay | Events found during catchup | | ||
| 107 | | `ngit_sync_relays_tracked_total` | Gauge | - | Total relays discovered | | ||
| 108 | | `ngit_sync_relays_connected_total` | Gauge | - | Currently connected relay count | | ||
| 109 | | `ngit_sync_relays_dead_total` | Gauge | - | Relays marked as dead | | ||
| 110 | |||
| 111 | ### Event Sources | ||
| 112 | |||
| 113 | The `source` label on `ngit_sync_events_total` tracks how events were received: | ||
| 114 | |||
| 115 | - `direct` - Submitted directly to our relay by a user | ||
| 116 | - `live_sync` - Received via live WebSocket subscription (expected path) | ||
| 117 | - `catchup` - Found during negentropy catchup after reconnect | ||
| 118 | - `daily_catchup` - Found during daily reconciliation | ||
| 119 | |||
| 120 | **Catchup events indicate sync failures** - these should have been received via live sync. High catchup rates suggest connectivity issues or filter mismatches. | ||
| 121 | |||
| 122 | ### Relay Health States | ||
| 123 | |||
| 124 | The `status` label on `ngit_sync_relay_status` tracks relay health: | ||
| 125 | |||
| 126 | - `healthy` - Normal operation, connections working | ||
| 127 | - `backoff` - Exponential backoff after failures (5s → 10s → ... → 1h) | ||
| 128 | - `dead` - 24h of continuous failures, daily retry only | ||
| 129 | |||
| 130 | ### Example Grafana Queries | ||
| 131 | |||
| 132 | ```promql | ||
| 133 | # Relay health overview - count by status | ||
| 134 | sum by (status) (ngit_sync_relay_status == 1) | ||
| 135 | |||
| 136 | # Connection success rate over last hour | ||
| 137 | sum(rate(ngit_sync_connection_attempts_total{result="success"}[1h])) | ||
| 138 | / sum(rate(ngit_sync_connection_attempts_total[1h])) | ||
| 139 | |||
| 140 | # Sync gap detection - events that should have been live synced | ||
| 141 | sum(rate(ngit_sync_gap_events_total[1h])) by (relay) | ||
| 142 | |||
| 143 | # Live sync effectiveness (lower is better - fewer gaps) | ||
| 144 | sum(rate(ngit_sync_events_total{source=~"catchup|daily_catchup"}[1h])) | ||
| 145 | / sum(rate(ngit_sync_events_total[1h])) | ||
| 146 | |||
| 147 | # Relays with high failure counts (potential issues) | ||
| 148 | topk(10, ngit_sync_relay_failures) | ||
| 149 | ``` | ||
| 150 | |||
| 151 | ### Example Alerts | ||
| 152 | |||
| 153 | ```yaml | ||
| 154 | # Alert if relay stuck in dead state for > 1 day | ||
| 155 | - alert: SyncRelayDead | ||
| 156 | expr: ngit_sync_relay_status{status="dead"} == 1 | ||
| 157 | for: 1d | ||
| 158 | labels: | ||
| 159 | severity: warning | ||
| 160 | annotations: | ||
| 161 | summary: "Sync relay {{ $labels.relay }} is dead" | ||
| 162 | |||
| 163 | # Alert if sync gap rate is high (>10% of events from catchup) | ||
| 164 | - alert: SyncGapHigh | ||
| 165 | expr: > | ||
| 166 | sum(rate(ngit_sync_events_total{source=~"catchup|daily_catchup"}[1h])) | ||
| 167 | / sum(rate(ngit_sync_events_total[1h])) > 0.1 | ||
| 168 | for: 30m | ||
| 169 | labels: | ||
| 170 | severity: warning | ||
| 171 | annotations: | ||
| 172 | summary: "High sync gap rate - {{ $value | humanizePercentage }} of events from catchup" | ||
| 173 | ``` | ||
| 174 | |||
| 175 | ### Design Rationale | ||
| 176 | |||
| 177 | **In-memory health tracking with Prometheus visibility** was chosen over database persistence because: | ||
| 178 | |||
| 179 | 1. **Scale**: <100 relays means per-relay labels have acceptable cardinality | ||
| 180 | 2. **Simplicity**: No database schema, migrations, or cleanup needed | ||
| 181 | 3. **Operator visibility**: Prometheus + Grafana provide better dashboards than custom queries | ||
| 182 | 4. **Restart behavior**: Conservative initial backoff (5s + jitter) avoids thundering herd on restart | ||
| 183 | 5. **Historical data**: Prometheus retains health history; in-memory state only needs current status | ||
| 184 | |||
| 185 | See [GRASP-02 Proactive Sync](grasp-02-proactive-sync.md) for full architecture details. \ No newline at end of file | ||