diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2025-12-04 18:43:49 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2025-12-04 18:43:49 +0000 |
| commit | dd403b17e7c74db9443d0891a9de1f0f0f9f89eb (patch) | |
| tree | 177dd9f664dde3565492c1d11016dabfeda28bbc /src/sync/metrics.rs | |
| parent | 950c2e4e68448d2abcad90a31bfffaca6d7bc47e (diff) | |
feat(sync): Phase 6 - observability and production readiness
- Add SyncMetrics with full Prometheus integration
- Track sync gaps via catchup events
- Update Grafana dashboard with sync panels
- Document all sync configuration options
- Update design doc with implementation notes
Diffstat (limited to 'src/sync/metrics.rs')
| -rw-r--r-- | src/sync/metrics.rs | 348 |
1 files changed, 348 insertions, 0 deletions
diff --git a/src/sync/metrics.rs b/src/sync/metrics.rs new file mode 100644 index 0000000..c93e583 --- /dev/null +++ b/src/sync/metrics.rs | |||
| @@ -0,0 +1,348 @@ | |||
| 1 | //! Prometheus Metrics for Proactive Sync (GRASP-02 Phase 6) | ||
| 2 | //! | ||
| 3 | //! This module provides comprehensive sync monitoring metrics including: | ||
| 4 | //! - Connection status and attempts per relay | ||
| 5 | //! - Health state tracking (Healthy/Degraded/Dead) | ||
| 6 | //! - Event sync tracking by source (live/startup/reconnect/daily catchup) | ||
| 7 | //! - Gap events filled during catchup operations | ||
| 8 | //! | ||
| 9 | //! All metrics follow the `ngit_sync_` prefix convention. | ||
| 10 | |||
| 11 | use prometheus::{IntCounterVec, IntGauge, IntGaugeVec, Opts, Registry}; | ||
| 12 | |||
| 13 | use super::health::HealthState; | ||
| 14 | |||
| 15 | /// Prometheus metrics for the proactive sync system | ||
| 16 | #[derive(Clone)] | ||
| 17 | pub struct SyncMetrics { | ||
| 18 | // === Connection metrics === | ||
| 19 | /// Per-relay connection status (1=connected, 0=disconnected) | ||
| 20 | relay_connected: IntGaugeVec, | ||
| 21 | /// Connection attempts by relay and result (success/failure) | ||
| 22 | connection_attempts_total: IntCounterVec, | ||
| 23 | |||
| 24 | // === Health metrics === | ||
| 25 | /// Per-relay health status (healthy=1, degraded=2, dead=3) | ||
| 26 | relay_status: IntGaugeVec, | ||
| 27 | /// Per-relay consecutive failure count | ||
| 28 | relay_failures: IntGaugeVec, | ||
| 29 | |||
| 30 | // === Event metrics === | ||
| 31 | /// Events synced by source (live/startup/reconnect/daily) | ||
| 32 | events_total: IntCounterVec, | ||
| 33 | /// Gap events filled during catchup, by relay | ||
| 34 | gap_events_total: IntCounterVec, | ||
| 35 | |||
| 36 | // === Summary metrics === | ||
| 37 | /// Total relays discovered and tracked | ||
| 38 | relays_tracked_total: IntGauge, | ||
| 39 | /// Currently connected relay count | ||
| 40 | relays_connected_total: IntGauge, | ||
| 41 | /// Relays marked as dead | ||
| 42 | relays_dead_total: IntGauge, | ||
| 43 | } | ||
| 44 | |||
| 45 | impl SyncMetrics { | ||
| 46 | /// Register all sync metrics with the provided Prometheus registry | ||
| 47 | pub fn register(registry: &Registry) -> Result<Self, prometheus::Error> { | ||
| 48 | // Connection metrics | ||
| 49 | let relay_connected = IntGaugeVec::new( | ||
| 50 | Opts::new( | ||
| 51 | "ngit_sync_relay_connected", | ||
| 52 | "Relay connection status (1=connected, 0=disconnected)", | ||
| 53 | ), | ||
| 54 | &["relay"], | ||
| 55 | )?; | ||
| 56 | registry.register(Box::new(relay_connected.clone()))?; | ||
| 57 | |||
| 58 | let connection_attempts_total = IntCounterVec::new( | ||
| 59 | Opts::new( | ||
| 60 | "ngit_sync_connection_attempts_total", | ||
| 61 | "Total connection attempts by relay and result", | ||
| 62 | ), | ||
| 63 | &["relay", "result"], | ||
| 64 | )?; | ||
| 65 | registry.register(Box::new(connection_attempts_total.clone()))?; | ||
| 66 | |||
| 67 | // Health metrics | ||
| 68 | let relay_status = IntGaugeVec::new( | ||
| 69 | Opts::new( | ||
| 70 | "ngit_sync_relay_status", | ||
| 71 | "Relay health status (1=healthy, 2=degraded, 3=dead)", | ||
| 72 | ), | ||
| 73 | &["relay"], | ||
| 74 | )?; | ||
| 75 | registry.register(Box::new(relay_status.clone()))?; | ||
| 76 | |||
| 77 | let relay_failures = IntGaugeVec::new( | ||
| 78 | Opts::new( | ||
| 79 | "ngit_sync_relay_failures", | ||
| 80 | "Consecutive failure count per relay", | ||
| 81 | ), | ||
| 82 | &["relay"], | ||
| 83 | )?; | ||
| 84 | registry.register(Box::new(relay_failures.clone()))?; | ||
| 85 | |||
| 86 | // Event metrics | ||
| 87 | let events_total = IntCounterVec::new( | ||
| 88 | Opts::new( | ||
| 89 | "ngit_sync_events_total", | ||
| 90 | "Total events synced by source type", | ||
| 91 | ), | ||
| 92 | &["source"], | ||
| 93 | )?; | ||
| 94 | registry.register(Box::new(events_total.clone()))?; | ||
| 95 | |||
| 96 | let gap_events_total = IntCounterVec::new( | ||
| 97 | Opts::new( | ||
| 98 | "ngit_sync_gap_events_total", | ||
| 99 | "Gap events filled during catchup by relay", | ||
| 100 | ), | ||
| 101 | &["relay"], | ||
| 102 | )?; | ||
| 103 | registry.register(Box::new(gap_events_total.clone()))?; | ||
| 104 | |||
| 105 | // Summary metrics | ||
| 106 | let relays_tracked_total = IntGauge::with_opts(Opts::new( | ||
| 107 | "ngit_sync_relays_tracked_total", | ||
| 108 | "Total number of relays discovered and tracked", | ||
| 109 | ))?; | ||
| 110 | registry.register(Box::new(relays_tracked_total.clone()))?; | ||
| 111 | |||
| 112 | let relays_connected_total = IntGauge::with_opts(Opts::new( | ||
| 113 | "ngit_sync_relays_connected_total", | ||
| 114 | "Number of currently connected relays", | ||
| 115 | ))?; | ||
| 116 | registry.register(Box::new(relays_connected_total.clone()))?; | ||
| 117 | |||
| 118 | let relays_dead_total = IntGauge::with_opts(Opts::new( | ||
| 119 | "ngit_sync_relays_dead_total", | ||
| 120 | "Number of relays marked as dead", | ||
| 121 | ))?; | ||
| 122 | registry.register(Box::new(relays_dead_total.clone()))?; | ||
| 123 | |||
| 124 | Ok(Self { | ||
| 125 | relay_connected, | ||
| 126 | connection_attempts_total, | ||
| 127 | relay_status, | ||
| 128 | relay_failures, | ||
| 129 | events_total, | ||
| 130 | gap_events_total, | ||
| 131 | relays_tracked_total, | ||
| 132 | relays_connected_total, | ||
| 133 | relays_dead_total, | ||
| 134 | }) | ||
| 135 | } | ||
| 136 | |||
| 137 | // === Connection Recording Methods === | ||
| 138 | |||
| 139 | /// Record a connection attempt (success or failure) | ||
| 140 | pub fn record_connection_attempt(&self, relay: &str, success: bool) { | ||
| 141 | let result = if success { "success" } else { "failure" }; | ||
| 142 | self.connection_attempts_total | ||
| 143 | .with_label_values(&[relay, result]) | ||
| 144 | .inc(); | ||
| 145 | } | ||
| 146 | |||
| 147 | /// Set relay connection status | ||
| 148 | pub fn set_relay_connected(&self, relay: &str, connected: bool) { | ||
| 149 | self.relay_connected | ||
| 150 | .with_label_values(&[relay]) | ||
| 151 | .set(if connected { 1 } else { 0 }); | ||
| 152 | |||
| 153 | // Update connected count based on all relay values | ||
| 154 | // This is handled by update_connected_count() for accuracy | ||
| 155 | } | ||
| 156 | |||
| 157 | /// Update the total connected relay count | ||
| 158 | pub fn update_connected_count(&self, count: i64) { | ||
| 159 | self.relays_connected_total.set(count); | ||
| 160 | } | ||
| 161 | |||
| 162 | /// Increment connected count | ||
| 163 | pub fn inc_connected_count(&self) { | ||
| 164 | self.relays_connected_total.inc(); | ||
| 165 | } | ||
| 166 | |||
| 167 | /// Decrement connected count | ||
| 168 | pub fn dec_connected_count(&self) { | ||
| 169 | self.relays_connected_total.dec(); | ||
| 170 | } | ||
| 171 | |||
| 172 | // === Health Recording Methods === | ||
| 173 | |||
| 174 | /// Record relay health state change | ||
| 175 | pub fn record_health_state(&self, relay: &str, state: HealthState) { | ||
| 176 | let state_value = match state { | ||
| 177 | HealthState::Healthy => 1, | ||
| 178 | HealthState::Degraded => 2, | ||
| 179 | HealthState::Dead => 3, | ||
| 180 | }; | ||
| 181 | self.relay_status.with_label_values(&[relay]).set(state_value); | ||
| 182 | } | ||
| 183 | |||
| 184 | /// Record relay failure count | ||
| 185 | pub fn record_failure_count(&self, relay: &str, count: u32) { | ||
| 186 | self.relay_failures | ||
| 187 | .with_label_values(&[relay]) | ||
| 188 | .set(count as i64); | ||
| 189 | } | ||
| 190 | |||
| 191 | /// Update dead relay count | ||
| 192 | pub fn update_dead_count(&self, count: i64) { | ||
| 193 | self.relays_dead_total.set(count); | ||
| 194 | } | ||
| 195 | |||
| 196 | /// Increment dead relay count | ||
| 197 | pub fn inc_dead_count(&self) { | ||
| 198 | self.relays_dead_total.inc(); | ||
| 199 | } | ||
| 200 | |||
| 201 | /// Decrement dead relay count | ||
| 202 | pub fn dec_dead_count(&self) { | ||
| 203 | self.relays_dead_total.dec(); | ||
| 204 | } | ||
| 205 | |||
| 206 | // === Event Recording Methods === | ||
| 207 | |||
| 208 | /// Record a synced event by source type | ||
| 209 | /// | ||
| 210 | /// Source types: | ||
| 211 | /// - "live" - Real-time subscription events | ||
| 212 | /// - "startup" - Events from startup catchup | ||
| 213 | /// - "reconnect" - Events from reconnection catchup | ||
| 214 | /// - "daily" - Events from daily catchup | ||
| 215 | pub fn record_event(&self, source: &str) { | ||
| 216 | self.events_total.with_label_values(&[source]).inc(); | ||
| 217 | } | ||
| 218 | |||
| 219 | /// Record multiple events synced by source type | ||
| 220 | pub fn record_events(&self, source: &str, count: u64) { | ||
| 221 | self.events_total | ||
| 222 | .with_label_values(&[source]) | ||
| 223 | .inc_by(count); | ||
| 224 | } | ||
| 225 | |||
| 226 | /// Record a gap event filled during catchup | ||
| 227 | pub fn record_gap_event(&self, relay: &str) { | ||
| 228 | self.gap_events_total.with_label_values(&[relay]).inc(); | ||
| 229 | } | ||
| 230 | |||
| 231 | /// Record multiple gap events filled during catchup | ||
| 232 | pub fn record_gap_events(&self, relay: &str, count: u64) { | ||
| 233 | self.gap_events_total | ||
| 234 | .with_label_values(&[relay]) | ||
| 235 | .inc_by(count); | ||
| 236 | } | ||
| 237 | |||
| 238 | // === Summary Recording Methods === | ||
| 239 | |||
| 240 | /// Set the total tracked relay count | ||
| 241 | pub fn set_tracked_count(&self, count: i64) { | ||
| 242 | self.relays_tracked_total.set(count); | ||
| 243 | } | ||
| 244 | |||
| 245 | /// Increment tracked relay count | ||
| 246 | pub fn inc_tracked_count(&self) { | ||
| 247 | self.relays_tracked_total.inc(); | ||
| 248 | } | ||
| 249 | |||
| 250 | /// Get current tracked relay count | ||
| 251 | pub fn get_tracked_count(&self) -> i64 { | ||
| 252 | self.relays_tracked_total.get() | ||
| 253 | } | ||
| 254 | |||
| 255 | /// Get current connected relay count | ||
| 256 | pub fn get_connected_count(&self) -> i64 { | ||
| 257 | self.relays_connected_total.get() | ||
| 258 | } | ||
| 259 | |||
| 260 | /// Get current dead relay count | ||
| 261 | pub fn get_dead_count(&self) -> i64 { | ||
| 262 | self.relays_dead_total.get() | ||
| 263 | } | ||
| 264 | } | ||
| 265 | |||
| 266 | /// Event source types for metrics tracking | ||
| 267 | pub mod event_source { | ||
| 268 | /// Real-time subscription events | ||
| 269 | pub const LIVE: &str = "live"; | ||
| 270 | /// Events from startup catchup | ||
| 271 | pub const STARTUP: &str = "startup"; | ||
| 272 | /// Events from reconnection catchup | ||
| 273 | pub const RECONNECT: &str = "reconnect"; | ||
| 274 | /// Events from daily catchup | ||
| 275 | pub const DAILY: &str = "daily"; | ||
| 276 | } | ||
| 277 | |||
| 278 | #[cfg(test)] | ||
| 279 | mod tests { | ||
| 280 | use super::*; | ||
| 281 | |||
| 282 | fn create_test_registry() -> Registry { | ||
| 283 | Registry::new() | ||
| 284 | } | ||
| 285 | |||
| 286 | #[test] | ||
| 287 | fn test_metrics_registration() { | ||
| 288 | let registry = create_test_registry(); | ||
| 289 | let metrics = SyncMetrics::register(®istry); | ||
| 290 | assert!(metrics.is_ok()); | ||
| 291 | } | ||
| 292 | |||
| 293 | #[test] | ||
| 294 | fn test_connection_metrics() { | ||
| 295 | let registry = create_test_registry(); | ||
| 296 | let metrics = SyncMetrics::register(®istry).unwrap(); | ||
| 297 | |||
| 298 | metrics.record_connection_attempt("wss://relay1.example.com", true); | ||
| 299 | metrics.record_connection_attempt("wss://relay1.example.com", false); | ||
| 300 | metrics.record_connection_attempt("wss://relay2.example.com", true); | ||
| 301 | |||
| 302 | metrics.set_relay_connected("wss://relay1.example.com", true); | ||
| 303 | metrics.inc_connected_count(); | ||
| 304 | |||
| 305 | assert_eq!(metrics.get_connected_count(), 1); | ||
| 306 | } | ||
| 307 | |||
| 308 | #[test] | ||
| 309 | fn test_health_metrics() { | ||
| 310 | let registry = create_test_registry(); | ||
| 311 | let metrics = SyncMetrics::register(®istry).unwrap(); | ||
| 312 | |||
| 313 | metrics.record_health_state("wss://relay1.example.com", HealthState::Healthy); | ||
| 314 | metrics.record_health_state("wss://relay2.example.com", HealthState::Degraded); | ||
| 315 | metrics.record_health_state("wss://relay3.example.com", HealthState::Dead); | ||
| 316 | |||
| 317 | metrics.record_failure_count("wss://relay2.example.com", 5); | ||
| 318 | metrics.update_dead_count(1); | ||
| 319 | |||
| 320 | assert_eq!(metrics.get_dead_count(), 1); | ||
| 321 | } | ||
| 322 | |||
| 323 | #[test] | ||
| 324 | fn test_event_metrics() { | ||
| 325 | let registry = create_test_registry(); | ||
| 326 | let metrics = SyncMetrics::register(®istry).unwrap(); | ||
| 327 | |||
| 328 | metrics.record_event(event_source::LIVE); | ||
| 329 | metrics.record_events(event_source::STARTUP, 10); | ||
| 330 | metrics.record_gap_event("wss://relay1.example.com"); | ||
| 331 | metrics.record_gap_events("wss://relay2.example.com", 5); | ||
| 332 | } | ||
| 333 | |||
| 334 | #[test] | ||
| 335 | fn test_summary_metrics() { | ||
| 336 | let registry = create_test_registry(); | ||
| 337 | let metrics = SyncMetrics::register(®istry).unwrap(); | ||
| 338 | |||
| 339 | metrics.set_tracked_count(5); | ||
| 340 | assert_eq!(metrics.get_tracked_count(), 5); | ||
| 341 | |||
| 342 | metrics.inc_tracked_count(); | ||
| 343 | assert_eq!(metrics.get_tracked_count(), 6); | ||
| 344 | |||
| 345 | metrics.update_connected_count(3); | ||
| 346 | assert_eq!(metrics.get_connected_count(), 3); | ||
| 347 | } | ||
| 348 | } \ No newline at end of file | ||