upleb.uk

Public git repos — served from a NIP-34 GRASP relay at git.upleb.uk

summaryrefslogtreecommitdiff
path: root/src/sync/metrics.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/sync/metrics.rs')
-rw-r--r--src/sync/metrics.rs348
1 files changed, 348 insertions, 0 deletions
diff --git a/src/sync/metrics.rs b/src/sync/metrics.rs
new file mode 100644
index 0000000..c93e583
--- /dev/null
+++ b/src/sync/metrics.rs
@@ -0,0 +1,348 @@
1//! Prometheus Metrics for Proactive Sync (GRASP-02 Phase 6)
2//!
3//! This module provides comprehensive sync monitoring metrics including:
4//! - Connection status and attempts per relay
5//! - Health state tracking (Healthy/Degraded/Dead)
6//! - Event sync tracking by source (live/startup/reconnect/daily catchup)
7//! - Gap events filled during catchup operations
8//!
9//! All metrics follow the `ngit_sync_` prefix convention.
10
11use prometheus::{IntCounterVec, IntGauge, IntGaugeVec, Opts, Registry};
12
13use super::health::HealthState;
14
15/// Prometheus metrics for the proactive sync system
16#[derive(Clone)]
17pub struct SyncMetrics {
18 // === Connection metrics ===
19 /// Per-relay connection status (1=connected, 0=disconnected)
20 relay_connected: IntGaugeVec,
21 /// Connection attempts by relay and result (success/failure)
22 connection_attempts_total: IntCounterVec,
23
24 // === Health metrics ===
25 /// Per-relay health status (healthy=1, degraded=2, dead=3)
26 relay_status: IntGaugeVec,
27 /// Per-relay consecutive failure count
28 relay_failures: IntGaugeVec,
29
30 // === Event metrics ===
31 /// Events synced by source (live/startup/reconnect/daily)
32 events_total: IntCounterVec,
33 /// Gap events filled during catchup, by relay
34 gap_events_total: IntCounterVec,
35
36 // === Summary metrics ===
37 /// Total relays discovered and tracked
38 relays_tracked_total: IntGauge,
39 /// Currently connected relay count
40 relays_connected_total: IntGauge,
41 /// Relays marked as dead
42 relays_dead_total: IntGauge,
43}
44
45impl SyncMetrics {
46 /// Register all sync metrics with the provided Prometheus registry
47 pub fn register(registry: &Registry) -> Result<Self, prometheus::Error> {
48 // Connection metrics
49 let relay_connected = IntGaugeVec::new(
50 Opts::new(
51 "ngit_sync_relay_connected",
52 "Relay connection status (1=connected, 0=disconnected)",
53 ),
54 &["relay"],
55 )?;
56 registry.register(Box::new(relay_connected.clone()))?;
57
58 let connection_attempts_total = IntCounterVec::new(
59 Opts::new(
60 "ngit_sync_connection_attempts_total",
61 "Total connection attempts by relay and result",
62 ),
63 &["relay", "result"],
64 )?;
65 registry.register(Box::new(connection_attempts_total.clone()))?;
66
67 // Health metrics
68 let relay_status = IntGaugeVec::new(
69 Opts::new(
70 "ngit_sync_relay_status",
71 "Relay health status (1=healthy, 2=degraded, 3=dead)",
72 ),
73 &["relay"],
74 )?;
75 registry.register(Box::new(relay_status.clone()))?;
76
77 let relay_failures = IntGaugeVec::new(
78 Opts::new(
79 "ngit_sync_relay_failures",
80 "Consecutive failure count per relay",
81 ),
82 &["relay"],
83 )?;
84 registry.register(Box::new(relay_failures.clone()))?;
85
86 // Event metrics
87 let events_total = IntCounterVec::new(
88 Opts::new(
89 "ngit_sync_events_total",
90 "Total events synced by source type",
91 ),
92 &["source"],
93 )?;
94 registry.register(Box::new(events_total.clone()))?;
95
96 let gap_events_total = IntCounterVec::new(
97 Opts::new(
98 "ngit_sync_gap_events_total",
99 "Gap events filled during catchup by relay",
100 ),
101 &["relay"],
102 )?;
103 registry.register(Box::new(gap_events_total.clone()))?;
104
105 // Summary metrics
106 let relays_tracked_total = IntGauge::with_opts(Opts::new(
107 "ngit_sync_relays_tracked_total",
108 "Total number of relays discovered and tracked",
109 ))?;
110 registry.register(Box::new(relays_tracked_total.clone()))?;
111
112 let relays_connected_total = IntGauge::with_opts(Opts::new(
113 "ngit_sync_relays_connected_total",
114 "Number of currently connected relays",
115 ))?;
116 registry.register(Box::new(relays_connected_total.clone()))?;
117
118 let relays_dead_total = IntGauge::with_opts(Opts::new(
119 "ngit_sync_relays_dead_total",
120 "Number of relays marked as dead",
121 ))?;
122 registry.register(Box::new(relays_dead_total.clone()))?;
123
124 Ok(Self {
125 relay_connected,
126 connection_attempts_total,
127 relay_status,
128 relay_failures,
129 events_total,
130 gap_events_total,
131 relays_tracked_total,
132 relays_connected_total,
133 relays_dead_total,
134 })
135 }
136
137 // === Connection Recording Methods ===
138
139 /// Record a connection attempt (success or failure)
140 pub fn record_connection_attempt(&self, relay: &str, success: bool) {
141 let result = if success { "success" } else { "failure" };
142 self.connection_attempts_total
143 .with_label_values(&[relay, result])
144 .inc();
145 }
146
147 /// Set relay connection status
148 pub fn set_relay_connected(&self, relay: &str, connected: bool) {
149 self.relay_connected
150 .with_label_values(&[relay])
151 .set(if connected { 1 } else { 0 });
152
153 // Update connected count based on all relay values
154 // This is handled by update_connected_count() for accuracy
155 }
156
157 /// Update the total connected relay count
158 pub fn update_connected_count(&self, count: i64) {
159 self.relays_connected_total.set(count);
160 }
161
162 /// Increment connected count
163 pub fn inc_connected_count(&self) {
164 self.relays_connected_total.inc();
165 }
166
167 /// Decrement connected count
168 pub fn dec_connected_count(&self) {
169 self.relays_connected_total.dec();
170 }
171
172 // === Health Recording Methods ===
173
174 /// Record relay health state change
175 pub fn record_health_state(&self, relay: &str, state: HealthState) {
176 let state_value = match state {
177 HealthState::Healthy => 1,
178 HealthState::Degraded => 2,
179 HealthState::Dead => 3,
180 };
181 self.relay_status.with_label_values(&[relay]).set(state_value);
182 }
183
184 /// Record relay failure count
185 pub fn record_failure_count(&self, relay: &str, count: u32) {
186 self.relay_failures
187 .with_label_values(&[relay])
188 .set(count as i64);
189 }
190
191 /// Update dead relay count
192 pub fn update_dead_count(&self, count: i64) {
193 self.relays_dead_total.set(count);
194 }
195
196 /// Increment dead relay count
197 pub fn inc_dead_count(&self) {
198 self.relays_dead_total.inc();
199 }
200
201 /// Decrement dead relay count
202 pub fn dec_dead_count(&self) {
203 self.relays_dead_total.dec();
204 }
205
206 // === Event Recording Methods ===
207
208 /// Record a synced event by source type
209 ///
210 /// Source types:
211 /// - "live" - Real-time subscription events
212 /// - "startup" - Events from startup catchup
213 /// - "reconnect" - Events from reconnection catchup
214 /// - "daily" - Events from daily catchup
215 pub fn record_event(&self, source: &str) {
216 self.events_total.with_label_values(&[source]).inc();
217 }
218
219 /// Record multiple events synced by source type
220 pub fn record_events(&self, source: &str, count: u64) {
221 self.events_total
222 .with_label_values(&[source])
223 .inc_by(count);
224 }
225
226 /// Record a gap event filled during catchup
227 pub fn record_gap_event(&self, relay: &str) {
228 self.gap_events_total.with_label_values(&[relay]).inc();
229 }
230
231 /// Record multiple gap events filled during catchup
232 pub fn record_gap_events(&self, relay: &str, count: u64) {
233 self.gap_events_total
234 .with_label_values(&[relay])
235 .inc_by(count);
236 }
237
238 // === Summary Recording Methods ===
239
240 /// Set the total tracked relay count
241 pub fn set_tracked_count(&self, count: i64) {
242 self.relays_tracked_total.set(count);
243 }
244
245 /// Increment tracked relay count
246 pub fn inc_tracked_count(&self) {
247 self.relays_tracked_total.inc();
248 }
249
250 /// Get current tracked relay count
251 pub fn get_tracked_count(&self) -> i64 {
252 self.relays_tracked_total.get()
253 }
254
255 /// Get current connected relay count
256 pub fn get_connected_count(&self) -> i64 {
257 self.relays_connected_total.get()
258 }
259
260 /// Get current dead relay count
261 pub fn get_dead_count(&self) -> i64 {
262 self.relays_dead_total.get()
263 }
264}
265
266/// Event source types for metrics tracking
267pub mod event_source {
268 /// Real-time subscription events
269 pub const LIVE: &str = "live";
270 /// Events from startup catchup
271 pub const STARTUP: &str = "startup";
272 /// Events from reconnection catchup
273 pub const RECONNECT: &str = "reconnect";
274 /// Events from daily catchup
275 pub const DAILY: &str = "daily";
276}
277
278#[cfg(test)]
279mod tests {
280 use super::*;
281
282 fn create_test_registry() -> Registry {
283 Registry::new()
284 }
285
286 #[test]
287 fn test_metrics_registration() {
288 let registry = create_test_registry();
289 let metrics = SyncMetrics::register(&registry);
290 assert!(metrics.is_ok());
291 }
292
293 #[test]
294 fn test_connection_metrics() {
295 let registry = create_test_registry();
296 let metrics = SyncMetrics::register(&registry).unwrap();
297
298 metrics.record_connection_attempt("wss://relay1.example.com", true);
299 metrics.record_connection_attempt("wss://relay1.example.com", false);
300 metrics.record_connection_attempt("wss://relay2.example.com", true);
301
302 metrics.set_relay_connected("wss://relay1.example.com", true);
303 metrics.inc_connected_count();
304
305 assert_eq!(metrics.get_connected_count(), 1);
306 }
307
308 #[test]
309 fn test_health_metrics() {
310 let registry = create_test_registry();
311 let metrics = SyncMetrics::register(&registry).unwrap();
312
313 metrics.record_health_state("wss://relay1.example.com", HealthState::Healthy);
314 metrics.record_health_state("wss://relay2.example.com", HealthState::Degraded);
315 metrics.record_health_state("wss://relay3.example.com", HealthState::Dead);
316
317 metrics.record_failure_count("wss://relay2.example.com", 5);
318 metrics.update_dead_count(1);
319
320 assert_eq!(metrics.get_dead_count(), 1);
321 }
322
323 #[test]
324 fn test_event_metrics() {
325 let registry = create_test_registry();
326 let metrics = SyncMetrics::register(&registry).unwrap();
327
328 metrics.record_event(event_source::LIVE);
329 metrics.record_events(event_source::STARTUP, 10);
330 metrics.record_gap_event("wss://relay1.example.com");
331 metrics.record_gap_events("wss://relay2.example.com", 5);
332 }
333
334 #[test]
335 fn test_summary_metrics() {
336 let registry = create_test_registry();
337 let metrics = SyncMetrics::register(&registry).unwrap();
338
339 metrics.set_tracked_count(5);
340 assert_eq!(metrics.get_tracked_count(), 5);
341
342 metrics.inc_tracked_count();
343 assert_eq!(metrics.get_tracked_count(), 6);
344
345 metrics.update_connected_count(3);
346 assert_eq!(metrics.get_connected_count(), 3);
347 }
348} \ No newline at end of file