diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2025-12-04 13:35:08 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2025-12-04 13:35:08 +0000 |
| commit | 762cd8e815e797f173f541795de774fbbf978fc3 (patch) | |
| tree | 8a4e62b2cfc27c5a1b815382e7b04ab821534df3 /docs/explanation | |
| parent | d9bc5ed7fddef3a26de8e69a7124e1dbe5b8602f (diff) | |
docs: add monitoring-strategy
Diffstat (limited to 'docs/explanation')
| -rw-r--r-- | docs/explanation/monitoring-strategy.md | 462 |
1 files changed, 462 insertions, 0 deletions
diff --git a/docs/explanation/monitoring-strategy.md b/docs/explanation/monitoring-strategy.md new file mode 100644 index 0000000..4668305 --- /dev/null +++ b/docs/explanation/monitoring-strategy.md | |||
| @@ -0,0 +1,462 @@ | |||
| 1 | # Monitoring Strategy - Design Document | ||
| 2 | |||
| 3 | ## Overview | ||
| 4 | |||
| 5 | This document describes the logging and monitoring strategy for ngit-grasp, designed to help administrators: | ||
| 6 | |||
| 7 | 1. Monitor WebSocket connections per unique IP | ||
| 8 | 2. Correlate resource spikes (memory, CPU) with usage patterns | ||
| 9 | 3. Detect potential abuse (too many connections from single IP) | ||
| 10 | 4. Support future load-based scheduling of background jobs (GRASP-02 sync) | ||
| 11 | |||
| 12 | ## Architecture | ||
| 13 | |||
| 14 | ```mermaid | ||
| 15 | flowchart TB | ||
| 16 | subgraph ngit-grasp | ||
| 17 | HTTP[HTTP Service] | ||
| 18 | WS[WebSocket Handler] | ||
| 19 | GIT[Git Handlers] | ||
| 20 | RELAY[Nostr Relay] | ||
| 21 | |||
| 22 | subgraph Metrics Module | ||
| 23 | REG[Prometheus Registry] | ||
| 24 | CT[ConnectionTracker] | ||
| 25 | MC[Metric Counters] | ||
| 26 | end | ||
| 27 | |||
| 28 | ME[/metrics endpoint] | ||
| 29 | end | ||
| 30 | |||
| 31 | subgraph External | ||
| 32 | PROM[Prometheus Server] | ||
| 33 | GRAF[Grafana] | ||
| 34 | ADMIN[Admin Browser] | ||
| 35 | end | ||
| 36 | |||
| 37 | HTTP --> ME | ||
| 38 | WS --> CT | ||
| 39 | WS --> MC | ||
| 40 | GIT --> MC | ||
| 41 | RELAY --> MC | ||
| 42 | |||
| 43 | CT --> REG | ||
| 44 | MC --> REG | ||
| 45 | REG --> ME | ||
| 46 | |||
| 47 | PROM -->|scrape /metrics| ME | ||
| 48 | GRAF -->|query| PROM | ||
| 49 | ADMIN -->|view dashboards| GRAF | ||
| 50 | ``` | ||
| 51 | |||
| 52 | ## Metric Categories | ||
| 53 | |||
| 54 | ### 1. WebSocket Connection Metrics | ||
| 55 | |||
| 56 | | Metric Name | Type | Labels | Description | | ||
| 57 | |------------|------|--------|-------------| | ||
| 58 | | `ngit_websocket_connections_total` | Counter | - | Total WebSocket connections since startup | | ||
| 59 | | `ngit_websocket_connections_active` | Gauge | - | Current active WebSocket connections | | ||
| 60 | | `ngit_websocket_unique_ips` | Gauge | - | Number of unique IP addresses connected (NOT the IPs themselves) | | ||
| 61 | | `ngit_websocket_flagged_abusers` | Gauge | - | Number of IPs exceeding connection threshold | | ||
| 62 | | `ngit_websocket_connection_duration_seconds` | Histogram | - | Duration of WebSocket connections | | ||
| 63 | | `ngit_websocket_messages_received_total` | Counter | `type` | Messages received (REQ, EVENT, CLOSE) | | ||
| 64 | | `ngit_websocket_messages_sent_total` | Counter | `type` | Messages sent (EVENT, EOSE, OK, NOTICE) | | ||
| 65 | |||
| 66 | **Privacy Note:** IP addresses are NEVER exposed in metrics. The `ConnectionTracker` maintains per-IP counts internally only for abuse detection, logging warnings when thresholds are exceeded. | ||
| 67 | |||
| 68 | ### 2. Git Operation Metrics | ||
| 69 | |||
| 70 | | Metric Name | Type | Labels | Description | | ||
| 71 | |------------|------|--------|-------------| | ||
| 72 | | `ngit_git_operations_total` | Counter | `operation`, `status` | Git operations (clone, fetch, push) | | ||
| 73 | | `ngit_git_operation_duration_seconds` | Histogram | `operation` | Duration of git operations | | ||
| 74 | | `ngit_git_bytes_total` | Counter | `direction` | Total bytes in/out for git operations | | ||
| 75 | | `ngit_git_push_authorization_total` | Counter | `result` | Push auth results (allowed, denied, error) | | ||
| 76 | |||
| 77 | ### 3. Top-N Repository Bandwidth Tracking | ||
| 78 | |||
| 79 | To identify high-bandwidth repositories without the label-cardinality explosion that a per-repository label would cause (which doesn't scale to 1000+ repos), we use a hybrid approach: | ||
| 80 | |||
| 81 | | Metric Name | Type | Labels | Description | | ||
| 82 | |------------|------|--------|-------------| | ||
| 83 | | `ngit_git_top_repos_bytes` | Gauge | `repo` | Top 10 repositories by bandwidth (refreshed every 60s) | | ||
| 84 | |||
| 85 | **How it works:** | ||
| 86 | - All per-repo bandwidth is tracked internally in a `DashMap<String, u64>` (keyed by repository ID) | ||
| 87 | - Every 60 seconds, the top 10 are calculated and exposed to Prometheus | ||
| 88 | - Previous repo labels are cleared before setting new ones | ||
| 89 | - Prometheus only ever sees ~10 label values, keeping cardinality low | ||
| 90 | |||
| 91 | ```rust | ||
| 92 | struct BandwidthTracker { | ||
| 93 | // Internal: tracks ALL repos (memory only, not exposed) | ||
| 94 | all_repos: DashMap<String, u64>, | ||
| 95 | |||
| 96 | // Exposed to Prometheus: only top 10 | ||
| 97 | top_repos_gauge: GaugeVec, | ||
| 98 | |||
| 99 | // Refresh interval | ||
| 100 | last_refresh: Instant, | ||
| 101 | } | ||
| 102 | |||
| 103 | impl BandwidthTracker { | ||
| 104 | fn record_transfer(&self, repo_id: &str, bytes: u64) { | ||
| 105 | self.all_repos | ||
| 106 | .entry(repo_id.to_string()) | ||
| 107 | .and_modify(|v| *v += bytes) | ||
| 108 | .or_insert(bytes); | ||
| 109 | } | ||
| 110 | |||
| 111 | fn maybe_refresh_top_n(&self) { | ||
| 112 | if self.last_refresh.elapsed() > Duration::from_secs(60) { | ||
| 113 | self.refresh_top_n(); | ||
| 114 | } | ||
| 115 | } | ||
| 116 | |||
| 117 | fn refresh_top_n(&self) { | ||
| 118 | let mut sorted: Vec<_> = self.all_repos.iter() | ||
| 119 | .map(|r| (r.key().clone(), *r.value())) | ||
| 120 | .collect(); | ||
| 121 | sorted.sort_by(|a, b| b.1.cmp(&a.1)); | ||
| 122 | |||
| 123 | // Clear old labels, set new top 10 | ||
| 124 | self.top_repos_gauge.reset(); | ||
| 125 | for (repo, bytes) in sorted.into_iter().take(10) { | ||
| 126 | self.top_repos_gauge | ||
| 127 | .with_label_values(&[&repo]) | ||
| 128 | .set(bytes as i64); | ||
| 129 | } | ||
| 130 | } | ||
| 131 | } | ||
| 132 | ``` | ||
| 133 | |||
| 134 | ### 4. Nostr Event Metrics | ||
| 135 | |||
| 136 | | Metric Name | Type | Labels | Description | | ||
| 137 | |------------|------|--------|-------------| | ||
| 138 | | `ngit_events_received_total` | Counter | `kind` | Events received by kind | | ||
| 139 | | `ngit_events_stored_total` | Counter | `kind` | Events successfully stored | | ||
| 140 | | `ngit_events_rejected_total` | Counter | `kind`, `reason` | Events rejected and why | | ||
| 141 | |||
| 142 | ### 5. Repository Metrics | ||
| 143 | |||
| 144 | | Metric Name | Type | Labels | Description | | ||
| 145 | |------------|------|--------|-------------| | ||
| 146 | | `ngit_repositories_total` | Gauge | - | Total repositories hosted | | ||
| 147 | |||
| 148 | ### 6. System Health Metrics | ||
| 149 | |||
| 150 | | Metric Name | Type | Labels | Description | | ||
| 151 | |------------|------|--------|-------------| | ||
| 152 | | `ngit_uptime_seconds` | Counter | - | Seconds since startup | | ||
| 153 | | `ngit_build_info` | Gauge | `version`, `commit` | Build information | | ||
| 154 | |||
| 155 | ### 7. Future: Sync Metrics (GRASP-02) | ||
| 156 | |||
| 157 | | Metric Name | Type | Labels | Description | | ||
| 158 | |------------|------|--------|-------------| | ||
| 159 | | `ngit_sync_events_received_total` | Counter | `source` | Events from sync (live vs catchup) | | ||
| 160 | | `ngit_sync_relay_connections_active` | Gauge | - | Active outbound relay connections | | ||
| 161 | | `ngit_sync_catchup_gap_total` | Counter | - | Events found during catchup (sync failures) | | ||
| 162 | |||
| 163 | ## Connection Tracker Design | ||
| 164 | |||
| 165 | The `ConnectionTracker` maintains per-IP connection counts internally for abuse detection. **IP addresses are never exposed in metrics** - only aggregate counts. | ||
| 166 | |||
| 167 | ```mermaid | ||
| 168 | flowchart LR | ||
| 169 | subgraph ConnectionTracker | ||
| 170 | HM[Internal: HashMap IP to Count] | ||
| 171 | TH[Abuse Threshold] | ||
| 172 | CNT[Exposed: Unique IP Count] | ||
| 173 | FLAG[Exposed: Abuse Flag Count] | ||
| 174 | end | ||
| 175 | |||
| 176 | CONN[New Connection] --> CHECK{Count >= Threshold?} | ||
| 177 | CHECK -->|No| INC[Increment Count] | ||
| 178 | CHECK -->|Yes| FLAG_IT[Flag as Abuse] | ||
| 179 | FLAG_IT --> LOG[Log Warning - IP in log only] | ||
| 180 | FLAG_IT --> FLAG | ||
| 181 | |||
| 182 | DISC[Disconnection] --> DEC[Decrement Count] | ||
| 183 | DEC --> CLEAN{Count == 0?} | ||
| 184 | CLEAN -->|Yes| RM[Remove from Map] | ||
| 185 | |||
| 186 | HM --> CNT | ||
| 187 | ``` | ||
| 188 | |||
| 189 | ### Data Structure | ||
| 190 | |||
| 191 | ```rust | ||
| 192 | pub struct ConnectionTracker { | ||
| 193 | /// Active connections per IP (INTERNAL ONLY - never exposed to metrics) | ||
| 194 | connections: DashMap<IpAddr, ConnectionInfo>, | ||
| 195 | /// Threshold for abuse flagging | ||
| 196 | abuse_threshold: u32, | ||
| 197 | /// Prometheus gauges (aggregate counts only, no IPs) | ||
| 198 | active_connections: IntGauge, // Total connections | ||
| 199 | unique_ips: IntGauge, // len() of HashMap | ||
| 200 | flagged_abusers: IntGauge, // Count where flagged_as_abuse == true | ||
| 201 | } | ||
| 202 | |||
| 203 | struct ConnectionInfo { | ||
| 204 | count: u32, | ||
| 205 | first_seen: Instant, | ||
| 206 | flagged_as_abuse: bool, | ||
| 207 | } | ||
| 208 | ``` | ||
| 209 | |||
| 210 | ### What Gets Exposed vs Internal | ||
| 211 | |||
| 212 | | Data | Location | Exposed? | | ||
| 213 | |------|----------|----------| | ||
| 214 | | Total connections | Prometheus | ✅ Yes | | ||
| 215 | | Unique IP count | Prometheus | ✅ Yes | | ||
| 216 | | Flagged abuser count | Prometheus | ✅ Yes | | ||
| 217 | | Actual IP addresses | Internal HashMap | ❌ No | | ||
| 218 | | IP + abuse flag | Logs (when flagged) | ⚠️ Logs only | | ||
| 219 | |||
| 220 | ### Thread Safety | ||
| 221 | |||
| 222 | `DashMap` is used for concurrent access without a single global lock (it shards the map internally and locks per shard), as connection tracking happens across multiple tokio tasks. | ||
| 223 | |||
| 224 | ## /metrics Endpoint | ||
| 225 | |||
| 226 | The `/metrics` endpoint returns Prometheus text format: | ||
| 227 | |||
| 228 | ``` | ||
| 229 | # HELP ngit_websocket_connections_active Current active WebSocket connections | ||
| 230 | # TYPE ngit_websocket_connections_active gauge | ||
| 231 | ngit_websocket_connections_active 23 | ||
| 232 | |||
| 233 | # HELP ngit_websocket_unique_ips Number of unique IP addresses connected | ||
| 234 | # TYPE ngit_websocket_unique_ips gauge | ||
| 235 | # (aggregate count only - IP addresses are never exposed as labels) | ||
| 236 | ngit_websocket_unique_ips 7 | ||
| 237 | |||
| 238 | # HELP ngit_git_operations_total Git operations by type and status | ||
| 239 | # TYPE ngit_git_operations_total counter | ||
| 240 | ngit_git_operations_total{operation="clone",status="success"} 1247 | ||
| 241 | ngit_git_operations_total{operation="push",status="denied"} 12 | ||
| 242 | ``` | ||
| 243 | |||
| 244 | ## Integration Points | ||
| 245 | |||
| 246 | ### HTTP Service Integration | ||
| 247 | |||
| 248 | In [`src/http/mod.rs`](../../src/http/mod.rs): | ||
| 249 | |||
| 250 | ```rust | ||
| 251 | // Add to HttpService | ||
| 252 | struct HttpService { | ||
| 253 | // ... existing fields ... | ||
| 254 | metrics: Arc<Metrics>, | ||
| 255 | } | ||
| 256 | |||
| 257 | // Add /metrics route handling | ||
| 258 | if path == "/metrics" { | ||
| 259 | let metrics_output = self.metrics.render(); | ||
| 260 | return Ok(Response::builder() | ||
| 261 | .status(200) | ||
| 262 | .header("content-type", "text/plain; version=0.0.4") | ||
| 263 | .body(Full::new(Bytes::from(metrics_output))) | ||
| 264 | .unwrap()); | ||
| 265 | } | ||
| 266 | ``` | ||
| 267 | |||
| 268 | ### WebSocket Connection Tracking | ||
| 269 | |||
| 270 | In the WebSocket upgrade handler: | ||
| 271 | |||
| 272 | ```rust | ||
| 273 | // On connection | ||
| 274 | let ip = addr.ip(); | ||
| 275 | metrics.connection_tracker.on_connect(ip); | ||
| 276 | |||
| 277 | // Spawn connection handler | ||
| 278 | tokio::spawn(async move { | ||
| 279 | // ... handle connection ... | ||
| 280 | // On disconnect | ||
| 281 | metrics.connection_tracker.on_disconnect(ip); | ||
| 282 | }); | ||
| 283 | ``` | ||
| 284 | |||
| 285 | ### Git Handler Integration | ||
| 286 | |||
| 287 | In [`src/git/handlers.rs`](../../src/git/handlers.rs): | ||
| 288 | |||
| 289 | ```rust | ||
| 290 | // Wrap git operations with metrics | ||
| 291 | let timer = metrics.git_operation_duration.start_timer(); | ||
| 292 | let result = git::handlers::handle_upload_pack(repo_path, body_bytes).await; | ||
| 293 | timer.observe_duration(); | ||
| 294 | |||
| 295 | metrics.git_operations_total | ||
| 296 | .with_label_values(&["clone", result_status]) | ||
| 297 | .inc(); | ||
| 298 | ``` | ||
| 299 | |||
| 300 | ## Configuration | ||
| 301 | |||
| 302 | New configuration options in [`src/config.rs`](../../src/config.rs): | ||
| 303 | |||
| 304 | | Option | CLI Flag | Environment Variable | Default | Description | | ||
| 305 | |--------|----------|---------------------|---------|-------------| | ||
| 306 | | Metrics enabled | `--metrics-enabled` | `NGIT_METRICS_ENABLED` | `true` | Enable /metrics endpoint | | ||
| 307 | | Abuse threshold | `--abuse-threshold` | `NGIT_ABUSE_THRESHOLD` | `10` | Max connections per IP before flagging | | ||
| 308 | | Metrics path | `--metrics-path` | `NGIT_METRICS_PATH` | `/metrics` | Path for metrics endpoint | | ||
| 309 | |||
| 310 | ## Crate Dependencies | ||
| 311 | |||
| 312 | Add to `Cargo.toml`: | ||
| 313 | |||
| 314 | ```toml | ||
| 315 | # Metrics | ||
| 316 | prometheus = "0.13" | ||
| 317 | dashmap = "5" # Lock-free concurrent HashMap | ||
| 318 | lazy_static = "1.4" # For static metric registration | ||
| 319 | ``` | ||
| 320 | |||
| 321 | ## Module Structure | ||
| 322 | |||
| 323 | ``` | ||
| 324 | src/ | ||
| 325 | ├── metrics/ | ||
| 326 | │ ├── mod.rs # Module exports, Metrics struct | ||
| 327 | │ ├── connection.rs # ConnectionTracker implementation | ||
| 328 | │ ├── definitions.rs # Metric definitions (lazy_static!) | ||
| 329 | │ └── render.rs # Prometheus format rendering | ||
| 330 | ├── http/ | ||
| 331 | │ └── mod.rs # Add /metrics route | ||
| 332 | └── ... | ||
| 333 | ``` | ||
| 334 | |||
| 335 | ## Grafana Dashboard | ||
| 336 | |||
| 337 | A pre-built Grafana dashboard will be provided at `docs/grafana/ngit-grasp-dashboard.json` with panels for: | ||
| 338 | |||
| 339 | 1. **Overview Row** | ||
| 340 | - Active connections (gauge) | ||
| 341 | - Requests per second (graph) | ||
| 342 | - Git operations per minute (graph) | ||
| 343 | |||
| 344 | 2. **Connections Row** | ||
| 345 | - Active connections over time | ||
| 346 | - Unique IP count over time | ||
| 347 | - Flagged abuser count (IP addresses appear only in logs, never in metrics) | ||
| 348 | |||
| 349 | 3. **Git Operations Row** | ||
| 350 | - Clone/fetch/push rates | ||
| 351 | - Push authorization results (pie chart) | ||
| 352 | - Operation duration percentiles | ||
| 353 | |||
| 354 | 4. **Events Row** | ||
| 355 | - Events received by kind | ||
| 356 | - Events rejected by reason | ||
| 357 | - Active subscriptions | ||
| 358 | |||
| 359 | ## Deployment: Prometheus on NixOS | ||
| 360 | |||
| 361 | Example NixOS configuration for Prometheus: | ||
| 362 | |||
| 363 | ```nix | ||
| 364 | services.prometheus = { | ||
| 365 | enable = true; | ||
| 366 | scrapeConfigs = [ | ||
| 367 | { | ||
| 368 | job_name = "ngit-grasp"; | ||
| 369 | static_configs = [{ | ||
| 370 | targets = [ "localhost:8080" ]; # ngit-grasp bind address | ||
| 371 | }]; | ||
| 372 | scrape_interval = "15s"; | ||
| 373 | metrics_path = "/metrics"; | ||
| 374 | } | ||
| 375 | ]; | ||
| 376 | }; | ||
| 377 | |||
| 378 | services.grafana = { | ||
| 379 | enable = true; | ||
| 380 | settings.server.http_port = 3000; | ||
| 381 | provision.datasources.settings.datasources = [{ | ||
| 382 | name = "Prometheus"; | ||
| 383 | type = "prometheus"; | ||
| 384 | url = "http://localhost:9090"; | ||
| 385 | }]; | ||
| 386 | }; | ||
| 387 | ``` | ||
| 388 | |||
| 389 | ## Future: Load-Based Sync Scheduling | ||
| 390 | |||
| 391 | The metrics infrastructure enables future load-based scheduling for GRASP-02 sync jobs: | ||
| 392 | |||
| 393 | ```mermaid | ||
| 394 | flowchart TD | ||
| 395 | SYNC[Sync Manager] --> CHECK{Check Load} | ||
| 396 | CHECK --> MET[Query Metrics] | ||
| 397 | MET --> CPU{CPU > 80%?} | ||
| 398 | CPU -->|Yes| DELAY[Delay 5 min] | ||
| 399 | CPU -->|No| CONN{Connections > N?} | ||
| 400 | CONN -->|Yes| DELAY | ||
| 401 | CONN -->|No| RUN[Run Sync Job] | ||
| 402 | DELAY --> CHECK | ||
| 403 | ``` | ||
| 404 | |||
| 405 | The `Metrics` struct will expose a method for checking load: | ||
| 406 | |||
| 407 | ```rust | ||
| 408 | impl Metrics { | ||
| 409 | /// Check if system is under high load | ||
| 410 | pub fn is_high_load(&self) -> bool { | ||
| 411 | let active = self.websocket_connections_active.get(); | ||
| 412 | active > self.config.high_load_threshold | ||
| 413 | } | ||
| 414 | } | ||
| 415 | ``` | ||
| 416 | |||
| 417 | ## Future Enhancement: Loki for Detailed Logging | ||
| 418 | |||
| 419 | For detailed per-repository investigation at scale, consider adding **Loki** (log aggregation) in a future iteration: | ||
| 420 | |||
| 421 | ```rust | ||
| 422 | // Structured logging with tracing | ||
| 423 | tracing::info!( | ||
| 424 | repo = %repo_id, | ||
| 425 | npub = %npub, | ||
| 426 | bytes = bytes_transferred, | ||
| 427 | operation = "clone", | ||
| 428 | duration_ms = elapsed.as_millis(), | ||
| 429 | "git_transfer_complete" | ||
| 430 | ); | ||
| 431 | ``` | ||
| 432 | |||
| 433 | Loki query examples: | ||
| 434 | ```logql | ||
| 435 | # Find all transfers > 10MB | ||
| 436 | {job="ngit-grasp"} |= "git_transfer_complete" | json | bytes > 10000000 | ||
| 437 | |||
| 438 | # Sum bytes by repo in last hour | ||
| 439 | sum by (repo) ( | ||
| 440 | sum_over_time({job="ngit-grasp"} |= "git_transfer_complete" | json | unwrap bytes [1h]) | ||
| 441 | ) | ||
| 442 | ``` | ||
| 443 | |||
| 444 | This pairs with Prometheus for long-term trends while enabling ad-hoc deep dives. | ||
| 445 | |||
| 446 | ## Privacy Considerations | ||
| 447 | |||
| 448 | - IP addresses are held only in memory; an IP is written to the logs only when it is flagged for exceeding the abuse threshold | ||
| 449 | - Per-IP metrics can be disabled via configuration | ||
| 450 | - Consider IP anonymization for GDPR compliance if needed | ||
| 451 | |||
| 452 | ## Summary | ||
| 453 | |||
| 454 | | Component | Purpose | | ||
| 455 | |-----------|---------| | ||
| 456 | | `Metrics` struct | Central registry and access point | | ||
| 457 | | `ConnectionTracker` | Per-IP tracking with abuse detection | | ||
| 458 | | `/metrics` endpoint | Prometheus scraping interface | | ||
| 459 | | Grafana dashboard | Visualization and analysis | | ||
| 460 | | NixOS config | Easy deployment for operators | | ||
| 461 | |||
| 462 | This strategy provides comprehensive observability without requiring a separate database - Prometheus handles all time-series storage and Grafana provides the visualization layer. \ No newline at end of file | ||