From 762cd8e815e797f173f541795de774fbbf978fc3 Mon Sep 17 00:00:00 2001 From: DanConwayDev Date: Thu, 4 Dec 2025 13:35:08 +0000 Subject: docs: add monitoring-strategy --- docs/explanation/monitoring-strategy.md | 462 ++++++++++++++++++++++++++++++++ 1 file changed, 462 insertions(+) create mode 100644 docs/explanation/monitoring-strategy.md (limited to 'docs') diff --git a/docs/explanation/monitoring-strategy.md b/docs/explanation/monitoring-strategy.md new file mode 100644 index 0000000..4668305 --- /dev/null +++ b/docs/explanation/monitoring-strategy.md @@ -0,0 +1,462 @@ +# Monitoring Strategy - Design Document + +## Overview + +This document describes the logging and monitoring strategy for ngit-grasp, designed to help administrators: + +1. Monitor WebSocket connections per unique IP +2. Correlate resource spikes (memory, CPU) with usage patterns +3. Detect potential abuse (too many connections from a single IP) +4. Support future load-based scheduling of background jobs (GRASP-02 sync) + +## Architecture + +```mermaid +flowchart TB + subgraph ngit-grasp + HTTP[HTTP Service] + WS[WebSocket Handler] + GIT[Git Handlers] + RELAY[Nostr Relay] + + subgraph Metrics Module + REG[Prometheus Registry] + CT[ConnectionTracker] + MC[Metric Counters] + end + + ME["/metrics endpoint"] + end + + subgraph External + PROM[Prometheus Server] + GRAF[Grafana] + ADMIN[Admin Browser] + end + + HTTP --> ME + WS --> CT + WS --> MC + GIT --> MC + RELAY --> MC + + CT --> REG + MC --> REG + REG --> ME + + PROM -->|scrape /metrics| ME + GRAF -->|query| PROM + ADMIN -->|view dashboards| GRAF +``` + +## Metric Categories + +### 1. 
WebSocket Connection Metrics + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `ngit_websocket_connections_total` | Counter | - | Total WebSocket connections since startup | +| `ngit_websocket_connections_active` | Gauge | - | Current active WebSocket connections | +| `ngit_websocket_unique_ips` | Gauge | - | Number of unique IP addresses connected (NOT the IPs themselves) | +| `ngit_websocket_flagged_abusers` | Gauge | - | Number of IPs exceeding connection threshold | +| `ngit_websocket_connection_duration_seconds` | Histogram | - | Duration of WebSocket connections | +| `ngit_websocket_messages_received_total` | Counter | `type` | Messages received (REQ, EVENT, CLOSE) | +| `ngit_websocket_messages_sent_total` | Counter | `type` | Messages sent (EVENT, EOSE, OK, NOTICE) | + +**Privacy Note:** IP addresses are NEVER exposed in metrics. The `ConnectionTracker` maintains per-IP counts internally only for abuse detection, logging warnings when thresholds are exceeded. + +### 2. Git Operation Metrics + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `ngit_git_operations_total` | Counter | `operation`, `status` | Git operations (clone, fetch, push) | +| `ngit_git_operation_duration_seconds` | Histogram | `operation` | Duration of git operations | +| `ngit_git_bytes_total` | Counter | `direction` | Total bytes in/out for git operations | +| `ngit_git_push_authorization_total` | Counter | `result` | Push auth results (allowed, denied, error) | + +### 3. 
Top-N Repository Bandwidth Tracking + +To identify high-bandwidth repositories without creating cardinality explosion (which doesn't scale to 1000+ repos), we use a hybrid approach: + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `ngit_git_top_repos_bytes` | Gauge | `repo` | Top 10 repositories by bandwidth (refreshed every 60s) | + +**How it works:** +- All per-repo bandwidth is tracked internally in a concurrent map (`DashMap`) +- Every 60 seconds, the top 10 are calculated and exposed to Prometheus +- Previous repo labels are cleared before setting new ones +- Prometheus only ever sees ~10 label values, keeping cardinality low + +```rust +struct BandwidthTracker { + // Internal: tracks ALL repos (memory only, not exposed) + all_repos: DashMap<String, u64>, + + // Exposed to Prometheus: only top 10 + top_repos_gauge: GaugeVec, + + // Time of the last top-N refresh (NOTE: never updated in this sketch; the real implementation must reset it on refresh, which needs interior mutability because the methods take &self) + last_refresh: Instant, +} + +impl BandwidthTracker { + fn record_transfer(&self, repo_id: &str, bytes: u64) { + self.all_repos + .entry(repo_id.to_string()) + .and_modify(|v| *v += bytes) + .or_insert(bytes); + } + + fn maybe_refresh_top_n(&self) { + if self.last_refresh.elapsed() > Duration::from_secs(60) { + self.refresh_top_n(); + } + } + + fn refresh_top_n(&self) { + let mut sorted: Vec<_> = self.all_repos.iter() + .map(|r| (r.key().clone(), *r.value())) + .collect(); + sorted.sort_by(|a, b| b.1.cmp(&a.1)); + + // Clear old labels, set new top 10 + self.top_repos_gauge.reset(); + for (repo, bytes) in sorted.into_iter().take(10) { + self.top_repos_gauge + .with_label_values(&[&repo]) + .set(bytes as i64); + } + } +} +``` + +### 4. 
Nostr Event Metrics + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `ngit_events_received_total` | Counter | `kind` | Events received by kind | +| `ngit_events_stored_total` | Counter | `kind` | Events successfully stored | +| `ngit_events_rejected_total` | Counter | `kind`, `reason` | Events rejected and why | + +### 5. Repository Metrics + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `ngit_repositories_total` | Gauge | - | Total repositories hosted | + +### 6. System Health Metrics + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `ngit_uptime_seconds` | Counter | - | Seconds since startup | +| `ngit_build_info` | Gauge | `version`, `commit` | Build information | + +### 7. Future: Sync Metrics (GRASP-02) + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `ngit_sync_events_received_total` | Counter | `source` | Events from sync (live vs catchup) | +| `ngit_sync_relay_connections_active` | Gauge | - | Active outbound relay connections | +| `ngit_sync_catchup_gap_total` | Counter | - | Events found during catchup (sync failures) | + +## Connection Tracker Design + +The `ConnectionTracker` maintains per-IP connection counts internally for abuse detection. **IP addresses are never exposed in metrics** - only aggregate counts. 
+ +```mermaid +flowchart LR + subgraph ConnectionTracker + HM[Internal: HashMap IP to Count] + TH[Abuse Threshold] + CNT[Exposed: Unique IP Count] + FLAG[Exposed: Abuse Flag Count] + end + + CONN[New Connection] --> CHECK{Count >= Threshold?} + CHECK -->|No| INC[Increment Count] + CHECK -->|Yes| FLAG_IT[Flag as Abuse] + FLAG_IT --> LOG[Log Warning - IP in log only] + FLAG_IT --> FLAG + + DISC[Disconnection] --> DEC[Decrement Count] + DEC --> CLEAN{Count == 0?} + CLEAN -->|Yes| RM[Remove from Map] + + HM --> CNT +``` + +### Data Structure + +```rust +pub struct ConnectionTracker { + /// Active connections per IP (INTERNAL ONLY - never exposed to metrics) + connections: DashMap<IpAddr, ConnectionInfo>, + /// Threshold for abuse flagging + abuse_threshold: u32, + /// Prometheus gauges (aggregate counts only, no IPs) + active_connections: IntGauge, // Total connections + unique_ips: IntGauge, // len() of the connections map + flagged_abusers: IntGauge, // Count where flagged_as_abuse == true +} + +struct ConnectionInfo { + count: u32, + first_seen: Instant, + flagged_as_abuse: bool, +} +``` + +### What Gets Exposed vs Internal + +| Data | Location | Exposed? | +|------|----------|----------| +| Total connections | Prometheus | ✅ Yes | +| Unique IP count | Prometheus | ✅ Yes | +| Flagged abuser count | Prometheus | ✅ Yes | +| Actual IP addresses | Internal HashMap | ❌ No | +| IP + abuse flag | Logs (when flagged) | ⚠️ Logs only | + +### Thread Safety + +Using `DashMap` for concurrent access without a single global lock (it shards the map and locks per shard, so it is low-contention rather than strictly lock-free), as connection tracking happens across multiple tokio tasks. 
+ +## /metrics Endpoint + +The `/metrics` endpoint returns Prometheus text format: + +``` +# HELP ngit_websocket_connections_active Current active WebSocket connections +# TYPE ngit_websocket_connections_active gauge +ngit_websocket_connections_active 23 + +# HELP ngit_websocket_unique_ips Number of unique IP addresses connected +# TYPE ngit_websocket_unique_ips gauge +ngit_websocket_unique_ips 7 + +# HELP ngit_git_operations_total Git operations by type and status +# TYPE ngit_git_operations_total counter +ngit_git_operations_total{operation="clone",status="success"} 1247 +ngit_git_operations_total{operation="push",status="denied"} 12 +``` + +## Integration Points + +### HTTP Service Integration + +In [`src/http/mod.rs`](../../src/http/mod.rs): + +```rust +// Add to HttpService +struct HttpService { + // ... existing fields ... + metrics: Arc<Metrics>, +} + +// Add /metrics route handling +if path == "/metrics" { + let metrics_output = self.metrics.render(); + return Ok(Response::builder() + .status(200) + .header("content-type", "text/plain; version=0.0.4") + .body(Full::new(Bytes::from(metrics_output))) + .unwrap()); +} +``` + +### WebSocket Connection Tracking + +In the WebSocket upgrade handler: + +```rust +// On connection +let ip = addr.ip(); +metrics.connection_tracker.on_connect(ip); + +// Spawn connection handler +tokio::spawn(async move { + // ... handle connection ... 
+ // On disconnect + metrics.connection_tracker.on_disconnect(ip); +}); +``` + +### Git Handler Integration + +In [`src/git/handlers.rs`](../../src/git/handlers.rs): + +```rust +// Wrap git operations with metrics +let timer = metrics.git_operation_duration.start_timer(); +let result = git::handlers::handle_upload_pack(repo_path, body_bytes).await; +timer.observe_duration(); + +metrics.git_operations_total + .with_label_values(&["clone", result_status]) + .inc(); +``` + +## Configuration + +New configuration options in [`src/config.rs`](../../src/config.rs): + +| Option | CLI Flag | Environment Variable | Default | Description | +|--------|----------|---------------------|---------|-------------| +| Metrics enabled | `--metrics-enabled` | `NGIT_METRICS_ENABLED` | `true` | Enable /metrics endpoint | +| Abuse threshold | `--abuse-threshold` | `NGIT_ABUSE_THRESHOLD` | `10` | Max connections per IP before flagging | +| Metrics path | `--metrics-path` | `NGIT_METRICS_PATH` | `/metrics` | Path for metrics endpoint | + +## Crate Dependencies + +Add to `Cargo.toml`: + +```toml +# Metrics +prometheus = "0.13" +dashmap = "5" # Concurrent HashMap (sharded locks) +lazy_static = "1.4" # For static metric registration +``` + +## Module Structure + +``` +src/ +├── metrics/ +│ ├── mod.rs # Module exports, Metrics struct +│ ├── connection.rs # ConnectionTracker implementation +│ ├── definitions.rs # Metric definitions (lazy_static!) +│ └── render.rs # Prometheus format rendering +├── http/ +│ └── mod.rs # Add /metrics route +└── ... +``` + +## Grafana Dashboard + +A pre-built Grafana dashboard will be provided at `docs/grafana/ngit-grasp-dashboard.json` with panels for: + +1. **Overview Row** + - Active connections (gauge) + - Requests per second (graph) + - Git operations per minute (graph) + +2. **Connections Row** + - Active connections over time + - Unique IP count over time (aggregate only - IPs are never exported) + - Flagged abuser count (stat; the offending IPs appear only in server logs) + +3. 
**Git Operations Row** + - Clone/fetch/push rates + - Push authorization results (pie chart) + - Operation duration percentiles + +4. **Events Row** + - Events received by kind + - Events rejected by reason + - Active subscriptions + +## Deployment: Prometheus on NixOS + +Example NixOS configuration for Prometheus: + +```nix +services.prometheus = { + enable = true; + scrapeConfigs = [ + { + job_name = "ngit-grasp"; + static_configs = [{ + targets = [ "localhost:8080" ]; # ngit-grasp bind address + }]; + scrape_interval = "15s"; + metrics_path = "/metrics"; + } + ]; +}; + +services.grafana = { + enable = true; + settings.server.http_port = 3000; + provision.datasources.settings.datasources = [{ + name = "Prometheus"; + type = "prometheus"; + url = "http://localhost:9090"; + }]; +}; +``` + +## Future: Load-Based Sync Scheduling + +The metrics infrastructure enables future load-based scheduling for GRASP-02 sync jobs: + +```mermaid +flowchart TD + SYNC[Sync Manager] --> CHECK{Check Load} + CHECK --> MET[Query Metrics] + MET --> CPU{CPU > 80%?} + CPU -->|Yes| DELAY[Delay 5 min] + CPU -->|No| CONN{Connections > N?} + CONN -->|Yes| DELAY + CONN -->|No| RUN[Run Sync Job] + DELAY --> CHECK +``` + +The `Metrics` struct will expose a method for checking load: + +```rust +impl Metrics { + /// Check if system is under high load + pub fn is_high_load(&self) -> bool { + let active = self.websocket_connections_active.get(); + active > self.config.high_load_threshold + } +} +``` + +## Future Enhancement: Loki for Detailed Logging + +For detailed per-repository investigation at scale, consider adding **Loki** (log aggregation) in a future iteration: + +```rust +// Structured logging with tracing +tracing::info!( + repo = %repo_id, + npub = %npub, + bytes = bytes_transferred, + operation = "clone", + duration_ms = elapsed.as_millis(), + "git_transfer_complete" +); +``` + +Loki query examples: +```logql +# Find all transfers > 10MB +{job="ngit-grasp"} |= "git_transfer_complete" | 
json | bytes > 10000000 + +# Sum bytes by repo in last hour +sum by (repo) ( + sum_over_time({job="ngit-grasp"} |= "git_transfer_complete" | json | unwrap bytes [1h]) +) +``` + +This pairs with Prometheus for long-term trends while enabling ad-hoc deep dives. + +## Privacy Considerations + +- IP addresses are held only in memory and are never exposed via `/metrics`; an IP is written to the logs only when it is flagged for exceeding the abuse threshold +- No per-IP metrics are exported; the `/metrics` endpoint itself can be disabled via configuration (`--metrics-enabled`) +- Consider IP anonymization for GDPR compliance if needed + +## Summary + +| Component | Purpose | +|-----------|---------| +| `Metrics` struct | Central registry and access point | +| `ConnectionTracker` | Per-IP tracking with abuse detection | +| `/metrics` endpoint | Prometheus scraping interface | +| Grafana dashboard | Visualization and analysis | +| NixOS config | Easy deployment for operators | + +This strategy provides comprehensive observability without requiring a separate database - Prometheus handles all time-series storage and Grafana provides the visualization layer. \ No newline at end of file -- cgit v1.2.3