upleb.uk

Public git repos — served from a NIP-34 GRASP relay at git.upleb.uk

summaryrefslogtreecommitdiff
path: root/docs/how-to/migration-scripts/10-check-git-sync.sh
diff options
context:
space:
mode:
authorDanConwayDev <DanConwayDev@protonmail.com>2026-01-23 11:10:55 +0000
committerDanConwayDev <DanConwayDev@protonmail.com>2026-01-27 20:37:56 +0000
commit800dbfaa82428b897e271d0eb5d9e4c0f107f80b (patch)
tree1d64c3dfc64fcfcf0d034cd46ae7eb11620ccd96 /docs/how-to/migration-scripts/10-check-git-sync.sh
parenta5504395c946bdf28b5ad0e0148ff371ca33d4d3 (diff)
Add Phase 2 migration script for git sync verification
- Compares state event refs to actual git data on disk
- Uses git show-ref to handle both loose and packed refs
- Outputs TSV format compatible with Phase 3 categorization
- Optional --categorize flag for inline categorization
- Includes progress indicators and ETA (~20 min runtime on VPS)
- Improved error handling and validation over original script
Diffstat (limited to 'docs/how-to/migration-scripts/10-check-git-sync.sh')
-rwxr-xr-xdocs/how-to/migration-scripts/10-check-git-sync.sh557
1 files changed, 557 insertions, 0 deletions
diff --git a/docs/how-to/migration-scripts/10-check-git-sync.sh b/docs/how-to/migration-scripts/10-check-git-sync.sh
new file mode 100755
index 0000000..493d50a
--- /dev/null
+++ b/docs/how-to/migration-scripts/10-check-git-sync.sh
@@ -0,0 +1,557 @@
1#!/usr/bin/env bash
2#
3# 10-check-git-sync.sh - Compare state events to actual git data on disk
4#
5# PHASE 2 of the ngit-relay to ngit-grasp migration analysis pipeline.
6# Compares kind 30618 state events against actual git refs on disk.
7#
8# USAGE:
9# ./10-check-git-sync.sh <state-events.json> <git-base-dir> <output-dir> [--categorize]
10#
11# EXAMPLES:
12# # Check prod relay against prod git data
13# ./10-check-git-sync.sh output/prod/raw/state-events.json /var/lib/ngit-relay/git output/prod
14#
15# # Check archive relay against archive git data
16# ./10-check-git-sync.sh output/archive/raw/state-events.json /var/lib/ngit-relay-archive/git output/archive
17#
18# # Check and categorize in one step (convenience mode)
19# ./10-check-git-sync.sh output/prod/raw/state-events.json /var/lib/ngit-relay/git output/prod --categorize
20#
21# INPUT:
22# state-events.json - JSONL file from Phase 1 (01-fetch-events.sh)
23# One kind 30618 event per line
24# git-base-dir - Base directory containing git repos
25# Structure: <git-base>/<npub>/<repo>.git/
26#
27# OUTPUT:
28# <output-dir>/git-sync-status.tsv - Tab-separated values:
29# repo<TAB>npub<TAB>state_refs<TAB>git_refs<TAB>matches<TAB>reason
30#
31# With --categorize flag, also outputs:
32# <output-dir>/category1-complete-match.txt
33# <output-dir>/category2-empty-blank.txt
34# <output-dir>/category3-partial-match.txt
35# <output-dir>/category4-no-match.txt
36#
37# CATEGORIES:
38# 1. Complete Match - All refs in state event match git data perfectly
39# 2. Empty/Blank - No git data available (directory missing or empty)
40# 3. Partial Match - Some refs match, some don't
41# 4. No Match - Git data exists but commit hashes don't match
42#
43# PREREQUISITES:
44# - nak (for npub encoding) - https://github.com/fiatjaf/nak
45# - jq (for JSON parsing)
46# - Read access to git directories (may need sudo)
47#
48# RUNTIME: ~20 minutes on VPS (git operations are slow)
49#
50# NOTES:
51# - Must run on VPS with access to git directories
52# - Progress indicator updates every 10 events
53# - Handles packed refs (git show-ref) and loose refs
54#
55# SEE ALSO:
56# docs/how-to/migrate-ngit-relay-to-ngit-grasp.md - Full migration guide
57# 01-fetch-events.sh - Phase 1 script that produces input for this script
58# 20-categorize.sh - Phase 3a script that consumes output from this script
59#
60
set -euo pipefail

# Initialise the colour variables used by the log_* helpers.
# Every log helper writes to stderr, so colours are enabled only when
# stderr (fd 2) is a terminal; otherwise the variables are left empty so
# redirected logs contain no escape sequences.
# Args: none
# Returns: nothing; sets RED, GREEN, YELLOW, BLUE and NC
setup_colors() {
    if [[ -t 2 ]]; then
        RED='\033[0;31m'
        GREEN='\033[0;32m'
        YELLOW='\033[0;33m'
        BLUE='\033[0;34m'
        NC='\033[0m'
    else
        RED=''
        GREEN=''
        YELLOW=''
        BLUE=''
        NC=''
    fi
}

setup_colors
77
# Write an informational message to stderr, prefixed with a blue [INFO] tag.
# Args: $*=message words (joined with single spaces)
log_info() {
    local line="${BLUE}[INFO]${NC} $*"
    # %b expands backslash escapes the same way `echo -e` does
    printf '%b\n' "$line" >&2
}
81
# Write a success message to stderr, prefixed with a green [OK] tag.
# Args: $*=message words (joined with single spaces)
log_success() {
    local line="${GREEN}[OK]${NC} $*"
    # %b expands backslash escapes the same way `echo -e` does
    printf '%b\n' "$line" >&2
}
85
# Write a warning message to stderr, prefixed with a yellow [WARN] tag.
# Args: $*=message words (joined with single spaces)
log_warn() {
    local line="${YELLOW}[WARN]${NC} $*"
    # %b expands backslash escapes the same way `echo -e` does
    printf '%b\n' "$line" >&2
}
89
# Write an error message to stderr, prefixed with a red [ERROR] tag.
# Args: $*=message words (joined with single spaces)
log_error() {
    local line="${RED}[ERROR]${NC} $*"
    # %b expands backslash escapes the same way `echo -e` does
    printf '%b\n' "$line" >&2
}
93
# Write a progress message to stderr without a trailing newline.
# The leading carriage return moves the cursor back to column 0 so that
# successive calls overwrite the same terminal line.
# Args: $*=message words (joined with single spaces)
log_progress() {
    local line="\r${BLUE}[PROGRESS]${NC} $*"
    # %b expands backslash escapes (including the \r above) like `echo -e`
    printf '%b' "$line" >&2
}
98
# Print the command-line help text to stdout and terminate the script.
# Args: none
# Returns: never returns; exits with status 1
usage() {
    cat <<EOF
Usage: $0 <state-events.json> <git-base-dir> <output-dir> [--categorize]

Arguments:
 state-events.json JSONL file from Phase 1 (kind 30618 events)
 git-base-dir Base directory for git repos (e.g., /var/lib/ngit-relay/git)
 output-dir Directory to store output files
 --categorize Optional: also output category files (like Phase 3)

Examples:
 $0 output/prod/raw/state-events.json /var/lib/ngit-relay/git output/prod
 $0 output/archive/raw/state-events.json /var/lib/ngit-relay-archive/git output/archive

Output:
 git-sync-status.tsv - TSV with: repo, npub, state_refs, git_refs, matches, reason
EOF
    exit 1
}
116
117# Check prerequisites
# Verify that every external tool this script invokes is installed.
# Each missing tool is logged before exiting, so the user sees the full
# list in one run. git is checked as well as nak and jq because the
# comparison relies on `git show-ref`.
# Args: $@=original script arguments (optional; only echoed in the
#       nix-shell hint so the suggested command can be copy-pasted)
# Returns: nothing on success; exits 1 if any tool is missing
check_prerequisites() {
    local missing=0

    if ! command -v nak &> /dev/null; then
        log_error "nak not found. Install from: https://github.com/fiatjaf/nak"
        log_error "Or run: nix-shell -p nak jq --run \"$0 $*\""
        missing=1
    fi

    if ! command -v jq &> /dev/null; then
        log_error "jq not found. Install with your package manager."
        missing=1
    fi

    if ! command -v git &> /dev/null; then
        log_error "git not found. Install with your package manager."
        missing=1
    fi

    if [[ $missing -eq 1 ]]; then
        exit 1
    fi
}
136
137# Convert hex pubkey to npub
138# Args: $1=hex_pubkey
139# Returns: npub string or empty on error
# Encode a hex public key as an npub using nak.
# Args: $1=hex_pubkey
# Returns: npub string on stdout, or an empty line if nak fails
hex_to_npub() {
    local pubkey_hex="$1"

    if ! nak encode npub "$pubkey_hex" 2>/dev/null; then
        echo ""
    fi
}
144
145# Count refs in state event (only refs/heads/)
146# Args: $1=event_json
147# Returns: count
# Count the refs/heads/ tags carried by a state event.
# Args: $1=event_json
# Returns: count on stdout; "0" is printed when jq fails
count_state_refs() {
    local event_json="$1"
    local heads_filter='[.tags[] | select(.[0] | startswith("refs/heads/"))] | length'

    if ! jq "$heads_filter" <<< "$event_json" 2>/dev/null; then
        echo "0"
    fi
}
152
153# Get git refs from disk
154# Args: $1=git_dir
155# Returns: count of refs/heads/ refs
# Count the refs/heads/ refs present in a git directory on disk.
# Args: $1=git_dir
# Returns: exactly one line on stdout holding the count
count_git_refs() {
    local git_dir="$1"
    local ref_lines

    if [[ ! -d "$git_dir" ]]; then
        echo "0"
        return
    fi

    # Use git show-ref to handle both packed and loose refs. The output is
    # captured first so nothing is printed when show-ref fails; show-ref
    # exits non-zero when there are no heads, and piping straight into wc
    # would emit a "0" before the fallback emits a second count.
    if ref_lines=$(git --git-dir="$git_dir" show-ref --heads 2>/dev/null) \
        && [[ -n "$ref_lines" ]]; then
        wc -l <<< "$ref_lines" | tr -d ' '
        return
    fi

    # Fallback: count loose ref files
    if [[ -d "$git_dir/refs/heads" ]]; then
        find "$git_dir/refs/heads" -type f 2>/dev/null | wc -l | tr -d ' '
    else
        echo "0"
    fi
}
177
178# Get ref hash from git directory
179# Args: $1=git_dir, $2=ref_path (e.g., refs/heads/main)
180# Returns: commit hash or empty
# Look up the commit hash a ref points to in a git directory.
# Args: $1=git_dir, $2=ref_path (e.g., refs/heads/main)
# Returns: commit hash on stdout, or empty if the ref cannot be resolved
get_git_ref_hash() {
    local git_dir="$1"
    local ref_path="$2"
    local hash=""

    # Try git show-ref first (handles packed refs). --verify requires an
    # exact ref path; a bare pattern is matched against the tail of every
    # ref name and could return another ref's hash.
    hash=$(git --git-dir="$git_dir" show-ref --verify --hash -- "$ref_path" 2>/dev/null) || hash=""

    if [[ -n "$hash" ]]; then
        echo "$hash"
        return
    fi

    # Fallback: read the loose ref file, stripping its trailing newline
    local ref_file="$git_dir/$ref_path"
    if [[ -f "$ref_file" ]]; then
        tr -d '\n' 2>/dev/null < "$ref_file" || echo ""
    else
        echo ""
    fi
}
202
203# Compare state event refs to git refs
204# Args: $1=event_json, $2=git_dir
205# Returns: count of matching refs
# Compare the refs/heads/ tags of a state event against the git directory.
# Args: $1=event_json, $2=git_dir
# Returns: count of refs whose hash in the event equals the hash on disk
count_matching_refs() {
    local event="$1"
    local git_dir="$2"
    local matching=0
    local ref_path expected_hash actual_hash

    # A single jq call emits "<ref>\t<hash>" per heads tag, instead of
    # spawning two extra jq processes per ref. tostring turns a missing
    # hash into the literal "null", which is skipped below.
    while IFS=$'\t' read -r ref_path expected_hash; do
        # Skip if not a heads ref or hash is missing
        [[ "$ref_path" == refs/heads/* ]] || continue
        [[ -n "$expected_hash" && "$expected_hash" != "null" ]] || continue

        # Get actual hash from git
        actual_hash=$(get_git_ref_hash "$git_dir" "$ref_path")

        if [[ "$expected_hash" == "$actual_hash" ]]; then
            matching=$((matching + 1))
        fi
    done < <(jq -r '.tags[] | select(.[0] | startswith("refs/heads/")) | [.[0], (.[1] | tostring)] | @tsv' <<< "$event" 2>/dev/null)

    echo "$matching"
}
234
235# Categorize a single entry
236# Args: $1=state_refs, $2=git_refs, $3=matches, $4=reason
237# Returns: category number (1-4)
# Decide which migration category a TSV entry belongs to.
# Args: $1=state_refs, $2=git_refs, $3=matches, $4=reason
# Returns: category number (1-4) on stdout
#   1 = complete match, 2 = empty/blank, 3 = partial match, 4 = no match
categorize_entry() {
    local n_state="$1"
    local n_git="$2"
    local n_match="$3"
    local why="$4"

    # Category 2 is both the "empty/blank" verdict and the fallback, so
    # start there and only refine when git data exists and no reason is set.
    local category=2

    if [[ -z "$why" ]] && (( n_git != 0 )); then
        if (( n_state > 0 && n_state == n_git && n_match == n_state )); then
            category=1
        elif (( n_git > 0 && n_match == 0 )); then
            category=4
        elif (( n_match > 0 )); then
            category=3
        fi
    fi

    echo "$category"
}
271
272# Format entry for category file
273# Args: $1=repo, $2=npub, $3=state_refs, $4=git_refs, $5=matches, $6=reason
# Render one entry as a human-readable line for a category file.
# Args: $1=repo, $2=npub, $3=state_refs, $4=git_refs, $5=matches, $6=reason
# Returns: " | "-separated line on stdout; the reason field is appended
#          only when it is non-empty
format_category_line() {
    local line="$1 | $2 | state_refs=$3 | git_refs=$4 | matches=$5"
    local why="$6"

    if [[ -n "$why" ]]; then
        line+=" | reason=$why"
    fi

    echo "$line"
}
288
289# Process a single state event
290# Args: $1=event_json, $2=git_base
291# Outputs: TSV line to stdout
# Process a single state event and report how it compares to git on disk.
# Args: $1=event_json, $2=git_base
# Outputs: one TSV line to stdout:
#          repo, npub, state_refs, git_refs, matches, reason
# Returns: 0 on success; 1 (with no output) when the event has no usable
#          repository identifier or pubkey, or the pubkey cannot be encoded
process_event() {
    local event="$1"
    local git_base="$2"

    # Extract repository identifier (first d tag with a value).
    # "// empty" stops a d tag without a value from being rendered by
    # jq -r as the literal string "null" and treated as a repo name.
    local identifier
    identifier=$(jq -r '.tags[] | select(.[0] == "d") | .[1] // empty' <<< "$event" 2>/dev/null | head -n 1 || echo "")

    if [[ -z "$identifier" ]]; then
        return 1
    fi

    # Extract maintainer pubkey (hex); a missing key yields empty, not "null"
    local hex_pubkey
    hex_pubkey=$(jq -r '.pubkey // empty' <<< "$event" 2>/dev/null || echo "")

    if [[ -z "$hex_pubkey" ]]; then
        return 1
    fi

    # Convert to npub
    local npub
    npub=$(hex_to_npub "$hex_pubkey")

    if [[ -z "$npub" ]]; then
        return 1
    fi

    # Count state refs
    local state_refs
    state_refs=$(count_state_refs "$event")

    # Repos live at <git-base>/<npub>/<repo>.git
    local git_dir="$git_base/${npub}/${identifier}.git"

    # Check git directory status
    local git_refs=0
    local matches=0
    local reason=""

    if [[ ! -d "$git_dir" ]]; then
        reason="no_git_dir"
    elif [[ ! -d "$git_dir/refs/heads" ]] && [[ ! -f "$git_dir/packed-refs" ]]; then
        reason="empty_refs"
    else
        git_refs=$(count_git_refs "$git_dir")

        if [[ "$git_refs" -eq 0 ]]; then
            reason="empty_refs"
        elif [[ "$state_refs" -eq 0 ]]; then
            reason="no_state_refs"
        else
            # Hashes are only compared when both sides have refs
            matches=$(count_matching_refs "$event" "$git_dir")
        fi
    fi

    # Output TSV line: repo, npub, state_refs, git_refs, matches, reason
    printf '%s\t%s\t%s\t%s\t%s\t%s\n' "$identifier" "$npub" "$state_refs" "$git_refs" "$matches" "$reason"
}
351
352# Main
# Entry point: compare every state event in the input file to git on disk.
# Args: $1=state-events.json, $2=git-base-dir, $3=output-dir,
#       plus an optional --categorize flag in any position
# Outputs: <output-dir>/git-sync-status.tsv and, with --categorize, the
#          four category files; log messages go through the log_* helpers
# Returns: 0 on success; exits 1 on bad arguments, missing inputs or
#          missing prerequisites
main() {
    local do_categorize=0
    local args=()
    local arg

    # Parse arguments
    for arg in "$@"; do
        if [[ "$arg" == "--categorize" ]]; then
            do_categorize=1
        else
            args+=("$arg")
        fi
    done

    if [[ ${#args[@]} -ne 3 ]]; then
        usage
    fi

    local state_events_file="${args[0]}"
    local git_base="${args[1]}"
    local output_dir="${args[2]}"

    # Validate inputs
    if [[ ! -f "$state_events_file" ]]; then
        log_error "State events file not found: $state_events_file"
        exit 1
    fi

    if [[ ! -d "$git_base" ]]; then
        log_error "Git base directory not found: $git_base"
        log_error "This script must run on the VPS with access to git directories."
        exit 1
    fi

    # Check read permissions
    if ! ls "$git_base" >/dev/null 2>&1; then
        log_error "Cannot read git base directory (permission denied): $git_base"
        log_error "Try running with sudo or grant read permissions."
        exit 1
    fi

    # Forward the original arguments so the nix-shell hint printed for a
    # missing tool reproduces the full command line.
    check_prerequisites "$@"

    log_info "=== Git State Synchronization Check ==="
    log_info "State events: $state_events_file"
    log_info "Git base: $git_base"
    log_info "Output: $output_dir"
    if [[ $do_categorize -eq 1 ]]; then
        log_info "Mode: TSV + categorization"
    else
        log_info "Mode: TSV only (use 20-categorize.sh for categories)"
    fi
    log_info "Started: $(date)"
    echo ""

    # Create output directory
    mkdir -p "$output_dir"

    # Output files
    local tsv_file="$output_dir/git-sync-status.tsv"

    # Initialize TSV with header
    printf 'repo\tnpub\tstate_refs\tgit_refs\tmatches\treason\n' > "$tsv_file"

    # Initialize (truncate) category files if categorizing
    local cat1="" cat2="" cat3="" cat4=""
    if [[ $do_categorize -eq 1 ]]; then
        cat1="$output_dir/category1-complete-match.txt"
        cat2="$output_dir/category2-empty-blank.txt"
        cat3="$output_dir/category3-partial-match.txt"
        cat4="$output_dir/category4-no-match.txt"
        : > "$cat1"
        : > "$cat2"
        : > "$cat3"
        : > "$cat4"
    fi

    # Count total events. grep -c '' also counts a final line that lacks a
    # trailing newline (wc -l does not); "|| true" because grep exits 1
    # when the count is zero.
    local total_events
    total_events=$(grep -c '' "$state_events_file" || true)
    log_info "Processing $total_events state events..."
    echo ""

    # Process each event
    local count=0
    local processed=0
    local skipped=0
    local count_cat1=0 count_cat2=0 count_cat3=0 count_cat4=0
    local start_time
    start_time=$(date +%s)

    local event result category cat_line
    local repo npub state_refs git_refs matches reason
    local elapsed rate eta

    # "|| [[ -n ... ]]" keeps a last line that has no trailing newline
    while IFS= read -r event || [[ -n "$event" ]]; do
        count=$((count + 1))

        # Skip empty lines
        [[ -z "$event" ]] && continue

        # Process event
        if result=$(process_event "$event" "$git_base"); then
            processed=$((processed + 1))

            # Append to TSV (header was written above)
            echo "$result" >> "$tsv_file"

            # Categorize if requested
            if [[ $do_categorize -eq 1 ]]; then
                # Split the TSV line back into its fields
                IFS=$'\t' read -r repo npub state_refs git_refs matches reason <<< "$result"

                category=$(categorize_entry "$state_refs" "$git_refs" "$matches" "$reason")
                cat_line=$(format_category_line "$repo" "$npub" "$state_refs" "$git_refs" "$matches" "$reason")

                case "$category" in
                    1) echo "$cat_line" >> "$cat1"; count_cat1=$((count_cat1 + 1)) ;;
                    2) echo "$cat_line" >> "$cat2"; count_cat2=$((count_cat2 + 1)) ;;
                    3) echo "$cat_line" >> "$cat3"; count_cat3=$((count_cat3 + 1)) ;;
                    4) echo "$cat_line" >> "$cat4"; count_cat4=$((count_cat4 + 1)) ;;
                esac
            fi
        else
            skipped=$((skipped + 1))
        fi

        # Progress indicator every 10 events
        if [[ $((count % 10)) -eq 0 ]]; then
            elapsed=$(($(date +%s) - start_time))
            rate=0
            eta="?"
            if [[ $elapsed -gt 0 ]]; then
                rate=$((count / elapsed))
                # Derive the ETA from elapsed time per event rather than
                # the integer rate, which truncates to 0 below 1 event/s.
                eta=$(( (total_events - count) * elapsed / count ))
            fi
            log_progress "Processed $count/$total_events events (~${rate}/s, ETA: ${eta}s)..."
        fi
    done < "$state_events_file"

    # Clear progress line
    echo "" >&2

    local end_time
    end_time=$(date +%s)
    local duration=$((end_time - start_time))

    # Summary
    echo ""
    log_info "=== Analysis Complete ==="
    log_info "Finished: $(date)"
    log_info "Duration: ${duration}s"
    log_info "Processed: $processed events"
    if [[ $skipped -gt 0 ]]; then
        log_warn "Skipped: $skipped events (missing identifier or pubkey)"
    fi
    echo ""

    if [[ $do_categorize -eq 1 ]]; then
        # Calculate percentages (values passed with -v, not interpolated)
        local total=$((count_cat1 + count_cat2 + count_cat3 + count_cat4))
        local pct1=0 pct2=0 pct3=0 pct4=0
        if [[ $total -gt 0 ]]; then
            pct1=$(awk -v c="$count_cat1" -v t="$total" 'BEGIN {printf "%.1f", (c/t)*100}')
            pct2=$(awk -v c="$count_cat2" -v t="$total" 'BEGIN {printf "%.1f", (c/t)*100}')
            pct3=$(awk -v c="$count_cat3" -v t="$total" 'BEGIN {printf "%.1f", (c/t)*100}')
            pct4=$(awk -v c="$count_cat4" -v t="$total" 'BEGIN {printf "%.1f", (c/t)*100}')
        fi

        log_info "=== Category Summary ==="
        log_success "Category 1 (Complete Match): $count_cat1 ($pct1%)"
        log_warn "Category 2 (Empty/Blank): $count_cat2 ($pct2%)"
        log_warn "Category 3 (Partial Match): $count_cat3 ($pct3%)"
        log_error "Category 4 (No Match): $count_cat4 ($pct4%)"
        echo ""

        # Validation warning
        if [[ $count_cat2 -eq $total ]] && [[ $total -gt 0 ]]; then
            log_error "WARNING: 100% of repos categorized as Empty/Blank"
            log_error "This usually indicates a permission or path issue."
            echo ""
            log_info "Troubleshooting:"
            echo " 1. Verify git data exists: sudo ls -la $git_base | head -10"
            echo " 2. Check sample repo: sudo find $git_base -name '*.git' -type d | head -1"
            echo " 3. Re-run with sudo if not already using it"
            echo ""
        fi
    fi

    log_info "Output files:"
    echo " $tsv_file"
    if [[ $do_categorize -eq 1 ]]; then
        echo " $cat1"
        echo " $cat2"
        echo " $cat3"
        echo " $cat4"
    else
        echo ""
        log_info "Next step: Run 20-categorize.sh to categorize results"
        echo " ./20-categorize.sh $tsv_file $output_dir"
    fi
}
556
557main "$@"