From 800dbfaa82428b897e271d0eb5d9e4c0f107f80b Mon Sep 17 00:00:00 2001 From: DanConwayDev Date: Fri, 23 Jan 2026 11:10:55 +0000 Subject: Add Phase 2 migration script for git sync verification - Compares state event refs to actual git data on disk - Uses git show-ref to handle both loose and packed refs - Outputs TSV format compatible with Phase 3 categorization - Optional --categorize flag for inline categorization - Includes progress indicators and ETA (~20 min runtime on VPS) - Improved error handling and validation over original script --- docs/how-to/migration-scripts/10-check-git-sync.sh | 557 +++++++++++++++++++++ 1 file changed, 557 insertions(+) create mode 100755 docs/how-to/migration-scripts/10-check-git-sync.sh (limited to 'docs/how-to/migration-scripts/10-check-git-sync.sh') diff --git a/docs/how-to/migration-scripts/10-check-git-sync.sh b/docs/how-to/migration-scripts/10-check-git-sync.sh new file mode 100755 index 0000000..493d50a --- /dev/null +++ b/docs/how-to/migration-scripts/10-check-git-sync.sh @@ -0,0 +1,557 @@ +#!/usr/bin/env bash +# +# 10-check-git-sync.sh - Compare state events to actual git data on disk +# +# PHASE 2 of the ngit-relay to ngit-grasp migration analysis pipeline. +# Compares kind 30618 state events against actual git refs on disk. 
+# +# USAGE: +# ./10-check-git-sync.sh <state-events.json> <git-base-dir> <output-dir> [--categorize] +# +# EXAMPLES: +# # Check prod relay against prod git data +# ./10-check-git-sync.sh output/prod/raw/state-events.json /var/lib/ngit-relay/git output/prod +# +# # Check archive relay against archive git data +# ./10-check-git-sync.sh output/archive/raw/state-events.json /var/lib/ngit-relay-archive/git output/archive +# +# # Check and categorize in one step (convenience mode) +# ./10-check-git-sync.sh output/prod/raw/state-events.json /var/lib/ngit-relay/git output/prod --categorize +# +# INPUT: +# state-events.json - JSONL file from Phase 1 (01-fetch-events.sh) +# One kind 30618 event per line +# git-base-dir - Base directory containing git repos +# Structure: <git-base-dir>/<npub>/<identifier>.git/ +# +# OUTPUT: +# <output-dir>/git-sync-status.tsv - Tab-separated values: +# repo<TAB>npub<TAB>state_refs<TAB>git_refs<TAB>matches<TAB>reason +# +# With --categorize flag, also outputs: +# <output-dir>/category1-complete-match.txt +# <output-dir>/category2-empty-blank.txt +# <output-dir>/category3-partial-match.txt +# <output-dir>/category4-no-match.txt +# +# CATEGORIES: +# 1. Complete Match - All refs in state event match git data perfectly +# 2. Empty/Blank - No git data available (directory missing or empty) +# 3. Partial Match - Some refs match, some don't +# 4. 
# No Match - Git data exists but commit hashes don't match
#
# PREREQUISITES:
#   - nak (for npub encoding) - https://github.com/fiatjaf/nak
#   - jq (for JSON parsing)
#   - Read access to git directories (may need sudo)
#
# RUNTIME: ~20 minutes on VPS (git operations are slow)
#
# NOTES:
#   - Must run on VPS with access to git directories
#   - Progress indicator updates every 10 events
#   - Handles packed refs (git show-ref) and loose refs
#
# SEE ALSO:
#   docs/how-to/migrate-ngit-relay-to-ngit-grasp.md - Full migration guide
#   01-fetch-events.sh - Phase 1 script that produces input for this script
#   20-categorize.sh - Phase 3a script that consumes output from this script
#

set -euo pipefail

# Colors for log output. Every log_* helper writes to stderr, so the terminal
# check is made on fd 2 (not stdout): redirecting stderr to a file yields a
# log without escape sequences, and redirecting stdout does not turn colors
# off. The values are real escape bytes ($'...'), so they can be printed with
# a plain '%s' and need no 'echo -e'.
if [[ -t 2 ]]; then
  RED=$'\033[0;31m'
  GREEN=$'\033[0;32m'
  YELLOW=$'\033[0;33m'
  BLUE=$'\033[0;34m'
  NC=$'\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BLUE=''
  NC=''
fi

# Logging helpers.
# Args: $*=message words (joined with spaces)
# Output: one tagged line on stderr. The message is printed verbatim;
#         backslash sequences inside it are NOT interpreted.
log_info() {
  printf '%s[INFO]%s %s\n' "$BLUE" "$NC" "$*" >&2
}

log_success() {
  printf '%s[OK]%s %s\n' "$GREEN" "$NC" "$*" >&2
}

log_warn() {
  printf '%s[WARN]%s %s\n' "$YELLOW" "$NC" "$*" >&2
}

log_error() {
  printf '%s[ERROR]%s %s\n' "$RED" "$NC" "$*" >&2
}

# Progress line: starts with a carriage return and has no trailing newline,
# so successive calls overwrite each other on a terminal.
log_progress() {
  printf '\r%s[PROGRESS]%s %s' "$BLUE" "$NC" "$*" >&2
}

# Print the command-line help to stdout and terminate the script.
# Args: none
# Exits: always 1
usage() {
  cat <<EOF
Usage: $0 <state-events.json> <git-base-dir> <output-dir> [--categorize]

Arguments:
 state-events.json JSONL file from Phase 1 (kind 30618 events)
 git-base-dir Base directory for git repos (e.g., /var/lib/ngit-relay/git)
 output-dir Directory to store output files
 --categorize Optional: also output category files (like Phase 3)

Examples:
 $0 output/prod/raw/state-events.json /var/lib/ngit-relay/git output/prod
 $0 output/archive/raw/state-events.json /var/lib/ngit-relay-archive/git output/archive

Output:
 git-sync-status.tsv - TSV with: repo, npub, state_refs, git_refs, matches, reason
EOF
  exit 1
}
# Check prerequisites
#
# Confirms that the external tools this script shells out to (nak, jq) are on
# PATH. Every missing tool is reported before exiting so the operator can fix
# everything in one pass.
# Args: $@=original script arguments (optional; only echoed back inside the
#       suggested nix-shell command line)
# Exits: 1 if any tool is missing
check_prerequisites() {
  local missing=0

  if ! command -v nak &>/dev/null; then
    log_error "nak not found. Install from: https://github.com/fiatjaf/nak"
    log_error "Or run: nix-shell -p nak jq --run \"$0 $*\""
    missing=1
  fi

  if ! command -v jq &>/dev/null; then
    log_error "jq not found. Install with your package manager."
    missing=1
  fi

  if [[ $missing -eq 1 ]]; then
    exit 1
  fi
}

# Convert hex pubkey to npub
# Args: $1=hex_pubkey
# Returns: npub string on stdout, or an empty line if nak fails
hex_to_npub() {
  local hex="$1"
  nak encode npub "$hex" 2>/dev/null || echo ""
}

# Count refs in state event (only tags whose name starts with refs/heads/)
# Args: $1=event_json
# Returns: count on stdout; "0" if jq cannot process the event
count_state_refs() {
  local event="$1"
  jq '[.tags[] | select(.[0] | startswith("refs/heads/"))] | length' \
    <<<"$event" 2>/dev/null || echo "0"
}

# Count branch refs present on disk
# Args: $1=git_dir
# Returns: exactly one integer on stdout (number of refs/heads/ refs)
count_git_refs() {
  local git_dir="$1"
  local listing

  if [[ ! -d "$git_dir" ]]; then
    echo "0"
    return
  fi

  # git show-ref sees both packed and loose refs. Its output is captured
  # BEFORE counting: piping it straight into wc would print a count even when
  # git fails (pipefail only changes the status, not what was already
  # written), and the fallback below would then print a second number.
  if listing=$(git --git-dir="$git_dir" show-ref --heads 2>/dev/null) &&
    [[ -n "$listing" ]]; then
    # The here-string adds the final newline, so wc -l equals the ref count.
    wc -l <<<"$listing" | tr -d ' '
    return
  fi

  # Fallback (git failed or listed nothing): count loose ref files only.
  if [[ -d "$git_dir/refs/heads" ]]; then
    find "$git_dir/refs/heads" -type f 2>/dev/null | wc -l | tr -d ' '
  else
    echo "0"
  fi
}

# Get ref hash from git directory
# Args: $1=git_dir, $2=ref_path (e.g., refs/heads/main)
# Returns: hash on stdout, or an empty line if the ref cannot be resolved
get_git_ref_hash() {
  local git_dir="$1"
  local ref_path="$2"
  local hash

  # --verify demands an exact ref path instead of show-ref's default
  # match-from-the-end pattern semantics, so only this very ref can answer.
  hash=$(git --git-dir="$git_dir" show-ref --verify --hash -- "$ref_path" \
    2>/dev/null) || hash=""

  if [[ -n "$hash" ]]; then
    echo "$hash"
    return
  fi

  # ref_path comes from event data. Git refnames can never contain "..", so
  # refuse such paths rather than let the file fallback leave git_dir.
  if [[ "$ref_path" == *..* ]]; then
    echo ""
    return
  fi

  # Fallback: read the loose ref file directly.
  local ref_file="$git_dir/$ref_path"
  if [[ -f "$ref_file" ]]; then
    # cat (not a plain '<' redirect) so an unreadable file stays silent and
    # falls through to the empty result.
    cat "$ref_file" 2>/dev/null | tr -d '\n' || echo ""
  else
    echo ""
  fi
}

# Compare state event refs to git refs
# Args: $1=event_json, $2=git_dir
# Returns: count of refs/heads/ tags whose hash equals the hash on disk
count_matching_refs() {
  local event="$1"
  local git_dir="$2"
  local matching=0
  local ref_path expected_hash actual_hash

  # A single jq call emits "<ref>TAB<hash>" per branch tag (a missing hash
  # becomes empty) instead of spawning jq twice for every ref.
  while IFS=$'\t' read -r ref_path expected_hash; do
    # Skip if not a heads ref or hash is missing
    [[ "$ref_path" == refs/heads/* ]] || continue
    [[ -n "$expected_hash" && "$expected_hash" != "null" ]] || continue

    actual_hash=$(get_git_ref_hash "$git_dir" "$ref_path")

    if [[ "$expected_hash" == "$actual_hash" ]]; then
      matching=$((matching + 1))
    fi
  done < <(jq -r '.tags[]
      | select(.[0] | startswith("refs/heads/"))
      | "\(.[0])\t\(.[1] // "")"' <<<"$event" 2>/dev/null)

  echo "$matching"
}

# Categorize a single entry
# Args: $1=state_refs, $2=git_refs, $3=matches, $4=reason
# Returns: category number (1-4) on stdout
#   1 = complete match, 2 = empty/blank, 3 = partial match, 4 = no match
categorize_entry() {
  local state_refs="$1"
  local git_refs="$2"
  local matches="$3"
  local reason="$4"

  # Category 2: Empty/Blank (any recorded reason, or no refs on disk)
  if [[ -n "$reason" ]] || [[ "$git_refs" -eq 0 ]]; then
    echo "2"
    return
  fi

  # Category 1: Complete Match (same number of refs and every one matches)
  if [[ "$state_refs" -gt 0 ]] &&
    [[ "$state_refs" -eq "$git_refs" ]] &&
    [[ "$matches" -eq "$state_refs" ]]; then
    echo "1"
    return
  fi

  # Category 4: No Match
  if [[ "$git_refs" -gt 0 ]] && [[ "$matches" -eq 0 ]]; then
    echo "4"
    return
  fi

  # Category 3: Partial Match (anything else with matches > 0)
  if [[ "$matches" -gt 0 ]]; then
    echo "3"
    return
  fi

  # Fallback to category 2
  echo "2"
}

# Format entry for category file
# Args: $1=repo, $2=npub, $3=state_refs, $4=git_refs, $5=matches, $6=reason
# Returns: one " | "-separated line; reason is appended only when non-empty
format_category_line() {
  local repo="$1"
  local npub="$2"
  local state_refs="$3"
  local git_refs="$4"
  local matches="$5"
  local reason="$6"

  if [[ -n "$reason" ]]; then
    echo "$repo | $npub | state_refs=$state_refs | git_refs=$git_refs | matches=$matches | reason=$reason"
  else
    echo "$repo | $npub | state_refs=$state_refs | git_refs=$git_refs | matches=$matches"
  fi
}

# Format a ratio as a percentage with one decimal
# Args: $1=part, $2=total
# Returns: e.g. "33.3" on stdout; "0" when total is not positive
percent_of() {
  local part="$1"
  local total="$2"

  if [[ "$total" -le 0 ]]; then
    echo "0"
    return
  fi
  # Values are handed to awk with -v rather than spliced into the program.
  awk -v p="$part" -v t="$total" 'BEGIN { printf "%.1f", (p / t) * 100 }'
}

# Process a single state event
# Args: $1=event_json, $2=git_base
# Outputs: TSV line to stdout: repo, npub, state_refs, git_refs, matches, reason
#   reason is one of: "" (refs were compared), no_git_dir, empty_refs,
#   no_state_refs
# Returns: 1 (printing nothing) if the identifier, pubkey or npub is missing
process_event() {
  local event="$1"
  local git_base="$2"

  # Extract repository identifier (first d tag)
  local identifier
  identifier=$(jq -r '.tags[] | select(.[0] == "d") | .[1]' <<<"$event" \
    2>/dev/null | head -1 || echo "")

  if [[ -z "$identifier" ]]; then
    return 1
  fi

  # Extract maintainer pubkey (hex). "// empty" makes a missing or null
  # pubkey come back empty instead of the literal string "null".
  local hex_pubkey
  hex_pubkey=$(jq -r '.pubkey // empty' <<<"$event" 2>/dev/null || echo "")

  if [[ -z "$hex_pubkey" ]]; then
    return 1
  fi

  # Convert to npub
  local npub
  npub=$(hex_to_npub "$hex_pubkey")

  if [[ -z "$npub" ]]; then
    return 1
  fi

  # Count state refs
  local state_refs
  state_refs=$(count_state_refs "$event")

  # Find git directory
  local git_dir="$git_base/${npub}/${identifier}.git"

  # Check git directory status
  local git_refs=0
  local matches=0
  local reason=""

  if [[ ! -d "$git_dir" ]]; then
    reason="no_git_dir"
  elif [[ ! -d "$git_dir/refs/heads" ]] && [[ ! -f "$git_dir/packed-refs" ]]; then
    reason="empty_refs"
  else
    git_refs=$(count_git_refs "$git_dir")

    if [[ "$git_refs" -eq 0 ]]; then
      reason="empty_refs"
    elif [[ "$state_refs" -eq 0 ]]; then
      reason="no_state_refs"
    else
      matches=$(count_matching_refs "$event" "$git_dir")
    fi
  fi

  printf '%s\t%s\t%s\t%s\t%s\t%s\n' \
    "$identifier" "$npub" "$state_refs" "$git_refs" "$matches" "$reason"
}

# Main
#
# Parses the command line, validates the inputs, runs process_event on every
# line of the state-events file and writes git-sync-status.tsv (plus the four
# category files when --categorize is given).
# Args: <state-events.json> <git-base-dir> <output-dir> [--categorize]
# Exits: 1 on bad usage, missing inputs or missing prerequisites
main() {
  local do_categorize=0
  local args=()
  local arg

  # Parse arguments: --categorize may appear anywhere; the rest is positional
  for arg in "$@"; do
    if [[ "$arg" == "--categorize" ]]; then
      do_categorize=1
    else
      args+=("$arg")
    fi
  done

  if [[ ${#args[@]} -ne 3 ]]; then
    usage
  fi

  local state_events_file="${args[0]}"
  local git_base="${args[1]}"
  local output_dir="${args[2]}"

  # Validate inputs
  if [[ ! -f "$state_events_file" ]]; then
    log_error "State events file not found: $state_events_file"
    exit 1
  fi

  if [[ ! -d "$git_base" ]]; then
    log_error "Git base directory not found: $git_base"
    log_error "This script must run on the VPS with access to git directories."
    exit 1
  fi

  # Check read permissions
  if ! ls "$git_base" >/dev/null 2>&1; then
    log_error "Cannot read git base directory (permission denied): $git_base"
    log_error "Try running with sudo or grant read permissions."
    exit 1
  fi

  # Forward the arguments so the nix-shell hint can repeat the command line.
  check_prerequisites "$@"

  log_info "=== Git State Synchronization Check ==="
  log_info "State events: $state_events_file"
  log_info "Git base: $git_base"
  log_info "Output: $output_dir"
  if [[ $do_categorize -eq 1 ]]; then
    log_info "Mode: TSV + categorization"
  else
    log_info "Mode: TSV only (use 20-categorize.sh for categories)"
  fi
  log_info "Started: $(date)"
  echo ""

  # Create output directory
  mkdir -p -- "$output_dir"

  # Output files
  local tsv_file="$output_dir/git-sync-status.tsv"

  # Initialize TSV with header
  printf 'repo\tnpub\tstate_refs\tgit_refs\tmatches\treason\n' >"$tsv_file"

  # Create/truncate category files if categorizing
  local cat1="" cat2="" cat3="" cat4=""
  if [[ $do_categorize -eq 1 ]]; then
    cat1="$output_dir/category1-complete-match.txt"
    cat2="$output_dir/category2-empty-blank.txt"
    cat3="$output_dir/category3-partial-match.txt"
    cat4="$output_dir/category4-no-match.txt"
    : >"$cat1"
    : >"$cat2"
    : >"$cat3"
    : >"$cat4"
  fi

  # Count total lines. grep -c '' also counts a final line that lacks a
  # trailing newline (wc -l would not); it exits 1 on an empty file.
  local total_events
  total_events=$(grep -c '' -- "$state_events_file" || true)
  log_info "Processing $total_events state events..."
  echo ""

  # Process each event
  local count=0
  local processed=0
  local skipped=0
  local count_cat1=0 count_cat2=0 count_cat3=0 count_cat4=0
  local start_time
  start_time=$(date +%s)

  local event result category cat_line
  local repo npub state_refs git_refs matches reason
  local elapsed rate eta

  # "|| [[ -n ... ]]" keeps a last line that has no trailing newline.
  while IFS= read -r event || [[ -n "$event" ]]; do
    count=$((count + 1))

    # Skip empty lines
    if [[ -z "$event" ]]; then
      continue
    fi

    if result=$(process_event "$event" "$git_base"); then
      processed=$((processed + 1))

      # Append the row below the header
      printf '%s\n' "$result" >>"$tsv_file"

      # Categorize if requested
      if [[ $do_categorize -eq 1 ]]; then
        IFS=$'\t' read -r repo npub state_refs git_refs matches reason <<<"$result"

        category=$(categorize_entry "$state_refs" "$git_refs" "$matches" "$reason")
        cat_line=$(format_category_line "$repo" "$npub" "$state_refs" "$git_refs" "$matches" "$reason")

        case "$category" in
          1)
            printf '%s\n' "$cat_line" >>"$cat1"
            count_cat1=$((count_cat1 + 1))
            ;;
          2)
            printf '%s\n' "$cat_line" >>"$cat2"
            count_cat2=$((count_cat2 + 1))
            ;;
          3)
            printf '%s\n' "$cat_line" >>"$cat3"
            count_cat3=$((count_cat3 + 1))
            ;;
          4)
            printf '%s\n' "$cat_line" >>"$cat4"
            count_cat4=$((count_cat4 + 1))
            ;;
        esac
      fi
    else
      skipped=$((skipped + 1))
    fi

    # Progress indicator every 10 events
    if ((count % 10 == 0)); then
      elapsed=$(($(date +%s) - start_time))
      rate=0
      eta="?"
      if ((elapsed > 0)); then
        rate=$((count / elapsed))
        # Derived from elapsed time per event, so an estimate is still shown
        # when throughput is below one event per second (integer rate == 0).
        eta=$(((total_events - count) * elapsed / count))
      fi
      log_progress "Processed $count/$total_events events (~${rate}/s, ETA: ${eta}s)..."
    fi
  done <"$state_events_file"

  # Clear progress line
  echo "" >&2

  local end_time
  end_time=$(date +%s)
  local duration=$((end_time - start_time))

  # Summary
  echo ""
  log_info "=== Analysis Complete ==="
  log_info "Finished: $(date)"
  log_info "Duration: ${duration}s"
  log_info "Processed: $processed events"
  if [[ $skipped -gt 0 ]]; then
    log_warn "Skipped: $skipped events (missing identifier or pubkey)"
  fi
  echo ""

  if [[ $do_categorize -eq 1 ]]; then
    # Calculate percentages
    local total=$((count_cat1 + count_cat2 + count_cat3 + count_cat4))
    local pct1 pct2 pct3 pct4
    pct1=$(percent_of "$count_cat1" "$total")
    pct2=$(percent_of "$count_cat2" "$total")
    pct3=$(percent_of "$count_cat3" "$total")
    pct4=$(percent_of "$count_cat4" "$total")

    log_info "=== Category Summary ==="
    log_success "Category 1 (Complete Match): $count_cat1 ($pct1%)"
    log_warn "Category 2 (Empty/Blank): $count_cat2 ($pct2%)"
    log_warn "Category 3 (Partial Match): $count_cat3 ($pct3%)"
    log_error "Category 4 (No Match): $count_cat4 ($pct4%)"
    echo ""

    # Validation warning
    if [[ $count_cat2 -eq $total ]] && [[ $total -gt 0 ]]; then
      log_error "WARNING: 100% of repos categorized as Empty/Blank"
      log_error "This usually indicates a permission or path issue."
      echo ""
      log_info "Troubleshooting:"
      echo " 1. Verify git data exists: sudo ls -la $git_base | head -10"
      echo " 2. Check sample repo: sudo find $git_base -name '*.git' -type d | head -1"
      echo " 3. Re-run with sudo if not already using it"
      echo ""
    fi
  fi

  log_info "Output files:"
  echo " $tsv_file"
  if [[ $do_categorize -eq 1 ]]; then
    echo " $cat1"
    echo " $cat2"
    echo " $cat3"
    echo " $cat4"
  else
    echo ""
    log_info "Next step: Run 20-categorize.sh to categorize results"
    echo " ./20-categorize.sh $tsv_file $output_dir"
  fi
}

main "$@"