From 92a9a3bfe0bc522e8ae411991a366a3a6310d525 Mon Sep 17 00:00:00 2001 From: DanConwayDev Date: Tue, 3 Feb 2026 14:41:46 +0000 Subject: docs: archive relay.ngit.dev migration materials for reference Move migration guide and scripts to docs/archive/2026-01-relay-ngit-dev-migration/ with clear warnings that these are reference-only materials from a specific migration context, not general-purpose tools. These materials document the relay.ngit.dev migration from ngit-relay to ngit-grasp in January 2026. The scripts were developed iteratively during the migration and are specific to that context. They are preserved for: - Historical reference - Context for production fixes in this branch - Inspiration for future migrations (not direct reuse) The migration uncovered critical bugs now fixed in this branch: - Git protocol error handling - Naughty list false positives - Purgatory event tracking - Sync startup issues - Configuration management --- .../scripts/01-fetch-events.sh | 206 ++++++ .../scripts/10-check-git-sync.sh | 564 +++++++++++++++ .../scripts/20-categorize.sh | 212 ++++++ .../scripts/21-compare-relays.sh | 294 ++++++++ .../scripts/22-compare-git-data.sh | 390 +++++++++++ .../scripts/30-extract-parse-failures.sh | 774 ++++++++++++++++++++ .../scripts/31-extract-purgatory-expiry.sh | 408 +++++++++++ .../scripts/40-classify-actions.sh | 662 +++++++++++++++++ .../scripts/run-migration-analysis.sh | 779 +++++++++++++++++++++ .../scripts/validate-service.sh | 151 ++++ 10 files changed, 4440 insertions(+) create mode 100755 docs/archive/2026-01-relay-ngit-dev-migration/scripts/01-fetch-events.sh create mode 100755 docs/archive/2026-01-relay-ngit-dev-migration/scripts/10-check-git-sync.sh create mode 100755 docs/archive/2026-01-relay-ngit-dev-migration/scripts/20-categorize.sh create mode 100755 docs/archive/2026-01-relay-ngit-dev-migration/scripts/21-compare-relays.sh create mode 100755 docs/archive/2026-01-relay-ngit-dev-migration/scripts/22-compare-git-data.sh 
create mode 100755 docs/archive/2026-01-relay-ngit-dev-migration/scripts/30-extract-parse-failures.sh
create mode 100755 docs/archive/2026-01-relay-ngit-dev-migration/scripts/31-extract-purgatory-expiry.sh
create mode 100755 docs/archive/2026-01-relay-ngit-dev-migration/scripts/40-classify-actions.sh
create mode 100755 docs/archive/2026-01-relay-ngit-dev-migration/scripts/run-migration-analysis.sh
create mode 100755 docs/archive/2026-01-relay-ngit-dev-migration/scripts/validate-service.sh
(limited to 'docs/archive/2026-01-relay-ngit-dev-migration/scripts')

diff --git a/docs/archive/2026-01-relay-ngit-dev-migration/scripts/01-fetch-events.sh b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/01-fetch-events.sh
new file mode 100755
index 0000000..e0d6f26
--- /dev/null
+++ b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/01-fetch-events.sh
@@ -0,0 +1,206 @@
+#!/usr/bin/env bash
+#
+# 01-fetch-events.sh - Fetch nostr events from a relay for migration analysis
+#
+# PHASE 1 of the GRASP relay to ngit-grasp migration analysis pipeline.
+# Fetches kind 30618 (state), 30617 (announcement), and 5 (deletion) events.
+#
+# USAGE:
+#   ./01-fetch-events.sh <relay-url> <output-dir>
+#
+# EXAMPLES:
+#   # Fetch from production relay
+#   ./01-fetch-events.sh wss://relay.ngit.dev output/prod
+#
+#   # Fetch from archive relay
+#   ./01-fetch-events.sh wss://archive.relay.ngit.dev output/archive
+#
+#   # Full migration analysis setup
+#   mkdir -p work/migration-analysis-$(date +%Y%m%d-%H%M)
+#   ./01-fetch-events.sh wss://relay.ngit.dev work/migration-analysis-*/prod
+#   ./01-fetch-events.sh wss://archive.relay.ngit.dev work/migration-analysis-*/archive
+#
+# OUTPUT:
+#   <output-dir>/raw/state-events.json - kind 30618 events (one per line, JSONL)
+#   <output-dir>/raw/announcements.json - kind 30617 events (one per line, JSONL)
+#   <output-dir>/raw/deletions.json - kind 5 events (one per line, JSONL)
+#
+# OUTPUT FORMAT:
+#   Each file contains one JSON event per line (JSONL format).
+#   Events are the raw nostr event objects as returned by the relay.
+#
+# PREREQUISITES:
+#   - nak (Nostr Army Knife) - https://github.com/fiatjaf/nak
+#   - jq (for counting/validation)
+#
+# RUNTIME: ~30 seconds per relay (depends on network and event count)
+#
+# NOTES:
+#   - Uses --paginate to ensure all events are fetched (not just first page)
+#   - If event counts are exact multiples of 250, pagination may have failed
+#   - Run Phase 1 and Phase 2 back-to-back for accurate snapshot
+#
+# SEE ALSO:
+#   docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
+#
+
+set -euo pipefail
+
+# Colors for log output. Every log_* helper below writes to stderr, so the
+# terminal check is made on fd 2 rather than fd 1: redirecting stdout must not
+# strip colors from an interactive stderr, and "2> file" must not capture
+# escape sequences.
+if [[ -t 2 ]]; then
+  RED='\033[0;31m'
+  GREEN='\033[0;32m'
+  YELLOW='\033[0;33m'
+  BLUE='\033[0;34m'
+  NC='\033[0m' # No Color
+else
+  RED=''
+  GREEN=''
+  YELLOW=''
+  BLUE=''
+  NC=''
+fi
+
+# Logging helpers: each prints a tagged message to stderr so that stdout stays
+# free for data (fetch_kind returns its event count on stdout).
+log_info() {
+  echo -e "${BLUE}[INFO]${NC} $*" >&2
+}
+
+log_success() {
+  echo -e "${GREEN}[OK]${NC} $*" >&2
+}
+
+log_warn() {
+  echo -e "${YELLOW}[WARN]${NC} $*" >&2
+}
+
+log_error() {
+  echo -e "${RED}[ERROR]${NC} $*" >&2
+}
+
+# Print the command-line synopsis to stdout and exit with status 1.
+usage() {
+  echo "Usage: $0 <relay-url> <output-dir>"
+  echo ""
+  echo "Arguments:"
+  echo " relay-url WebSocket URL of the relay (e.g., wss://relay.ngit.dev)"
+  echo " output-dir Directory to store fetched events (e.g., output/prod)"
+  echo ""
+  echo "Examples:"
+  echo " $0 wss://relay.ngit.dev output/prod"
+  echo " $0 wss://archive.relay.ngit.dev output/archive"
+  exit 1
+}
+
+# Check prerequisites: log every missing tool (nak, jq), then exit 1 if any
+# of them is absent.
+check_prerequisites() {
+  local missing=0
+
+  if ! command -v nak &> /dev/null; then
+    log_error "nak not found. Install from: https://github.com/fiatjaf/nak"
+    missing=1
+  fi
+
+  if ! command -v jq &> /dev/null; then
+    log_error "jq not found. Install with your package manager."
+    missing=1
+  fi
+
+  if [[ $missing -eq 1 ]]; then
+    exit 1
+  fi
+}
+
+# Fetch events of a specific kind into a JSONL file
+# Args: $1=relay, $2=kind, $3=output_file, $4=description
+# Outputs: the event count on stdout (all log lines go to stderr, so
+#          $(fetch_kind ...) captures only the count)
+# Returns: 1 if nak exits non-zero
+fetch_kind() {
+  local relay="$1"
+  local kind="$2"
+  local output_file="$3"
+  local description="$4"
+
+  log_info "Fetching $description (kind $kind) from $relay..."
+
+  local start_time
+  start_time=$(date +%s)
+
+  # Use --paginate to ensure we get all events, not just first page
+  # nak outputs one event per line (JSONL format)
+  if ! nak req -k "$kind" --paginate "$relay" > "$output_file" 2>/dev/null; then
+    log_error "Failed to fetch $description from $relay"
+    return 1
+  fi
+
+  local end_time
+  end_time=$(date +%s)
+  local duration=$((end_time - start_time))
+
+  # Count events (one per line)
+  local count
+  count=$(wc -l < "$output_file" | tr -d ' ')
+
+  # Warn if count is suspicious (exact multiple of 250 suggests pagination issue)
+  if [[ $count -gt 0 ]] && [[ $((count % 250)) -eq 0 ]]; then
+    log_warn "$description count ($count) is exact multiple of 250 - pagination may have failed!"
+  fi
+
+  log_success "Fetched $count $description in ${duration}s -> $output_file"
+
+  echo "$count"
+}
+
+# Main: validate arguments, fetch kinds 30618, 30617 and 5 into
+# <output-dir>/raw, then print a summary.
+main() {
+  if [[ $# -ne 2 ]]; then
+    usage
+  fi
+
+  local relay="$1"
+  local output_dir="$2"
+
+  # Validate relay URL
+  if [[ ! "$relay" =~ ^wss?:// ]]; then
+    log_error "Invalid relay URL: $relay (must start with ws:// or wss://)"
+    exit 1
+  fi
+
+  check_prerequisites
+
+  log_info "Starting event fetch from $relay"
+  log_info "Output directory: $output_dir"
+
+  # Create output directory structure
+  local raw_dir="$output_dir/raw"
+  mkdir -p "$raw_dir"
+
+  local total_start
+  total_start=$(date +%s)
+
+  # Fetch each event type. Declaration is kept separate from assignment so a
+  # failing fetch_kind is not masked by "local" and aborts the script (set -e).
+  local state_count announcement_count deletion_count
+
+  state_count=$(fetch_kind "$relay" 30618 "$raw_dir/state-events.json" "state events")
+  announcement_count=$(fetch_kind "$relay" 30617 "$raw_dir/announcements.json" "announcements")
+  deletion_count=$(fetch_kind "$relay" 5 "$raw_dir/deletions.json" "deletion requests")
+
+  local total_end
+  total_end=$(date +%s)
+  local total_duration=$((total_end - total_start))
+
+  # Summary
+  echo ""
+  log_info "=== Fetch Summary ==="
+  log_info "Relay: $relay"
+  log_info "Output: $output_dir"
+  log_info "State events (30618): $state_count"
+  log_info "Announcements (30617): $announcement_count"
+  log_info "Deletions (5): $deletion_count"
+  log_info "Total time: ${total_duration}s"
+  echo ""
+
+  # Output file listing for easy copy/paste
+  log_info "Output files:"
+  echo " $raw_dir/state-events.json"
+  echo " $raw_dir/announcements.json"
+  echo " $raw_dir/deletions.json"
+}
+
+main "$@"
diff --git a/docs/archive/2026-01-relay-ngit-dev-migration/scripts/10-check-git-sync.sh b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/10-check-git-sync.sh
new file mode 100755
index 0000000..b4536cb
--- /dev/null
+++ b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/10-check-git-sync.sh
@@ -0,0 +1,564 @@
+#!/usr/bin/env bash
+#
+# 10-check-git-sync.sh - Compare state events to actual git data on disk
+#
+# PHASE 2 of the GRASP relay to ngit-grasp migration analysis pipeline.
+# Compares kind 30618 state events against actual git refs on disk.
+#
+# USAGE:
+#   ./10-check-git-sync.sh <state-events.json> <git-base-dir> <output-dir> [--categorize]
+#
+# EXAMPLES:
+#   # Check source relay against source git data
+#   ./10-check-git-sync.sh output/prod/raw/state-events.json /var/lib/grasp-relay/git output/prod
+#
+#   # Check target relay against target git data
+#   ./10-check-git-sync.sh output/archive/raw/state-events.json /var/lib/ngit-grasp/git output/archive
+#
+#   # Check and categorize in one step (convenience mode)
+#   ./10-check-git-sync.sh output/prod/raw/state-events.json /var/lib/grasp-relay/git output/prod --categorize
+#
+# INPUT:
+#   state-events.json - JSONL file from Phase 1 (01-fetch-events.sh)
+#                       One kind 30618 event per line
+#   git-base-dir      - Base directory containing git repos
+#                       Structure: <git-base-dir>/<npub>/<identifier>.git/
+#
+# OUTPUT:
+#   <output-dir>/git-sync-status.tsv - Tab-separated values:
+#     repo<TAB>npub<TAB>state_refs<TAB>git_refs<TAB>matches<TAB>reason
+#
+#   With --categorize flag, also outputs:
+#   <output-dir>/category1-complete-match.txt
+#   <output-dir>/category2-empty-blank.txt
+#   <output-dir>/category3-partial-match.txt
+#   <output-dir>/category4-no-match.txt
+#
+# CATEGORIES:
+#   1. Complete Match - All refs in state event match git data perfectly
+#   2. Empty/Blank - No git data available (directory missing or empty)
+#   3. Partial Match - Some refs match, some don't
+#   4. 
No Match - Git data exists but commit hashes don't match
+#
+# PREREQUISITES:
+#   - nak (for npub encoding) - https://github.com/fiatjaf/nak
+#   - jq (for JSON parsing)
+#   - Read access to git directories (may need sudo)
+#
+# RUNTIME: ~20 minutes on VPS (git operations are slow)
+#
+# NOTES:
+#   - Must run on VPS with access to git directories
+#   - Progress indicator updates every 10 events
+#   - Handles packed refs (git show-ref) and loose refs
+#
+# SEE ALSO:
+#   docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
+#   01-fetch-events.sh - Phase 1 script that produces input for this script
+#   20-categorize.sh - Phase 3a script that consumes output from this script
+#
+
+set -euo pipefail
+
+# Colors for log output. Every log_* helper below writes to stderr, so the
+# terminal check is made on fd 2 rather than fd 1: redirecting the TSV/stdout
+# must not strip colors from an interactive stderr, and "2> file" must not
+# capture escape sequences.
+if [[ -t 2 ]]; then
+  RED='\033[0;31m'
+  GREEN='\033[0;32m'
+  YELLOW='\033[0;33m'
+  BLUE='\033[0;34m'
+  NC='\033[0m'
+else
+  RED=''
+  GREEN=''
+  YELLOW=''
+  BLUE=''
+  NC=''
+fi
+
+# Logging helpers: each prints a tagged message to stderr.
+log_info() {
+  echo -e "${BLUE}[INFO]${NC} $*" >&2
+}
+
+log_success() {
+  echo -e "${GREEN}[OK]${NC} $*" >&2
+}
+
+log_warn() {
+  echo -e "${YELLOW}[WARN]${NC} $*" >&2
+}
+
+log_error() {
+  echo -e "${RED}[ERROR]${NC} $*" >&2
+}
+
+log_progress() {
+  # Overwrite current line for progress updates (leading \r, no trailing newline)
+  echo -ne "\r${BLUE}[PROGRESS]${NC} $*" >&2
+}
+
+# Print the command-line synopsis to stdout and exit with status 1.
+usage() {
+  echo "Usage: $0 <state-events.json> <git-base-dir> <output-dir> [--categorize]"
+  echo ""
+  echo "Arguments:"
+  echo " state-events.json JSONL file from Phase 1 (kind 30618 events)"
+  echo " git-base-dir Base directory for git repos (e.g., /var/lib/grasp-relay/git)"
+  echo " output-dir Directory to store output files"
+  echo " --categorize Optional: also output category files (like Phase 3)"
+  echo ""
+  echo "Examples:"
+  echo " $0 output/prod/raw/state-events.json /var/lib/grasp-relay/git output/prod"
+  echo " $0 output/archive/raw/state-events.json /var/lib/ngit-grasp/git output/archive"
+  echo ""
+  echo "Output:"
+  echo " git-sync-status.tsv - TSV with: repo, npub, state_refs, git_refs, matches, reason"
+  exit 1
+}
+
+# Check prerequisites
+check_prerequisites() { + local missing=0 + + if ! command -v git &> /dev/null; then + log_error "git not found. Install with your package manager." + missing=1 + fi + + if ! command -v nak &> /dev/null; then + log_error "nak not found. Install from: https://github.com/fiatjaf/nak" + log_error "Or run: nix-shell -p nak jq --run \"$0 $*\"" + missing=1 + fi + + if ! command -v jq &> /dev/null; then + log_error "jq not found. Install with your package manager." + missing=1 + fi + + if [[ $missing -eq 1 ]]; then + exit 1 + fi +} + +# Convert hex pubkey to npub +# Args: $1=hex_pubkey +# Returns: npub string or empty on error +hex_to_npub() { + local hex="$1" + nak encode npub "$hex" 2>/dev/null || echo "" +} + +# Count refs in state event (only refs/heads/) +# Args: $1=event_json +# Returns: count +count_state_refs() { + local event="$1" + echo "$event" | jq '[.tags[] | select(.[0] | startswith("refs/heads/"))] | length' 2>/dev/null || echo "0" +} + +# Get git refs from disk +# Args: $1=git_dir +# Returns: count of refs/heads/ refs +count_git_refs() { + local git_dir="$1" + + if [[ ! 
-d "$git_dir" ]]; then + echo "0" + return + fi + + # Try git show-ref first (handles packed refs correctly) + # Note: We capture output separately to avoid pipefail issues + local count + if count=$(git --git-dir="$git_dir" show-ref --heads 2>/dev/null | wc -l); then + echo "$count" | tr -d ' ' + return + fi + + # Fallback: count loose refs (when git is not available or fails) + if [[ -d "$git_dir/refs/heads" ]]; then + find "$git_dir/refs/heads" -type f 2>/dev/null | wc -l | tr -d ' ' + else + echo "0" + fi +} + +# Get ref hash from git directory +# Args: $1=git_dir, $2=ref_path (e.g., refs/heads/main) +# Returns: commit hash or empty +get_git_ref_hash() { + local git_dir="$1" + local ref_path="$2" + + # Try git show-ref first (handles packed refs) + local hash + hash=$(git --git-dir="$git_dir" show-ref --hash "$ref_path" 2>/dev/null | head -1 || echo "") + + if [[ -n "$hash" ]]; then + echo "$hash" + return + fi + + # Fallback: read loose ref file + local ref_file="$git_dir/$ref_path" + if [[ -f "$ref_file" ]]; then + cat "$ref_file" 2>/dev/null | tr -d '\n' || echo "" + else + echo "" + fi +} + +# Compare state event refs to git refs +# Args: $1=event_json, $2=git_dir +# Returns: count of matching refs +count_matching_refs() { + local event="$1" + local git_dir="$2" + local matching=0 + + # Extract refs/heads/ tags and compare + while IFS= read -r ref_tag; do + [[ -z "$ref_tag" ]] && continue + + local ref_path expected_hash + ref_path=$(echo "$ref_tag" | jq -r '.[0]' 2>/dev/null || echo "") + expected_hash=$(echo "$ref_tag" | jq -r '.[1]' 2>/dev/null || echo "") + + # Skip if not a heads ref or hash is missing + [[ ! 
"$ref_path" =~ ^refs/heads/ ]] && continue + [[ -z "$expected_hash" || "$expected_hash" == "null" ]] && continue + + # Get actual hash from git + local actual_hash + actual_hash=$(get_git_ref_hash "$git_dir" "$ref_path") + + if [[ "$expected_hash" == "$actual_hash" ]]; then + matching=$((matching + 1)) + fi + done < <(echo "$event" | jq -c '.tags[] | select(.[0] | startswith("refs/heads/"))' 2>/dev/null) + + echo "$matching" +} + +# Categorize a single entry +# Args: $1=state_refs, $2=git_refs, $3=matches, $4=reason +# Returns: category number (1-4) +categorize_entry() { + local state_refs="$1" + local git_refs="$2" + local matches="$3" + local reason="$4" + + # Category 2: Empty/Blank + if [[ -n "$reason" ]] || [[ "$git_refs" -eq 0 ]]; then + echo "2" + return + fi + + # Category 1: Complete Match + if [[ "$state_refs" -gt 0 ]] && [[ "$state_refs" -eq "$git_refs" ]] && [[ "$matches" -eq "$state_refs" ]]; then + echo "1" + return + fi + + # Category 4: No Match + if [[ "$git_refs" -gt 0 ]] && [[ "$matches" -eq 0 ]]; then + echo "4" + return + fi + + # Category 3: Partial Match (default for anything else with matches > 0) + if [[ "$matches" -gt 0 ]]; then + echo "3" + return + fi + + # Fallback to category 2 + echo "2" +} + +# Format entry for category file +# Args: $1=repo, $2=npub, $3=state_refs, $4=git_refs, $5=matches, $6=reason +format_category_line() { + local repo="$1" + local npub="$2" + local state_refs="$3" + local git_refs="$4" + local matches="$5" + local reason="$6" + + if [[ -n "$reason" ]]; then + echo "$repo | $npub | state_refs=$state_refs | git_refs=$git_refs | matches=$matches | reason=$reason" + else + echo "$repo | $npub | state_refs=$state_refs | git_refs=$git_refs | matches=$matches" + fi +} + +# Process a single state event +# Args: $1=event_json, $2=git_base +# Outputs: TSV line to stdout +process_event() { + local event="$1" + local git_base="$2" + + # Extract repository identifier (d tag) + local identifier + identifier=$(echo "$event" | 
jq -r '.tags[] | select(.[0] == "d") | .[1]' 2>/dev/null | head -1 || echo "") + + if [[ -z "$identifier" ]]; then + return 1 + fi + + # Extract maintainer pubkey (hex) + local hex_pubkey + hex_pubkey=$(echo "$event" | jq -r '.pubkey' 2>/dev/null || echo "") + + if [[ -z "$hex_pubkey" ]]; then + return 1 + fi + + # Convert to npub + local npub + npub=$(hex_to_npub "$hex_pubkey") + + if [[ -z "$npub" ]]; then + return 1 + fi + + # Count state refs + local state_refs + state_refs=$(count_state_refs "$event") + + # Find git directory + local git_dir="$git_base/${npub}/${identifier}.git" + + # Check git directory status + local git_refs=0 + local matches=0 + local reason="" + + if [[ ! -d "$git_dir" ]]; then + reason="no_git_dir" + elif [[ ! -d "$git_dir/refs/heads" ]] && [[ ! -f "$git_dir/packed-refs" ]]; then + reason="empty_refs" + else + git_refs=$(count_git_refs "$git_dir") + + if [[ "$git_refs" -eq 0 ]]; then + reason="empty_refs" + elif [[ "$state_refs" -eq 0 ]]; then + reason="no_state_refs" + else + matches=$(count_matching_refs "$event" "$git_dir") + fi + fi + + # Output TSV line: repo, npub, state_refs, git_refs, matches, reason + printf '%s\t%s\t%s\t%s\t%s\t%s\n' "$identifier" "$npub" "$state_refs" "$git_refs" "$matches" "$reason" +} + +# Main +main() { + local do_categorize=0 + local args=() + + # Parse arguments + for arg in "$@"; do + if [[ "$arg" == "--categorize" ]]; then + do_categorize=1 + else + args+=("$arg") + fi + done + + if [[ ${#args[@]} -ne 3 ]]; then + usage + fi + + local state_events_file="${args[0]}" + local git_base="${args[1]}" + local output_dir="${args[2]}" + + # Validate inputs + if [[ ! -f "$state_events_file" ]]; then + log_error "State events file not found: $state_events_file" + exit 1 + fi + + if [[ ! -d "$git_base" ]]; then + log_error "Git base directory not found: $git_base" + log_error "This script must run on the VPS with access to git directories." + exit 1 + fi + + # Check read permissions + if ! 
ls "$git_base" >/dev/null 2>&1; then
+    log_error "Cannot read git base directory (permission denied): $git_base"
+    log_error "Try running with sudo or grant read permissions."
+    exit 1
+  fi
+
+  # Pass the script arguments through: check_prerequisites echoes its own
+  # "$*" in the nix-shell hint, which is empty unless they are forwarded.
+  check_prerequisites "$@"
+
+  log_info "=== Git State Synchronization Check ==="
+  log_info "State events: $state_events_file"
+  log_info "Git base: $git_base"
+  log_info "Output: $output_dir"
+  if [[ $do_categorize -eq 1 ]]; then
+    log_info "Mode: TSV + categorization"
+  else
+    log_info "Mode: TSV only (use 20-categorize.sh for categories)"
+  fi
+  log_info "Started: $(date)"
+  echo ""
+
+  # Create output directory
+  mkdir -p "$output_dir"
+
+  # Output files
+  local tsv_file="$output_dir/git-sync-status.tsv"
+
+  # Initialize TSV with header
+  echo -e "repo\tnpub\tstate_refs\tgit_refs\tmatches\treason" > "$tsv_file"
+
+  # Initialize category files if categorizing (truncate any previous results)
+  local cat1="" cat2="" cat3="" cat4=""
+  if [[ $do_categorize -eq 1 ]]; then
+    cat1="$output_dir/category1-complete-match.txt"
+    cat2="$output_dir/category2-empty-blank.txt"
+    cat3="$output_dir/category3-partial-match.txt"
+    cat4="$output_dir/category4-no-match.txt"
+    > "$cat1"
+    > "$cat2"
+    > "$cat3"
+    > "$cat4"
+  fi
+
+  # Count total events
+  local total_events
+  total_events=$(wc -l < "$state_events_file" | tr -d ' ')
+  log_info "Processing $total_events state events..."
+  echo ""
+
+  # Process each event
+  local count=0
+  local processed=0
+  local skipped=0
+  local count_cat1=0 count_cat2=0 count_cat3=0 count_cat4=0
+  local start_time
+  start_time=$(date +%s)
+
+  # "|| [[ -n ... ]]" keeps a final event that has no trailing newline
+  while IFS= read -r event || [[ -n "$event" ]]; do
+    count=$((count + 1))
+
+    # Skip empty lines
+    [[ -z "$event" ]] && continue
+
+    # Process event
+    local result
+    if result=$(process_event "$event" "$git_base"); then
+      processed=$((processed + 1))
+
+      # Append the row to the TSV (the header was written above)
+      echo "$result" >> "$tsv_file"
+
+      # Categorize if requested
+      if [[ $do_categorize -eq 1 ]]; then
+        # Parse result
+        IFS=$'\t' read -r repo npub state_refs git_refs matches reason <<< "$result"
+
+        local category
+        category=$(categorize_entry "$state_refs" "$git_refs" "$matches" "$reason")
+
+        local cat_line
+        cat_line=$(format_category_line "$repo" "$npub" "$state_refs" "$git_refs" "$matches" "$reason")
+
+        case "$category" in
+          1) echo "$cat_line" >> "$cat1"; count_cat1=$((count_cat1 + 1)) ;;
+          2) echo "$cat_line" >> "$cat2"; count_cat2=$((count_cat2 + 1)) ;;
+          3) echo "$cat_line" >> "$cat3"; count_cat3=$((count_cat3 + 1)) ;;
+          4) echo "$cat_line" >> "$cat4"; count_cat4=$((count_cat4 + 1)) ;;
+        esac
+      fi
+    else
+      skipped=$((skipped + 1))
+    fi
+
+    # Progress indicator every 10 events
+    if [[ $((count % 10)) -eq 0 ]]; then
+      local elapsed=$(($(date +%s) - start_time))
+      local rate=0
+      if [[ $elapsed -gt 0 ]]; then
+        rate=$((count / elapsed))
+      fi
+      # The ETA is computed from elapsed time per processed event, not from the
+      # integer rate: below 1 event/s the rate truncates to 0 and an ETA based
+      # on it would stay "?" for the whole run.
+      local eta="?"
+      if [[ $elapsed -gt 0 && $count -le $total_events ]]; then
+        eta=$(( (total_events - count) * elapsed / count ))
+      fi
+      log_progress "Processed $count/$total_events events (~${rate}/s, ETA: ${eta}s)..."
+    fi
+  done < "$state_events_file"
+
+  # Clear progress line
+  echo "" >&2
+
+  local end_time
+  end_time=$(date +%s)
+  local duration=$((end_time - start_time))
+
+  # Summary
+  echo ""
+  log_info "=== Analysis Complete ==="
+  log_info "Finished: $(date)"
+  log_info "Duration: ${duration}s"
+  log_info "Processed: $processed events"
+  if [[ $skipped -gt 0 ]]; then
+    log_warn "Skipped: $skipped events (missing identifier or pubkey)"
+  fi
+  echo ""
+
+  if [[ $do_categorize -eq 1 ]]; then
+    # Calculate percentages (awk, because shell arithmetic is integer-only)
+    local total=$((count_cat1 + count_cat2 + count_cat3 + count_cat4))
+    local pct1=0 pct2=0 pct3=0 pct4=0
+    if [[ $total -gt 0 ]]; then
+      pct1=$(awk "BEGIN {printf \"%.1f\", ($count_cat1/$total)*100}")
+      pct2=$(awk "BEGIN {printf \"%.1f\", ($count_cat2/$total)*100}")
+      pct3=$(awk "BEGIN {printf \"%.1f\", ($count_cat3/$total)*100}")
+      pct4=$(awk "BEGIN {printf \"%.1f\", ($count_cat4/$total)*100}")
+    fi
+
+    log_info "=== Category Summary ==="
+    log_success "Category 1 (Complete Match): $count_cat1 ($pct1%)"
+    log_warn "Category 2 (Empty/Blank): $count_cat2 ($pct2%)"
+    log_warn "Category 3 (Partial Match): $count_cat3 ($pct3%)"
+    log_error "Category 4 (No Match): $count_cat4 ($pct4%)"
+    echo ""
+
+    # Validation warning
+    if [[ $count_cat2 -eq $total ]] && [[ $total -gt 0 ]]; then
+      log_error "WARNING: 100% of repos categorized as Empty/Blank"
+      log_error "This usually indicates a permission or path issue."
+      echo ""
+      log_info "Troubleshooting:"
+      echo " 1. Verify git data exists: sudo ls -la $git_base | head -10"
+      echo " 2. Check sample repo: sudo find $git_base -name '*.git' -type d | head -1"
+      echo " 3. 
Re-run with sudo if not already using it" + echo "" + fi + fi + + log_info "Output files:" + echo " $tsv_file" + if [[ $do_categorize -eq 1 ]]; then + echo " $cat1" + echo " $cat2" + echo " $cat3" + echo " $cat4" + else + echo "" + log_info "Next step: Run 20-categorize.sh to categorize results" + echo " ./20-categorize.sh $tsv_file $output_dir" + fi +} + +main "$@" diff --git a/docs/archive/2026-01-relay-ngit-dev-migration/scripts/20-categorize.sh b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/20-categorize.sh new file mode 100755 index 0000000..b38dc00 --- /dev/null +++ b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/20-categorize.sh @@ -0,0 +1,212 @@ +#!/usr/bin/env bash +# +# 20-categorize.sh - Categorize git sync status into 4 categories +# +# PHASE 3a of the GRASP relay to ngit-grasp migration analysis pipeline. +# Takes git-sync-status.tsv from Phase 2 and categorizes into 4 files. +# +# USAGE: +# ./20-categorize.sh +# +# EXAMPLES: +# ./20-categorize.sh output/prod/git-sync-status.tsv output/prod +# ./20-categorize.sh output/archive/git-sync-status.tsv output/archive +# +# INPUT FORMAT (git-sync-status.tsv): +# Tab-separated values with columns: +# reponpubstate_refsgit_refsmatchesreason +# +# Where reason is optional and can be: no_git_dir, empty_refs, no_state_refs +# +# OUTPUT: +# /category1-complete-match.txt - All refs match perfectly +# /category2-empty-blank.txt - No git data available +# /category3-partial-match.txt - Some refs match +# /category4-no-match.txt - Git exists but refs don't match +# +# OUTPUT FORMAT: +# repo | npub | state_refs=N | git_refs=N | matches=N [| reason=X] +# +# CATEGORIES: +# 1. Complete Match: state_refs == git_refs == matches (all > 0) +# 2. Empty/Blank: git_refs == 0 OR reason in (no_git_dir, empty_refs, no_state_refs) +# 3. Partial Match: matches > 0 AND matches < state_refs +# 4. 
No Match: git_refs > 0 AND matches == 0 +# +# PREREQUISITES: +# - awk (standard Unix tool) +# +# RUNTIME: < 1 second (local processing only) +# +# SEE ALSO: +# docs/how-to/migrate-to-ngit-grasp.md - Full migration guide +# 10-check-git-sync.sh - Phase 2 script that produces input for this script +# + +set -euo pipefail + +# Colors for output (disabled if not a terminal) +if [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + NC='\033[0m' +else + RED='' + GREEN='' + YELLOW='' + BLUE='' + NC='' +fi + +log_info() { + echo -e "${BLUE}[INFO]${NC} $*" >&2 +} + +log_success() { + echo -e "${GREEN}[OK]${NC} $*" >&2 +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $*" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $*" >&2 +} + +usage() { + echo "Usage: $0 " + echo "" + echo "Arguments:" + echo " git-sync-status.tsv TSV file from Phase 2 (10-check-git-sync.sh)" + echo " output-dir Directory to store categorized output" + echo "" + echo "Examples:" + echo " $0 output/prod/git-sync-status.tsv output/prod" + echo " $0 output/archive/git-sync-status.tsv output/archive" + echo "" + echo "Input format (TSV):" + echo " reponpubstate_refsgit_refsmatchesreason" + echo "" + echo "Output files:" + echo " category1-complete-match.txt - All refs match" + echo " category2-empty-blank.txt - No git data" + echo " category3-partial-match.txt - Some refs match" + echo " category4-no-match.txt - Git exists, refs don't match" + exit 1 +} + +# Main +main() { + if [[ $# -ne 2 ]]; then + usage + fi + + local input_file="$1" + local output_dir="$2" + + # Validate input file + if [[ ! 
-f "$input_file" ]]; then + log_error "Input file not found: $input_file" + exit 1 + fi + + log_info "Categorizing git sync status" + log_info "Input: $input_file" + log_info "Output: $output_dir" + + # Create output directory + mkdir -p "$output_dir" + + # Output files + local cat1="$output_dir/category1-complete-match.txt" + local cat2="$output_dir/category2-empty-blank.txt" + local cat3="$output_dir/category3-partial-match.txt" + local cat4="$output_dir/category4-no-match.txt" + + # Clear previous results + > "$cat1" + > "$cat2" + > "$cat3" + > "$cat4" + + # Process input file with awk + # Input: reponpubstate_refsgit_refsmatchesreason + awk -F'\t' -v cat1="$cat1" -v cat2="$cat2" -v cat3="$cat3" -v cat4="$cat4" ' + BEGIN { + count1 = 0; count2 = 0; count3 = 0; count4 = 0 + } + NR == 1 && /^repo/ { next } # Skip header if present + NF >= 5 { + repo = $1 + npub = $2 + state_refs = int($3) + git_refs = int($4) + matches = int($5) + reason = (NF >= 6) ? $6 : "" + + # Format output line + if (reason != "") { + line = repo " | " npub " | state_refs=" state_refs " | git_refs=" git_refs " | matches=" matches " | reason=" reason + } else { + line = repo " | " npub " | state_refs=" state_refs " | git_refs=" git_refs " | matches=" matches + } + + # Categorize + if (reason == "no_git_dir" || reason == "empty_refs" || reason == "no_state_refs" || git_refs == 0) { + # Category 2: Empty/Blank + print line >> cat2 + count2++ + } else if (state_refs > 0 && state_refs == git_refs && matches == state_refs) { + # Category 1: Complete Match + print line >> cat1 + count1++ + } else if (matches > 0 && matches < state_refs) { + # Category 3: Partial Match + print line >> cat3 + count3++ + } else if (git_refs > 0 && matches == 0) { + # Category 4: No Match + print line >> cat4 + count4++ + } else if (matches > 0) { + # Edge case: matches > 0 but does not fit other categories + # This can happen when git_refs > state_refs but all state refs match + # Treat as partial match + print line 
>> cat3 + count3++ + } else { + # Fallback: treat as category 2 (empty/blank) + print line >> cat2 + count2++ + } + } + END { + total = count1 + count2 + count3 + count4 + print "COUNTS:" count1 ":" count2 ":" count3 ":" count4 ":" total + } + ' "$input_file" 2>&1 | while IFS= read -r line; do + if [[ "$line" =~ ^COUNTS: ]]; then + # Parse counts from awk output + IFS=':' read -r _ c1 c2 c3 c4 total <<< "$line" + + echo "" + log_info "=== Categorization Summary ===" + log_info "Total entries: $total" + log_success "Category 1 (Complete Match): $c1" + log_warn "Category 2 (Empty/Blank): $c2" + log_warn "Category 3 (Partial Match): $c3" + log_error "Category 4 (No Match): $c4" + echo "" + log_info "Output files:" + echo " $cat1" + echo " $cat2" + echo " $cat3" + echo " $cat4" + fi + done +} + +main "$@" diff --git a/docs/archive/2026-01-relay-ngit-dev-migration/scripts/21-compare-relays.sh b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/21-compare-relays.sh new file mode 100755 index 0000000..b9c0d30 --- /dev/null +++ b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/21-compare-relays.sh @@ -0,0 +1,294 @@ +#!/usr/bin/env bash +# +# 21-compare-relays.sh - Compare prod vs archive category files to find gaps +# +# PHASE 3b of the GRASP relay to ngit-grasp migration analysis pipeline. 
# Compares categorized output from prod and archive to identify:
#   - Repos complete in prod but missing/incomplete in archive
#   - Repos in archive but not in prod
#   - Status differences between relays
#
# USAGE:
#   ./21-compare-relays.sh <prod-dir> <archive-dir> <output-dir>
#
# EXAMPLES:
#   ./21-compare-relays.sh output/prod output/archive output/comparison
#
# INPUT:
#   Both prod-dir and archive-dir must contain:
#     - category1-complete-match.txt
#     - category2-empty-blank.txt
#     - category3-partial-match.txt
#     - category4-no-match.txt
#
# OUTPUT:
#   <output-dir>/complete-in-both.txt                  - Repos complete in both relays (no action)
#   <output-dir>/complete-prod-missing-archive.txt     - Complete in prod, not in archive cat1
#   <output-dir>/complete-prod-incomplete-archive.txt  - Complete in prod, incomplete in archive
#   <output-dir>/incomplete-in-both.txt                - Incomplete in both relays
#   <output-dir>/in-archive-not-prod.txt               - In archive but not in prod
#   <output-dir>/summary.txt                           - Human-readable summary
#
# OUTPUT FORMAT:
#   Each file contains lines in the format:
#     repo | npub | prod_status | archive_status
#
# PREREQUISITES:
#   - awk, sort, comm (standard Unix tools)
#
# RUNTIME: < 1 second (local processing only)
#
# SEE ALSO:
#   docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
#   20-categorize.sh - Phase 3a script that produces input for this script
#

set -euo pipefail

# Colors for output (disabled if not a terminal)
if [[ -t 1 ]]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    NC='\033[0m'
else
    RED=''
    GREEN=''
    YELLOW=''
    BLUE=''
    NC=''
fi

# Logging helpers: all diagnostics go to stderr so stdout stays clean.
log_info() {
    echo -e "${BLUE}[INFO]${NC} $*" >&2
}

log_success() {
    echo -e "${GREEN}[OK]${NC} $*" >&2
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $*" >&2
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $*" >&2
}

# Print the command synopsis and exit with status 1.
usage() {
    echo "Usage: $0 <prod-dir> <archive-dir> <output-dir>"
    echo ""
    echo "Arguments:"
    echo "  prod-dir     Directory containing prod category files"
    echo "  archive-dir  Directory containing archive category files"
    echo "  output-dir   Directory to store comparison results"
    echo ""
    echo "Examples:"
    echo "  $0 output/prod output/archive output/comparison"
    echo ""
    echo "Required input files in each directory:"
    echo "  category1-complete-match.txt"
    echo "  category2-empty-blank.txt"
    echo "  category3-partial-match.txt"
    echo "  category4-no-match.txt"
    exit 1
}

# Extract repo|npub key from category line (filter: stdin -> stdout)
# Input:  "repo | npub | state_refs=N | ..."
# Output: "repo|npub"
extract_key() {
    awk -F' \\| ' '{print $1 "|" $2}'
}

# Build lookup table from category files
# Args: $1=directory, $2=output_file
# Output file lines: repo|npub|catN|<original category line>, sorted by key.
build_lookup() {
    local dir="$1"
    local output="$2"
    local n file

    # Process all 4 category files. Globbing directly (instead of parsing
    # `ls`) keeps paths with spaces intact; one awk pass per file replaces
    # the per-line subshell pipeline.
    for n in 1 2 3 4; do
        for file in "$dir"/category"${n}"-*.txt; do
            # An unmatched glob stays literal - skip it
            [[ -f "$file" ]] || continue
            # Blank lines carry no repo/npub and would become bogus "||catN|" keys
            awk -F' \\| ' -v category="cat${n}" \
                '$0 !~ /^[[:space:]]*$/ { print $1 "|" $2 "|" category "|" $0 }' "$file"
        done
    done | sort -t'|' -k1,2 > "$output"
}

# Main
# Args: $1=prod-dir, $2=archive-dir, $3=output-dir
main() {
    if [[ $# -ne 3 ]]; then
        usage
    fi

    local prod_dir="$1"
    local archive_dir="$2"
    local output_dir="$3"
    local dir

    # Validate input directories
    for dir in "$prod_dir" "$archive_dir"; do
        if [[ ! -d "$dir" ]]; then
            log_error "Directory not found: $dir"
            exit 1
        fi
        if [[ ! -f "$dir/category1-complete-match.txt" ]]; then
            log_error "Missing category1-complete-match.txt in $dir"
            exit 1
        fi
    done

    log_info "Comparing relay categories"
    log_info "Prod: $prod_dir"
    log_info "Archive: $archive_dir"
    log_info "Output: $output_dir"

    # Create output directory
    mkdir -p "$output_dir"

    # Create temp files for processing
    local tmp_dir
    tmp_dir=$(mktemp -d)
    # Expand $tmp_dir now: the local is out of scope when the EXIT trap fires.
    # shellcheck disable=SC2064
    trap "rm -rf '$tmp_dir'" EXIT

    log_info "Building lookup tables..."

    # Build lookup tables: key|category|full_line
    build_lookup "$prod_dir" "$tmp_dir/prod_lookup.txt"
    build_lookup "$archive_dir" "$tmp_dir/archive_lookup.txt"

    # Extract just keys for comparison
    cut -d'|' -f1,2 "$tmp_dir/prod_lookup.txt" | sort -u > "$tmp_dir/prod_keys.txt"
    cut -d'|' -f1,2 "$tmp_dir/archive_lookup.txt" | sort -u > "$tmp_dir/archive_keys.txt"

    log_info "Comparing categories..."

    # Initialize output files (they must exist even when empty)
    : > "$output_dir/complete-in-both.txt"
    : > "$output_dir/complete-prod-missing-archive.txt"
    : > "$output_dir/complete-prod-incomplete-archive.txt"
    : > "$output_dir/incomplete-in-both.txt"
    : > "$output_dir/in-archive-not-prod.txt"

    # Classify every prod entry against the archive in one awk pass.
    # Keys are compared as exact strings (a per-entry grep would interpret
    # repo names as regular expressions) and the archive table is loaded
    # once instead of being re-scanned for every prod line.
    awk -F'|' -v out="$output_dir" '
        BEGIN {
            f_both       = out "/complete-in-both.txt"
            f_missing    = out "/complete-prod-missing-archive.txt"
            f_incomplete = out "/complete-prod-incomplete-archive.txt"
            f_neither    = out "/incomplete-in-both.txt"
        }
        # Pass 1 (archive lookup): remember the first category seen per key
        pass == 1 {
            key = $1 "|" $2
            if (!(key in archive_cat)) archive_cat[key] = $3
            next
        }
        # Pass 2 (prod lookup)
        {
            key = $1 "|" $2
            prefix = $1 " | " $2 " | prod="
            if ($3 == "cat1") {
                if (!(key in archive_cat)) {
                    # Not in archive at all
                    print prefix "complete | archive=missing" >> f_missing
                } else if (archive_cat[key] == "cat1") {
                    # Complete in both
                    print prefix "complete | archive=complete" >> f_both
                } else {
                    # Complete in prod, incomplete in archive
                    print prefix "complete | archive=" archive_cat[key] >> f_incomplete
                }
            } else if ($3 == "cat2" || $3 == "cat3" || $3 == "cat4") {
                # Buffered per category so the file lists cat2, then cat3, then cat4
                if (!(key in archive_cat)) {
                    # Incomplete in prod, missing in archive
                    pending[$3] = pending[$3] prefix $3 " | archive=missing\n"
                } else if (archive_cat[key] != "cat1") {
                    # Incomplete in both
                    pending[$3] = pending[$3] prefix $3 " | archive=" archive_cat[key] "\n"
                }
                # If archive is complete but prod is not, that is unusual but not an error
            }
        }
        END {
            printf "%s%s%s", pending["cat2"], pending["cat3"], pending["cat4"] >> f_neither
        }
    ' pass=1 "$tmp_dir/archive_lookup.txt" pass=2 "$tmp_dir/prod_lookup.txt"

    # Find entries in archive but not in prod
    comm -23 "$tmp_dir/archive_keys.txt" "$tmp_dir/prod_keys.txt" \
        | awk -F'|' '
            pass == 1 {
                key = $1 "|" $2
                if (!(key in archive_cat)) archive_cat[key] = $3
                next
            }
            { print $1 " | " $2 " | prod=missing | archive=" archive_cat[$1 "|" $2] }
        ' pass=1 "$tmp_dir/archive_lookup.txt" pass=2 - \
        >> "$output_dir/in-archive-not-prod.txt"

    # Count results
    local count_both count_missing count_incomplete count_both_incomplete count_archive_only
    count_both=$(wc -l < "$output_dir/complete-in-both.txt" | tr -d ' ')
    count_missing=$(wc -l < "$output_dir/complete-prod-missing-archive.txt" | tr -d ' ')
    count_incomplete=$(wc -l < "$output_dir/complete-prod-incomplete-archive.txt" | tr -d ' ')
    count_both_incomplete=$(wc -l < "$output_dir/incomplete-in-both.txt" | tr -d ' ')
    count_archive_only=$(wc -l < "$output_dir/in-archive-not-prod.txt" | tr -d ' ')

    # Generate summary
    cat > "$output_dir/summary.txt" << EOF
# Relay Comparison Summary
Generated: $(date -Iseconds)

## Input
- Prod: $prod_dir
- Archive: $archive_dir

## Results

### No Action Required
- Complete in both relays: $count_both

### Action/Decision Required
- Complete in prod, MISSING from archive: $count_missing
- Complete in prod, INCOMPLETE in archive: $count_incomplete
- Incomplete in BOTH relays: $count_both_incomplete

### For Reference
- In archive but not in prod: $count_archive_only

## Files
- complete-in-both.txt: Repos successfully migrated (no action)
- complete-prod-missing-archive.txt: Need investigation - why not in archive?
- complete-prod-incomplete-archive.txt: Archive sync may still be in progress
- incomplete-in-both.txt: Git data incomplete on both relays
- in-archive-not-prod.txt: May be deleted from prod or new to archive

## Next Steps
1. Review complete-prod-missing-archive.txt - these repos need attention
2. Check if archive sync is still running for incomplete entries
3. Cross-reference with deletion events (kind 5) from Phase 1
4. Use Phase 4 logs to understand parse failures and purgatory expiry
EOF

    # Display summary
    echo ""
    log_info "=== Comparison Summary ==="
    log_success "Complete in both: $count_both (no action needed)"
    log_error "Complete in prod, MISSING from archive: $count_missing"
    log_warn "Complete in prod, incomplete in archive: $count_incomplete"
    log_warn "Incomplete in both: $count_both_incomplete"
    log_info "In archive only: $count_archive_only"
    echo ""
    log_info "Output files:"
    echo "  $output_dir/complete-in-both.txt"
    echo "  $output_dir/complete-prod-missing-archive.txt"
    echo "  $output_dir/complete-prod-incomplete-archive.txt"
    echo "  $output_dir/incomplete-in-both.txt"
    echo "  $output_dir/in-archive-not-prod.txt"
    echo "  $output_dir/summary.txt"
}

main "$@"

# ---- patch boundary (original diff metadata, kept as comments) ----
# diff --git a/docs/archive/2026-01-relay-ngit-dev-migration/scripts/22-compare-git-data.sh b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/22-compare-git-data.sh
# new file mode 100755
# index 0000000..76521d4
# --- /dev/null
# +++ b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/22-compare-git-data.sh
# @@ -0,0 +1,390 @@
#!/usr/bin/env bash
#
# 22-compare-git-data.sh - Compare actual git data between prod and archive relays
#
# PHASE 3c of the GRASP relay to ngit-grasp migration analysis pipeline.
# Compares actual git commits between prod and archive to determine which is ahead.
#
# KEY INSIGHT:
#   Archive (ngit-grasp) enforces GRASP - git data ALWAYS matches a state event.
#   If archive has different/newer data than prod, it means:
#     - A state event authorized those commits at some point
#     - Archive is actually MORE up-to-date than prod
#     - Migration should use archive data (it's already correct)
#
# USAGE:
#   ./22-compare-git-data.sh <prod-git-base> <archive-git-base> <repo-list> <output-dir>
#
# EXAMPLES:
#   ./22-compare-git-data.sh /var/lib/grasp-relay/git /var/lib/ngit-grasp/git \
#     output/comparison/complete-prod-incomplete-archive.txt output/comparison
#
# INPUT:
#   prod-git-base     Base directory for prod git repos (e.g., /var/lib/grasp-relay/git)
#   archive-git-base  Base directory for archive git repos (e.g., /var/lib/ngit-grasp/git)
#   repo-list         File with repos to compare (format: "repo | npub | ...")
#
# OUTPUT:
#   <output-dir>/git-ancestry.tsv - Tab-separated values:
#     repo<TAB>npub<TAB>relationship<TAB>details
#
#   Relationship values:
#     archive-ahead - Archive has all prod commits plus more (GOOD - use archive)
#     in-sync       - Both have identical commits
#     prod-ahead    - Prod has commits archive is missing (needs re-sync)
#     diverged      - Both have unique commits (manual review)
#     archive-only  - Only archive has git data
#     prod-only     - Only prod has git data
#     both-empty    - Neither has git data
#
# PREREQUISITES:
#   - git (for ref comparison)
#   - Read access to both git directories (may need sudo)
#
# RUNTIME: Depends on number of repos to compare
#
# SEE ALSO:
#   docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
#   21-compare-relays.sh - Phase 3b script that identifies repos to compare
#

set -euo pipefail

# Colors for output (disabled if not a terminal)
if [[ -t 1 ]]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    NC='\033[0m'
else
    RED=''
    GREEN=''
    YELLOW=''
    BLUE=''
    NC=''
fi

# Logging helpers: all diagnostics go to stderr so stdout stays clean.
log_info() {
    echo -e "${BLUE}[INFO]${NC} $*" >&2
}

log_success() {
    echo -e "${GREEN}[OK]${NC} $*" >&2
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $*" >&2
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $*" >&2
}

# Same-line progress indicator (carriage return, no trailing newline).
log_progress() {
    echo -ne "\r${BLUE}[PROGRESS]${NC} $*" >&2
}

# Print the command synopsis and exit with status 1.
usage() {
    echo "Usage: $0 <prod-git-base> <archive-git-base> <repo-list> <output-dir>"
    echo ""
    echo "Arguments:"
    echo "  prod-git-base     Base directory for prod git repos"
    echo "  archive-git-base  Base directory for archive git repos"
    echo "  repo-list         File with repos to compare (format: 'repo | npub | ...')"
    echo "  output-dir        Directory to store output files"
    echo ""
    echo "Examples:"
    echo "  $0 /var/lib/grasp-relay/git /var/lib/ngit-grasp/git \\"
    echo "    output/comparison/complete-prod-incomplete-archive.txt output/comparison"
    echo ""
    echo "Output:"
    echo "  git-ancestry.tsv - TSV with: repo, npub, relationship, details"
    exit 1
}

# Get all branch refs from a git directory
# Args: $1=git_dir
# Returns: sorted "commit_hash ref_name" lines (git show-ref format);
#          nothing if the directory is missing or show-ref fails
get_git_refs() {
    local git_dir="$1"

    if [[ ! -d "$git_dir" ]]; then
        return
    fi

    git --git-dir="$git_dir" show-ref --heads 2>/dev/null | sort || true
}

# Check if commit A is ancestor of commit B
# Args: $1=git_dir, $2=commit_a, $3=commit_b
# Returns: 0 if A is ancestor of B, non-zero otherwise (including when
#          either commit is not present in git_dir)
is_ancestor() {
    local git_dir="$1"
    local commit_a="$2"
    local commit_b="$3"

    git --git-dir="$git_dir" merge-base --is-ancestor "$commit_a" "$commit_b" 2>/dev/null
}

# Ancestry check across several repositories.
# Args: $1=commit_a, $2=commit_b, $3...=git dirs to consult in order
# Returns: 0 as soon as one repo can show A is an ancestor of B, 1 otherwise.
# A single repo is not enough: the side that is behind usually does not
# contain the other side's newer commit, so the check fails there.
_is_ancestor_in_any() {
    local commit_a="$1"
    local commit_b="$2"
    local git_dir
    shift 2

    for git_dir in "$@"; do
        if is_ancestor "$git_dir" "$commit_a" "$commit_b"; then
            return 0
        fi
    done
    return 1
}

# Look up the hash of one branch in show-ref output.
# Args: $1=refs text ("hash ref" lines), $2=full ref name
# Prints the hash, or nothing if the ref is absent. The ref name is compared
# as an exact string, so names containing '.' cannot match other branches.
_ref_hash() {
    local refs="$1"
    local ref="$2"

    awk -v ref="$ref" '$2 == ref { print $1; exit }' <<< "$refs"
}

# Compare git data between prod and archive for a single repo
# Args: $1=prod_git_dir, $2=archive_git_dir
# Returns (stdout): relationship string - one of archive-ahead, in-sync,
#   prod-ahead, diverged, archive-only, prod-only, both-empty
compare_repo_git() {
    local prod_git="$1"
    local archive_git="$2"

    local prod_exists=false
    local archive_exists=false

    if [[ -d "$prod_git" ]]; then prod_exists=true; fi
    if [[ -d "$archive_git" ]]; then archive_exists=true; fi

    # Handle cases where one or both don't exist
    if [[ "$prod_exists" == "false" && "$archive_exists" == "false" ]]; then
        echo "both-empty"
        return
    fi

    if [[ "$prod_exists" == "false" ]]; then
        echo "archive-only"
        return
    fi

    if [[ "$archive_exists" == "false" ]]; then
        echo "prod-only"
        return
    fi

    # Both exist - get refs
    local prod_refs archive_refs
    prod_refs=$(get_git_refs "$prod_git")
    archive_refs=$(get_git_refs "$archive_git")

    # Handle empty refs
    if [[ -z "$prod_refs" && -z "$archive_refs" ]]; then
        echo "both-empty"
        return
    fi

    if [[ -z "$prod_refs" ]]; then
        echo "archive-only"
        return
    fi

    if [[ -z "$archive_refs" ]]; then
        echo "prod-only"
        return
    fi

    # Compare refs - check if they're identical
    if [[ "$prod_refs" == "$archive_refs" ]]; then
        echo "in-sync"
        return
    fi

    # Refs differ - need to check ancestry
    # Strategy: For each branch, check if one is ancestor of the other
    # If all archive branches are ahead of or equal to prod branches, archive is ahead
    # If all prod branches are ahead of or equal to archive branches, prod is ahead
    # Otherwise, they've diverged

    local archive_ahead=true
    local prod_ahead=true
    local has_common_branch=false
    local prod_hash prod_ref archive_hash archive_ref

    # Check each prod branch against archive
    while read -r prod_hash prod_ref; do
        [[ -z "$prod_hash" ]] && continue

        # Get the same branch from archive
        archive_hash=$(_ref_hash "$archive_refs" "$prod_ref")

        if [[ -z "$archive_hash" ]]; then
            # Branch exists in prod but not archive - prod has something archive doesn't
            # But this could be a deleted branch, so don't immediately say prod is ahead
            continue
        fi

        has_common_branch=true

        if [[ "$prod_hash" == "$archive_hash" ]]; then
            # Same commit - neither ahead for this branch
            continue
        fi

        # Different commits - check ancestry. Consult both repositories:
        # archive first (the migration target), then prod, because whichever
        # side is behind will not contain the other side's tip commit.
        if _is_ancestor_in_any "$prod_hash" "$archive_hash" "$archive_git" "$prod_git"; then
            # Prod commit is ancestor of archive commit - archive is ahead for this branch
            prod_ahead=false
        elif _is_ancestor_in_any "$archive_hash" "$prod_hash" "$archive_git" "$prod_git"; then
            # Archive commit is ancestor of prod commit - prod is ahead for this branch
            archive_ahead=false
        else
            # Neither is ancestor - diverged
            archive_ahead=false
            prod_ahead=false
        fi
    done <<< "$prod_refs"

    # Also check for branches only in archive (archive has extra branches)
    while read -r archive_hash archive_ref; do
        [[ -z "$archive_hash" ]] && continue

        prod_hash=$(_ref_hash "$prod_refs" "$archive_ref")

        if [[ -z "$prod_hash" ]]; then
            # Branch exists in archive but not prod - archive has something prod doesn't
            # This means archive is ahead (has extra branches)
            prod_ahead=false
        fi
    done <<< "$archive_refs"

    # Determine final relationship
    if [[ "$has_common_branch" == "false" ]]; then
        # No common branches - completely different
        echo "diverged"
        return
    fi

    if [[ "$archive_ahead" == "true" && "$prod_ahead" == "false" ]]; then
        echo "archive-ahead"
    elif [[ "$prod_ahead" == "true" && "$archive_ahead" == "false" ]]; then
        echo "prod-ahead"
    elif [[ "$archive_ahead" == "true" && "$prod_ahead" == "true" ]]; then
        # Both true means all common branches are identical
        # But one might have extra branches
        echo "in-sync"
    else
        echo "diverged"
    fi
}

# Main
# Args: $1=prod-git-base, $2=archive-git-base, $3=repo-list, $4=output-dir
main() {
    if [[ $# -ne 4 ]]; then
        usage
    fi

    local prod_git_base="$1"
    local archive_git_base="$2"
    local repo_list="$3"
    local output_dir="$4"

    # Validate inputs
    if [[ ! -d "$prod_git_base" ]]; then
        log_error "Prod git base directory not found: $prod_git_base"
        exit 1
    fi

    if [[ ! -d "$archive_git_base" ]]; then
        log_error "Archive git base directory not found: $archive_git_base"
        exit 1
    fi

    if [[ ! -f "$repo_list" ]]; then
        log_error "Repo list file not found: $repo_list"
        exit 1
    fi

    log_info "=== Git Data Comparison ==="
    log_info "Prod git base: $prod_git_base"
    log_info "Archive git base: $archive_git_base"
    log_info "Repo list: $repo_list"
    log_info "Output: $output_dir"
    log_info "Started: $(date)"
    echo ""

    # Create output directory
    mkdir -p "$output_dir"

    # Output file
    local tsv_file="$output_dir/git-ancestry.tsv"

    # Initialize TSV with header
    echo -e "repo\tnpub\trelationship\tdetails" > "$tsv_file"

    # Count repos. grep -c already prints 0 when nothing matches (and exits 1),
    # so only neutralise the exit status; appending another "0" would yield "0\n0".
    local total_repos
    total_repos=$(grep -c -v '^#' "$repo_list" 2>/dev/null || true)
    total_repos="${total_repos:-0}"
    log_info "Processing $total_repos repos..."
    echo ""

    # Counters
    local count=0
    local count_archive_ahead=0
    local count_in_sync=0
    local count_prod_ahead=0
    local count_diverged=0
    local count_archive_only=0
    local count_prod_only=0
    local count_both_empty=0

    local repo npub prod_git archive_git relationship
    local details=""   # reserved TSV column, currently always empty

    # Process each repo ("|| [[ -n ... ]]" handles a final line without newline)
    while IFS='|' read -r repo npub _ || [[ -n "$repo" ]]; do
        # Skip comments and empty lines
        [[ "$repo" =~ ^# ]] && continue
        [[ -z "$repo" ]] && continue

        # Clean up whitespace
        repo="${repo// /}"
        npub="${npub// /}"

        [[ -z "$repo" || -z "$npub" ]] && continue

        count=$((count + 1))

        # Build git paths
        prod_git="$prod_git_base/${npub}/${repo}.git"
        archive_git="$archive_git_base/${npub}/${repo}.git"

        # Compare
        relationship=$(compare_repo_git "$prod_git" "$archive_git")

        # Count by relationship
        case "$relationship" in
            archive-ahead) count_archive_ahead=$((count_archive_ahead + 1)) ;;
            in-sync)       count_in_sync=$((count_in_sync + 1)) ;;
            prod-ahead)    count_prod_ahead=$((count_prod_ahead + 1)) ;;
            diverged)      count_diverged=$((count_diverged + 1)) ;;
            archive-only)  count_archive_only=$((count_archive_only + 1)) ;;
            prod-only)     count_prod_only=$((count_prod_only + 1)) ;;
            both-empty)    count_both_empty=$((count_both_empty + 1)) ;;
        esac

        # Output TSV line
        printf '%s\t%s\t%s\t%s\n' "$repo" "$npub" "$relationship" "$details" >> "$tsv_file"

        # Progress indicator every 10 repos
        if [[ $((count % 10)) -eq 0 ]]; then
            log_progress "Processed $count/$total_repos repos..."
        fi
    done < "$repo_list"

    # Clear progress line
    echo "" >&2

    # Summary
    echo ""
    log_info "=== Comparison Summary ==="
    log_success "Archive ahead (use archive data): $count_archive_ahead"
    log_success "In sync: $count_in_sync"
    log_warn "Prod ahead (needs re-sync): $count_prod_ahead"
    log_error "Diverged (manual review): $count_diverged"
    log_info "Archive only: $count_archive_only"
    log_info "Prod only: $count_prod_only"
    log_info "Both empty: $count_both_empty"
    echo ""
    log_info "Total: $count repos"
    log_info "Output: $tsv_file"
}

main "$@"

# ---- patch boundary (original diff metadata, kept as comments) ----
# diff --git a/docs/archive/2026-01-relay-ngit-dev-migration/scripts/30-extract-parse-failures.sh b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/30-extract-parse-failures.sh
# new file mode 100755
# index 0000000..d762aae
# --- /dev/null
# +++ b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/30-extract-parse-failures.sh
# @@ -0,0 +1,774 @@
#!/usr/bin/env bash
#
# 30-extract-parse-failures.sh - Extract parse failure events from systemd logs
#
# PHASE 4a of the GRASP relay to ngit-grasp migration analysis pipeline.
# Extracts structured [PARSE_FAIL] log entries AND "Invalid announcement"
# rejections from journalctl.
#
# USAGE:
#   ./30-extract-parse-failures.sh <service-name> <output-dir> [options]
#
# EXAMPLES:
#   # Extract from ngit-grasp service (last 30 days, default)
#   ./30-extract-parse-failures.sh ngit-grasp.service output/logs
#
#   # Extract with custom time range
#   ./30-extract-parse-failures.sh ngit-grasp.service output/logs --since "2026-01-01"
#
#   # Extract from specific time window
#   ./30-extract-parse-failures.sh ngit-grasp.service output/logs --since "2026-01-15" --until "2026-01-22"
#
# OPTIONS:
#   --since <date>   Start date for log extraction (default: 30 days ago)
#   --until <date>   End date for log extraction (default: now)
#   --dry-run        Show what would be extracted without writing files
#
# ENRICHMENT:
#   The script automatically enriches parse failures with repo/npub information
#   by extracting from "Added rejected announcement" log entries which include
#   pubkey and identifier fields. Hex pubkeys are converted to npub format using
#   `nak encode npub <hex>` if the nak tool is available.
#
# OUTPUT:
#   <output-dir>/parse-failures.txt
#
# OUTPUT FORMAT (TSV):
#   event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
#
# EXPECTED LOG FORMATS:
#   The script looks for three types of log entries:
#
#   1. Structured [PARSE_FAIL] entries:
#      2026-01-22T10:30:45Z ngit-grasp[1234]: [PARSE_FAIL] kind=30618 event_id=abc123... reason="invalid refs format" repo=myrepo npub=npub1...
#
#   2. "Invalid announcement" rejections (write policy):
#      Event rejected by write policy event_id=abc123... relay=wss://... kind=30617 reason=Invalid announcement: multiple clone tags found...
#
#   3. "Added rejected announcement" entries (for enrichment):
#      Added rejected announcement to two-tier index event_id=abc123... kind=30617 identifier=myrepo pubkey=hex...
#      These entries provide pubkey and identifier for enriching write policy rejections.
#
#   NOTE: Builder logs ("Rejected repository announcement note1xxx:") are NOT extracted
#   because they use bech32 (note1) IDs while write policy logs use hex IDs. Extracting
#   both would cause double-counting since deduplication only works within each format.
#   Write policy logs contain the same events, so we don't lose any data.
#
#   Required fields: kind, event_id, reason
#   Enrichment fields: repo (identifier), npub (converted from hex pubkey)
#
# DEPENDENCY:
#   This script requires logging improvements in ngit-grasp to emit structured
#   [PARSE_FAIL] log entries. Until those are implemented, this script will
#   find no matching entries (which is handled gracefully).
#
#   "Invalid announcement" rejections are logged by the write policy and
#   should be present in any ngit-grasp deployment.
#
#   See: docs/how-to/migrate-to-ngit-grasp.md (Dependencies section)
#
#   Expected Rust logging code for [PARSE_FAIL]:
#     tracing::warn!(
#         target: "migration",
#         "[PARSE_FAIL] kind={} event_id={} reason=\"{}\" repo={} npub={}",
#         event.kind, event.id, reason, identifier, npub
#     );
#
# PREREQUISITES:
#   - journalctl (systemd)
#   - grep, awk, sed (standard Unix tools)
#   - Access to systemd journal (may require sudo or journal group membership)
#
# RUNTIME: Depends on log volume, typically < 30 seconds
#
# SEE ALSO:
#   docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
#   31-extract-purgatory-expiry.sh - Companion script for purgatory expiry logs
#

set -euo pipefail

# Get script directory for sourcing helpers
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Source the service validation helper (optional: the script only uses it
# when the file sits next to this script)
if [[ -f "$SCRIPT_DIR/validate-service.sh" ]]; then
    # shellcheck source=/dev/null
    source "$SCRIPT_DIR/validate-service.sh"
fi

# Colors for output (disabled if not a terminal)
if [[ -t 1 ]]; then
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[0;33m'
    BLUE='\033[0;34m'
    NC='\033[0m'
else
    RED=''
    GREEN=''
    YELLOW=''
    BLUE=''
    NC=''
fi

# Logging helpers: all diagnostics go to stderr so stdout stays clean.
log_info() {
    echo -e "${BLUE}[INFO]${NC} $*" >&2
}

log_success() {
    echo -e "${GREEN}[OK]${NC} $*" >&2
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $*" >&2
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $*" >&2
}

# Print the command synopsis (positional arguments first, then options)
# and exit with status 1.
usage() {
    echo "Usage: $0 <service-name> <output-dir> [options]"
    echo ""
    echo "Arguments:"
    echo "  service-name  Systemd service name (e.g., ngit-grasp.service)"
    echo "  output-dir    Directory to store extracted log data"
    echo ""
    echo "Options:"
    echo "  --since <date>  Start date (default: 30 days ago)"
    echo "  --until <date>  End date (default: now)"
    echo "  --dry-run       Show what would be extracted without writing"
    echo ""
    echo "Examples:"
    echo "  $0 ngit-grasp.service output/logs"
    echo "  $0 ngit-grasp.service output/logs --since '2026-01-01'"
    echo "  $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'"
    echo ""
    echo "Expected log formats:"
    echo "  [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..."
    echo "  Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..."
    echo ""
    echo "Enrichment:"
    echo "  Parse failures are automatically enriched with repo/npub from"
    echo "  'Added rejected announcement' log entries. Hex pubkeys are converted"
    echo "  to npub format using 'nak encode npub' if available."
    exit 1
}

# =============================================================================
# AWK-BASED BATCH PARSING FUNCTIONS
# =============================================================================
# These functions use awk for efficient batch processing instead of per-line
# grep calls. This provides ~400x speedup for large log files.
#
# NOTE: parse_builder_rejection_line() was removed to fix double-counting bug.
# Builder logs use bech32 (note1) IDs while write policy logs use hex IDs.
# Since deduplication only works within each format, extracting both caused
# the same event to be counted twice. Write policy logs contain the same
# events, so we don't lose any data by only extracting from that source.

# Shared awk helper, prepended to each parser program below.
# value_after(prefix, pattern) returns the text matching `pattern` that
# directly follows the literal `prefix` at the leftmost match in $0, or ""
# if there is none. It relies only on POSIX match()/RSTART/RLENGTH; the
# three-argument match(s, re, arr) form is a gawk extension and is a syntax
# error under mawk and BSD awk.
_PARSE_AWK_LIB='
    function value_after(prefix, pattern) {
        if (match($0, prefix pattern))
            return substr($0, RSTART + length(prefix), RLENGTH - length(prefix))
        return ""
    }
'

# Parse [PARSE_FAIL] log lines in batch using awk
# Input: file containing log lines with [PARSE_FAIL]
# Output: TSV lines: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
#         (lines lacking kind, event_id or a non-empty reason are dropped)
parse_parse_fail_batch() {
    local input_file="$1"
    awk "$_PARSE_AWK_LIB"'
    {
        # Required: kind=NUMBER, event_id=HEX, reason="QUOTED TEXT"
        kind     = value_after("kind=", "[0-9]+")
        event_id = value_after("event_id=", "[a-f0-9]+")
        reason   = value_after("reason=", "\"[^\"]*\"")
        if (reason != "") reason = substr(reason, 2, length(reason) - 2)  # strip quotes

        # Optional: repo=VALUE and npub=VALUE (up to the next space)
        repo = value_after("repo=", "[^ ]+")
        npub = value_after("npub=", "[^ ]+")

        # Output if we have required fields
        if (kind != "" && event_id != "" && reason != "") {
            print event_id "\t" kind "\t" reason "\t" repo "\t" npub
        }
    }
    ' "$input_file"
}

# Parse "Invalid announcement" rejection log lines in batch using awk
# Input: file containing "Event rejected by write policy" log lines
# Output: TSV lines: event_id<TAB>kind<TAB>reason<TAB><TAB>
#         (repo and npub columns are left empty for later enrichment)
parse_write_policy_rejection_batch() {
    local input_file="$1"
    awk "$_PARSE_AWK_LIB"'
    {
        event_id = value_after("event_id=", "[a-f0-9]+")
        kind     = value_after("kind=", "[0-9]+")
        # reason is unquoted here: everything after "reason=" to end of line
        reason   = value_after("reason=", ".*$")

        # Output if we have required fields (repo and npub are empty)
        if (kind != "" && event_id != "" && reason != "") {
            print event_id "\t" kind "\t" reason "\t\t"
        }
    }
    ' "$input_file"
}

# Parse "Added rejected announcement" log lines in batch using awk
# Input: file containing "Added rejected announcement to two-tier index" log lines
# Output: TSV lines: event_id<TAB>identifier<TAB>pubkey_hex
parse_rejected_announcement_batch() {
    local input_file="$1"
    awk "$_PARSE_AWK_LIB"'
    {
        event_id   = value_after("event_id=", "[a-f0-9]+")
        identifier = value_after("identifier=", "[^ ]+")     # repo name
        pubkey     = value_after("pubkey=", "[a-f0-9]+")

        # Output if we have all required fields
        if (event_id != "" && identifier != "" && pubkey != "") {
            print event_id "\t" identifier "\t" pubkey
        }
    }
    ' "$input_file"
}

# Enrich parse failures with repo/npub by looking up event_id in "Added rejected announcement" log entries
# This is critical because "Invalid announcement" rejections only log event_id and kind,
# not the repo name or npub. Without enrichment, Phase 5 shows event_id|kind instead
# of repo|npub in action-required.txt, making the output unusable.
#
# Arguments:
#   $1 - parse failures file to enrich (modified in place)
#   $2 - lookup file containing event_id<TAB>identifier<TAB>pubkey_hex lines
#
# The function:
#   1. Uses the lookup table built from "Added rejected announcement" log entries
#   2. For each parse failure with empty repo/npub, looks up the event_id
#   3. Populates repo and npub columns from the lookup
#   4. Converts hex pubkeys to npub format using `nak encode npub` if available;
#      if nak is missing, fails, or returns an unexpected number of lines,
#      the hex pubkey is used instead
#
# Returns 0 without touching the file when the lookup file is missing or empty.
# Uses log_info/log_warn/log_success from this script.
#
# OPTIMIZATION: batch processing - one awk join instead of a grep per line,
# and a single nak call for all unique pubkeys.
enrich_with_repo_npub() {
    local parse_failures_file="$1"
    local lookup_file="$2"

    # Validate lookup file exists and has content
    if [[ ! -f "$lookup_file" || ! -s "$lookup_file" ]]; then
        log_warn "No enrichment data available - repo/npub columns will remain empty"
        return 0
    fi

    log_info "Enriching parse failures with repo/npub from log entries..."

    # Check if we have nak for pubkey->npub conversion
    local can_convert_npub=false
    if command -v nak &> /dev/null; then
        can_convert_npub=true
        log_info "  Using 'nak' for pubkey->npub conversion"
    else
        log_warn "  'nak' not found - will use hex pubkeys instead of npub"
    fi

    local lookup_count
    lookup_count=$(awk 'END { print NR }' "$lookup_file")
    log_info "  Lookup table has $lookup_count entries"

    # All scratch files live in one directory so cleanup is a single rm.
    local work_dir
    work_dir=$(mktemp -d)
    local unique_pubkeys_file="$work_dir/pubkeys"
    local npubs_file="$work_dir/npubs"
    local npub_map_file="$work_dir/npub-map"
    local enriched_file="$work_dir/enriched"

    # STEP 1: Extract unique pubkeys that need conversion (lookup column 3)
    cut -f3 "$lookup_file" | sort -u > "$unique_pubkeys_file"
    local unique_pubkey_count
    unique_pubkey_count=$(awk 'END { print NR }' "$unique_pubkeys_file")
    log_info "  Converting $unique_pubkey_count unique pubkeys to npub format..."

    # STEP 2: Batch convert all pubkeys to npub in a single nak call.
    # nak runs as a plain command (not inside a process substitution) so its
    # exit status is visible, and the output is accepted only when it has
    # exactly one line per pubkey - otherwise paste would misalign the map.
    local converted=false
    if [[ "$can_convert_npub" == true && "$unique_pubkey_count" -gt 0 ]]; then
        if nak encode npub < "$unique_pubkeys_file" > "$npubs_file" 2>/dev/null \
            && [[ "$(awk 'END { print NR }' "$npubs_file")" -eq "$unique_pubkey_count" ]]; then
            # Mapping file: pubkey_hex<TAB>npub
            paste "$unique_pubkeys_file" "$npubs_file" > "$npub_map_file"
            converted=true
        else
            log_warn "  Batch npub conversion failed, using hex pubkeys"
        fi
    fi
    if [[ "$converted" != true ]]; then
        # No usable nak output: map every pubkey to itself (hex as-is)
        awk '{print $1 "\t" $1}' "$unique_pubkeys_file" > "$npub_map_file"
    fi

    # STEP 3: Join parse failures with the lookup on event_id, then with the
    # npub map on pubkey. Header lines are copied first, data lines follow.
    grep '^#' "$parse_failures_file" > "$enriched_file" 2>/dev/null || true

    # `src=N` operands tell awk which input it is reading; counting FNR==1
    # would mis-number the inputs if one of them were empty.
    awk -F'\t' -v OFS='\t' '
        # npub map: pubkey_hex -> npub
        src == 1 {
            npub_map[$1] = $2
            next
        }
        # lookup: event_id -> identifier, pubkey_hex
        src == 2 {
            lookup_repo[$1] = $2
            lookup_pubkey[$1] = $3
            next
        }
        # parse failures: event_id, kind, reason, repo, npub
        /^#/ { next }  # Skip headers (already copied)
        {
            event_id = $1
            kind = $2
            reason = $3
            repo = $4
            npub = $5

            # If repo/npub empty, try to enrich from lookup
            if (repo == "" && event_id in lookup_repo) {
                repo = lookup_repo[event_id]
            }
            if (npub == "" && event_id in lookup_pubkey) {
                pubkey = lookup_pubkey[event_id]
                if (pubkey in npub_map) {
                    npub = npub_map[pubkey]
                } else {
                    npub = pubkey  # Fallback to hex
                }
            }

            print event_id, kind, reason, repo, npub
        }
    ' src=1 "$npub_map_file" src=2 "$lookup_file" src=3 "$parse_failures_file" >> "$enriched_file"

    # Count entries. awk is used instead of `grep -v | wc -l`: grep exits 1
    # when no line is selected, which aborts callers running under
    # `set -euo pipefail` when the file has only header lines.
    local enriched_count total_count
    total_count=$(awk '!/^#/ { n++ } END { print n + 0 }' "$parse_failures_file")
    # Entries that have non-empty repo AND npub after enrichment
    enriched_count=$(awk -F'\t' '!/^#/ && $4 != "" && $5 != "" { n++ } END { print n + 0 }' "$enriched_file")

    # Replace original with enriched version, then drop the scratch files
    mv "$enriched_file" "$parse_failures_file"
    rm -rf "$work_dir"

    log_info "  Enriched $enriched_count of $total_count parse failures with repo/npub"
    log_success "Enrichment complete"
}

# Parse "Added rejected announcement" log entries to build enrichment lookup table
# Input: log line containing "Added rejected announcement to two-tier index"
# Output: TSV line:
# Parse one "Added rejected announcement to two-tier index" log line into an
# enrichment lookup row.
#
# Arguments: $1 - raw log line
# Outputs:   event_id<TAB>identifier<TAB>pubkey_hex on stdout, or nothing when
#            any of the three fields is missing
# Returns:   0 always
#
# Only the first occurrence of each key is used, so a later repetition of
# "identifier=" (for example inside free-form text) cannot turn a field into a
# multi-line value and break the TSV row.
parse_rejected_announcement_line() {
  local line="$1"
  local event_id="" identifier="" pubkey_hex=""

  # Patterns are kept in variables so the space inside [^ ] survives [[ =~ ]].
  local re_event_id='event_id=([a-f0-9]+)'
  local re_identifier='identifier=([^ ]+)'
  local re_pubkey='pubkey=([a-f0-9]+)'

  if [[ "$line" =~ $re_event_id ]]; then
    event_id="${BASH_REMATCH[1]}"
  fi
  if [[ "$line" =~ $re_identifier ]]; then
    identifier="${BASH_REMATCH[1]}"
  fi
  if [[ "$line" =~ $re_pubkey ]]; then
    pubkey_hex="${BASH_REMATCH[1]}"
  fi

  # Only output if we have all required fields
  if [[ -n "$event_id" && -n "$identifier" && -n "$pubkey_hex" ]]; then
    printf '%s\t%s\t%s\n' "$event_id" "$identifier" "$pubkey_hex"
  fi
}

# Count the lines of a file that are not '#' header lines.
#
# Arguments: $1 - file path
# Outputs:   the count on stdout ("0" for a missing or header-only file)
#
# grep -c exits 1 when the count is zero; that status is swallowed here so a
# file without data rows never aborts the caller under `set -e`.
_count_data_lines() {
  local n
  n=$(grep -vc '^#' -- "$1" 2>/dev/null) || true
  printf '%s\n' "${n:-0}"
}

# Print the comment header of parse-failures.txt.
#
# Arguments: $1 - service name, $2 - since date (may be empty),
#            $3 - until date (may be empty)
_write_parse_failures_header() {
  local service="$1" since_date="$2" until_date="$3"

  echo "# Parse failures and invalid announcements extracted from $service"
  echo "# Time range: ${since_date:-beginning} to ${until_date:-now}"
  echo "# Extracted: $(date -Iseconds)"
  echo "#"
  echo "# Includes:"
  echo "#   - [PARSE_FAIL] structured log entries"
  echo "#   - \"Invalid announcement\" rejections"
  echo "#"
  echo "# Format: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub"
  echo "# Note: repo and npub may be empty for some entries"
}

# Main
#
# Extracts [PARSE_FAIL] entries and "Invalid announcement" write-policy
# rejections for a systemd service from the journal and writes them, enriched
# with repo/npub where available, to <output-dir>/parse-failures.txt.
#
# Arguments: $1 - systemd service name (".service" is appended if missing)
#            $2 - output directory
#            options: --since <date>, --until <date>, --dry-run
# Exits:     0 on success (including "nothing found"), 1 on usage or
#            validation errors
main() {
  if [[ $# -lt 2 ]]; then
    usage
  fi

  local service="$1"
  local output_dir="$2"
  shift 2

  # Default time range: last 30 days (GNU date first, then BSD date).
  local since_date
  since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null \
    || date -v-30d "+%Y-%m-%d" 2>/dev/null \
    || echo "")
  local until_date=""
  local dry_run=false

  # Parse options
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --since | --until)
        # Without this check a trailing "--since" dies on an unbound $2.
        if [[ $# -lt 2 ]]; then
          log_error "Option $1 requires a value"
          usage
        fi
        if [[ "$1" == "--since" ]]; then
          since_date="$2"
        else
          until_date="$2"
        fi
        shift 2
        ;;
      --dry-run)
        dry_run=true
        shift
        ;;
      *)
        log_error "Unknown option: $1"
        usage
        ;;
    esac
  done

  # Validate service name format
  if [[ ! "$service" =~ \.service$ ]]; then
    service="${service}.service"
  fi

  # Validate service is appropriate for structured logging
  # This prevents the common mistake of using ngit-relay instead of ngit-grasp
  if type validate_service_for_structured_logging &>/dev/null; then
    # Use non-interactive mode if not a terminal, skip log check (we'll do our own)
    local interactive="true"
    if [[ ! -t 0 ]]; then
      interactive="false"
    fi

    if ! validate_service_for_structured_logging "$service" "false" "$interactive"; then
      log_error "Service validation failed. Use an ngit-grasp service for structured logging."
      exit 1
    fi
  elif [[ "$service" == *"ngit-relay"* ]]; then
    # Fallback validation if helper not available
    log_error "Service name appears to be ngit-relay: $service"
    log_error "Structured logging ([PARSE_FAIL]) only exists in ngit-grasp services."
    log_error "Please use the ngit-grasp archive service instead."
    log_error ""
    log_error "To find the correct service:"
    log_error "  systemctl list-units 'ngit-grasp*' --all"
    exit 1
  fi

  log_info "Extracting parse failures from systemd logs"
  log_info "Service: $service"
  log_info "Output: $output_dir"
  log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"

  # Check if journalctl is available
  if ! command -v journalctl &>/dev/null; then
    log_error "journalctl not found. This script requires systemd."
    exit 1
  fi

  # Validate service exists (check if journalctl can find any logs for it)
  # Note: We don't require the service to be running, just that it has logs
  if ! journalctl --no-pager -u "$service" -n 1 &>/dev/null; then
    log_warn "Could not query logs for service: $service"
    log_warn "This may indicate the service doesn't exist or you lack permissions."
    log_warn ""
    log_warn "To list available ngit-grasp services:"
    log_warn "  systemctl list-units 'ngit-grasp*' --all"
    log_warn "  journalctl --list-boots  # Check if you have journal access"
    log_warn ""
    # Continue anyway - the service might exist but have no logs yet
  fi

  # Build the journalctl command as an argument vector: the service name and
  # the dates are passed as data and are never re-parsed by the shell.
  local -a journal_cmd=(journalctl -u "$service" --no-pager -o short-iso)
  if [[ -n "$since_date" ]]; then
    journal_cmd+=(--since "$since_date")
  fi
  if [[ -n "$until_date" ]]; then
    journal_cmd+=(--until "$until_date")
  fi

  log_info "Running: ${journal_cmd[*]} | grep '[PARSE_FAIL]' or 'Invalid announcement'"

  local parse_fail_count=0
  local invalid_announcement_count=0

  if [[ "$dry_run" == true ]]; then
    log_info "[DRY RUN] Would extract to: $output_dir/parse-failures.txt"

    # Count both kinds of entries in a single pass over the journal.
    log_info "Checking for matching log entries..."
    read -r parse_fail_count invalid_announcement_count < <(
      "${journal_cmd[@]}" 2>/dev/null | awk '
        /\[PARSE_FAIL\]/ { pf++ }
        /Event rejected by write policy/ && /Invalid announcement/ { ia++ }
        END { print pf + 0, ia + 0 }'
    ) || true
    parse_fail_count="${parse_fail_count:-0}"
    invalid_announcement_count="${invalid_announcement_count:-0}"

    log_info "Found $parse_fail_count [PARSE_FAIL] entries"
    log_info "Found $invalid_announcement_count 'Invalid announcement' rejections"

    if [[ "$parse_fail_count" -eq 0 && "$invalid_announcement_count" -eq 0 ]]; then
      log_warn "No matching entries found in logs."
      log_warn "This is expected if ngit-grasp logging improvements are not yet deployed."
      log_warn "See: docs/how-to/migrate-to-ngit-grasp.md (Dependencies section)"
    fi

    exit 0
  fi

  # Create output directory
  mkdir -p "$output_dir"

  local output_file="$output_dir/parse-failures.txt"

  # All intermediate files live in one directory that is removed on every exit
  # path. The path is expanded now because tmp_dir is local to this function.
  # NOTE(review): this replaces any EXIT trap installed earlier in the script;
  # none is visible in this part of the file - confirm.
  local tmp_dir
  tmp_dir=$(mktemp -d)
  # shellcheck disable=SC2064 # expanding tmp_dir at definition time is intended
  trap "rm -rf -- $(printf '%q' "$tmp_dir")" EXIT

  local temp_stderr="$tmp_dir/journal.stderr"
  local matched_log="$tmp_dir/matched.log"
  local temp_parse_fail="$tmp_dir/parse-fail.log"
  local temp_write_policy_rejection="$tmp_dir/write-policy.log"
  local temp_rejected_announcement="$tmp_dir/rejected-announcement.log"

  # Read the journal once, keeping only lines any later step can use, then
  # split that much smaller file per category.
  log_info "Extracting log entries..."
  "${journal_cmd[@]}" 2>"$temp_stderr" \
    | grep -E '\[PARSE_FAIL\]|Event rejected by write policy|Added rejected announcement to two-tier index' \
      >"$matched_log" || true

  if [[ -s "$temp_stderr" ]]; then
    log_warn "journalctl reported: $(<"$temp_stderr")"
  fi

  log_info "  Searching for [PARSE_FAIL] entries..."
  grep '\[PARSE_FAIL\]' "$matched_log" >"$temp_parse_fail" || true

  # NOTE: We only extract from write policy logs (hex IDs), not builder logs (note1 IDs)
  # to avoid double-counting. Both log sources contain the same events.
  log_info "  Searching for write policy rejections..."
  grep 'Event rejected by write policy' "$matched_log" \
    | grep 'Invalid announcement' >"$temp_write_policy_rejection" || true

  # These contain pubkey and identifier which we use to enrich write policy rejections
  log_info "  Searching for rejected announcement entries (for enrichment)..."
  grep 'Added rejected announcement to two-tier index' "$matched_log" \
    >"$temp_rejected_announcement" || true

  # Check if we found anything
  local parse_fail_line_count write_policy_line_count rejected_announcement_line_count
  parse_fail_line_count=$(wc -l <"$temp_parse_fail")
  parse_fail_line_count="${parse_fail_line_count//[^0-9]/}"
  write_policy_line_count=$(wc -l <"$temp_write_policy_rejection")
  write_policy_line_count="${write_policy_line_count//[^0-9]/}"
  rejected_announcement_line_count=$(wc -l <"$temp_rejected_announcement")
  rejected_announcement_line_count="${rejected_announcement_line_count//[^0-9]/}"

  log_info "  Found $parse_fail_line_count [PARSE_FAIL] log lines"
  log_info "  Found $write_policy_line_count write policy rejection log lines"
  log_info "  Found $rejected_announcement_line_count rejected announcement log lines (for enrichment)"

  if [[ "$parse_fail_line_count" -eq 0 && "$write_policy_line_count" -eq 0 ]]; then
    log_warn "No matching entries found in logs."
    log_warn ""
    log_warn "This is expected if ngit-grasp logging improvements are not yet deployed."
    log_warn "The script looks for:"
    log_warn ""
    log_warn "  1. [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..."
    log_warn "  2. Event rejected by write policy event_id=... kind=30617 reason=Invalid announcement: ..."
    log_warn ""
    log_warn "See: docs/how-to/migrate-to-ngit-grasp.md (Dependencies section)"
    log_warn ""

    # Create empty output file with header comment
    {
      _write_parse_failures_header "$service" "$since_date" "$until_date"
      echo "#"
      echo "# NOTE: No matching entries found."
      echo "# This is expected if ngit-grasp logging improvements are not yet deployed."
    } >"$output_file"

    log_info "Created empty output file: $output_file"
    exit 0
  fi

  # Write header
  _write_parse_failures_header "$service" "$since_date" "$until_date" >"$output_file"

  # Each parser writes to its own file so per-source counts are simply the
  # number of rows it produced.
  local parsed_parse_fail="$tmp_dir/parse-fail.tsv"
  local parsed_write_policy="$tmp_dir/write-policy.tsv"
  : >"$parsed_parse_fail"
  : >"$parsed_write_policy"

  log_info "  Parsing [PARSE_FAIL] entries..."
  if [[ "$parse_fail_line_count" -gt 0 ]]; then
    parse_parse_fail_batch "$temp_parse_fail" >"$parsed_parse_fail"
  fi
  parse_fail_count=$(_count_data_lines "$parsed_parse_fail")

  log_info "  Parsing write policy rejection entries..."
  if [[ "$write_policy_line_count" -gt 0 ]]; then
    parse_write_policy_rejection_batch "$temp_write_policy_rejection" >"$parsed_write_policy"
  fi
  invalid_announcement_count=$(_count_data_lines "$parsed_write_policy")

  cat "$parsed_parse_fail" "$parsed_write_policy" >>"$output_file"

  # Build enrichment lookup table from "Added rejected announcement" entries
  local enrichment_lookup_file="$tmp_dir/enrichment-lookup.tsv"
  : >"$enrichment_lookup_file"

  log_info "  Building enrichment lookup table..."
  if [[ "$rejected_announcement_line_count" -gt 0 ]]; then
    parse_rejected_announcement_batch "$temp_rejected_announcement" >"$enrichment_lookup_file"
  fi

  # Deduplicate by event_id (first column) - keep one row per event_id.
  # `grep -v` exits 1 when no data rows were parsed; that must not abort the
  # script if pipefail is on.
  log_info "  Deduplicating entries..."
  local deduped_file="$tmp_dir/deduped.tsv"
  grep '^#' "$output_file" >"$deduped_file"
  { grep -v '^#' "$output_file" || true; } | sort -t$'\t' -k1,1 -u >>"$deduped_file"
  mv "$deduped_file" "$output_file"

  # Deduplicate enrichment lookup table by event_id
  if [[ -s "$enrichment_lookup_file" ]]; then
    sort -t$'\t' -k1,1 -u -o "$enrichment_lookup_file" "$enrichment_lookup_file"
  fi

  # Enrich with repo/npub from "Added rejected announcement" log entries
  # This is critical for usability - without it, action-required.txt shows
  # event_id|kind instead of repo|npub, making parse failures unidentifiable
  enrich_with_repo_npub "$output_file" "$enrichment_lookup_file"

  # Count final entries (excluding header lines)
  local count
  count=$(_count_data_lines "$output_file")

  # Summary
  echo ""
  log_info "=== Extraction Summary ==="
  log_info "Service: $service"
  log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"
  log_success "Extracted $count total entries"
  log_info "  - [PARSE_FAIL] entries: $parse_fail_count"
  log_info "  - Invalid announcement rejections: $invalid_announcement_count"
  echo ""
  log_info "Output file: $output_file"

  if [[ "$count" -gt 0 ]]; then
    echo ""
    log_info "Sample entries (first 5):"
    # Subshell + `|| true`: `head` closing the pipe early must not fail the script.
    # Row format: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
    (grep -v '^#' "$output_file" | head -5 | while IFS=$'\t' read -r event_id kind reason _; do
      echo "  kind=$kind event_id=${event_id:0:16}... reason=\"${reason:0:60}...\""
    done) || true

    echo ""
    log_info "Breakdown by event kind:"
    # kind is column 2
    (grep -v '^#' "$output_file" | awk -F'\t' '{print $2}' | sort | uniq -c | sort -rn | while read -r cnt kind; do
      echo "  kind $kind: $cnt failures"
    done) || true
  fi

  # Breakdown by reason pattern (for invalid announcements)
  if [[ "$invalid_announcement_count" -gt 0 ]]; then
    echo ""
    log_info "Breakdown by reason pattern:"
    # Extract the main reason type (before the colon details)
    (grep -v '^#' "$output_file" | awk -F'\t' '{print $3}' | sed 's/:.*//' | sort | uniq -c | sort -rn | head -10 | while read -r cnt reason; do
      echo "  $reason: $cnt"
    done) || true
  fi

  # Explicit success exit
  exit 0
}
+# +# USAGE: +# ./31-extract-purgatory-expiry.sh <service-name> <output-dir> [options] +# +# EXAMPLES: +# # Extract from ngit-grasp service (last 30 days, default) +# ./31-extract-purgatory-expiry.sh ngit-grasp.service output/logs +# +# # Extract with custom time range +# ./31-extract-purgatory-expiry.sh ngit-grasp.service output/logs --since "2026-01-01" +# +# # Extract from specific time window +# ./31-extract-purgatory-expiry.sh ngit-grasp.service output/logs --since "2026-01-15" --until "2026-01-22" +# +# OPTIONS: +# --since <date> Start date for log extraction (default: 30 days ago) +# --until <date> End date for log extraction (default: now) +# --dry-run Show what would be extracted without writing files +# +# OUTPUT: +# <output-dir>/purgatory-expired.txt +# +# OUTPUT FORMAT (TSV): +# repo<TAB>npub<TAB>timestamp<TAB>reason +# +# EXPECTED LOG FORMAT: +# The script looks for structured log entries in this format: +# +# 2026-01-22T10:30:45Z ngit-grasp[1234]: [PURGATORY_EXPIRED] repo=myrepo npub=npub1... reason="clone URL unreachable after 7 days" +# +# Required fields: repo, npub +# Optional fields: reason (explains why purgatory expired) +# +# BACKGROUND: +# "Purgatory" is the state where ngit-grasp has received an announcement event +# but cannot yet sync the git data (e.g., clone URL unreachable, git server down). +# After a configurable timeout (default 7 days), the repository is marked as +# expired and removed from purgatory. +# +# Purgatory expiry during migration analysis indicates repositories that: +# - Had valid announcements on the production relay +# - Could not be synced to the archive relay +# - May need manual intervention or investigation +# +# DEPENDENCY: +# This script requires logging improvements in ngit-grasp to emit structured +# [PURGATORY_EXPIRED] log entries. Until those are implemented, this script +# will find no matching entries (which is handled gracefully). 
#
#   See: docs/how-to/migrate-to-ngit-grasp.md (Dependencies section)
#
#   Expected Rust logging code:
#     tracing::warn!(
#         target: "migration",
#         "[PURGATORY_EXPIRED] repo={} npub={} reason=\"{}\"",
#         identifier, npub, reason
#     );
#
# PREREQUISITES:
#   - journalctl (systemd)
#   - grep, awk (standard Unix tools)
#   - Access to systemd journal (may require sudo or journal group membership)
#
# RUNTIME: Depends on log volume, typically < 30 seconds
#
# SEE ALSO:
#   docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
#   30-extract-parse-failures.sh - Companion script for parse failure logs
#

set -euo pipefail

# Get script directory for sourcing helpers
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Source the service validation helper
if [[ -f "$SCRIPT_DIR/validate-service.sh" ]]; then
  # shellcheck source=/dev/null
  source "$SCRIPT_DIR/validate-service.sh"
fi

# Colors for output (disabled if not a terminal)
if [[ -t 1 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BLUE=''
  NC=''
fi

# Logging helpers: write a tagged message to stderr.
# Only the colour variables go through %b; the message itself is printed
# verbatim, so backslashes in logged text (paths, journalctl errors) are not
# interpreted as escape sequences.
log_info() {
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$*" >&2
}

log_success() {
  printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$*" >&2
}

log_warn() {
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*" >&2
}

log_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$*" >&2
}

# Print the command-line help and exit with status 1.
usage() {
  echo "Usage: $0 <service-name> <output-dir> [options]"
  echo ""
  echo "Arguments:"
  echo "  service-name    Systemd service name (e.g., ngit-grasp.service)"
  echo "  output-dir      Directory to store extracted log data"
  echo ""
  echo "Options:"
  echo "  --since <date>  Start date (default: 30 days ago)"
  echo "  --until <date>  End date (default: now)"
  echo "  --dry-run       Show what would be extracted without writing"
  echo ""
  echo "Examples:"
  echo "  $0 ngit-grasp.service output/logs"
  echo "  $0 ngit-grasp.service output/logs --since '2026-01-01'"
  echo "  $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'"
  echo ""
  echo "Expected log format:"
  echo "  [PURGATORY_EXPIRED] repo=myrepo npub=npub1... reason=\"...\""
  exit 1
}

# Parse a single log line and extract fields.
#
# Arguments: $1 - log line containing [PURGATORY_EXPIRED]
# Outputs:   repo<TAB>npub<TAB>timestamp<TAB>reason on stdout, or nothing when
#            repo or npub is missing
# Returns:   0 always
#
# Fields are matched with bash regexes (no external processes) and only in the
# text after the [PURGATORY_EXPIRED] marker, taking the first occurrence of
# each key. A reason such as "see repo=other" therefore cannot inject extra
# values into the repo or npub columns.
parse_log_line() {
  local line="$1"
  local timestamp="" repo="" npub="" reason=""

  # Everything after the marker; the whole line if the marker is absent.
  local payload="${line#*'[PURGATORY_EXPIRED]'}"

  # Patterns are kept in variables so spaces and quotes survive [[ =~ ]].
  local re_timestamp='^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}'
  local re_repo='repo=([^ ]+)'
  local re_npub='npub=([^ ]+)'
  local re_reason='reason="([^"]*)'

  # ISO timestamp at the beginning of the line (journalctl -o short-iso)
  if [[ "$line" =~ $re_timestamp ]]; then
    timestamp="${BASH_REMATCH[0]}"
  fi
  if [[ "$payload" =~ $re_repo ]]; then
    repo="${BASH_REMATCH[1]}"
  fi
  if [[ "$payload" =~ $re_npub ]]; then
    npub="${BASH_REMATCH[1]}"
  fi
  # reason is optional
  if [[ "$payload" =~ $re_reason ]]; then
    reason="${BASH_REMATCH[1]}"
  fi

  # Only output if we have the required fields
  if [[ -n "$repo" && -n "$npub" ]]; then
    printf '%s\t%s\t%s\t%s\n' "$repo" "$npub" "$timestamp" "$reason"
  fi
}

# Print the comment header of purgatory-expired.txt.
#
# Arguments: $1 - service name, $2 - since date (may be empty),
#            $3 - until date (may be empty)
_write_purgatory_header() {
  local service="$1" since_date="$2" until_date="$3"

  echo "# Purgatory expiry events extracted from $service"
  echo "# Time range: ${since_date:-beginning} to ${until_date:-now}"
  echo "# Extracted: $(date -Iseconds)"
  echo "# Format: repo<TAB>npub<TAB>timestamp<TAB>reason"
}

# Main
#
# Extracts [PURGATORY_EXPIRED] entries for a systemd service from the journal
# and writes them as TSV to <output-dir>/purgatory-expired.txt.
#
# Arguments: $1 - systemd service name (".service" is appended if missing)
#            $2 - output directory
#            options: --since <date>, --until <date>, --dry-run
# Exits:     0 on success (including "nothing found"), 1 on usage or
#            validation errors
main() {
  if [[ $# -lt 2 ]]; then
    usage
  fi

  local service="$1"
  local output_dir="$2"
  shift 2

  # Default time range: last 30 days (GNU date first, then BSD date).
  local since_date
  since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null \
    || date -v-30d "+%Y-%m-%d" 2>/dev/null \
    || echo "")
  local until_date=""
  local dry_run=false

  # Parse options
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --since | --until)
        # Without this check a trailing "--since" dies on an unbound $2.
        if [[ $# -lt 2 ]]; then
          log_error "Option $1 requires a value"
          usage
        fi
        if [[ "$1" == "--since" ]]; then
          since_date="$2"
        else
          until_date="$2"
        fi
        shift 2
        ;;
      --dry-run)
        dry_run=true
        shift
        ;;
      *)
        log_error "Unknown option: $1"
        usage
        ;;
    esac
  done

  # Validate service name format
  if [[ ! "$service" =~ \.service$ ]]; then
    service="${service}.service"
  fi

  # Validate service is appropriate for structured logging
  # This prevents the common mistake of using ngit-relay instead of ngit-grasp
  if type validate_service_for_structured_logging &>/dev/null; then
    # Use non-interactive mode if not a terminal, skip log check (we'll do our own)
    local interactive="true"
    if [[ ! -t 0 ]]; then
      interactive="false"
    fi

    if ! validate_service_for_structured_logging "$service" "false" "$interactive"; then
      log_error "Service validation failed. Use an ngit-grasp service for structured logging."
      exit 1
    fi
  elif [[ "$service" == *"ngit-relay"* ]]; then
    # Fallback validation if helper not available
    log_error "Service name appears to be ngit-relay: $service"
    log_error "Structured logging ([PURGATORY_EXPIRED]) only exists in ngit-grasp services."
    log_error "Please use the ngit-grasp archive service instead."
    log_error ""
    log_error "To find the correct service:"
    log_error "  systemctl list-units 'ngit-grasp*' --all"
    exit 1
  fi

  log_info "Extracting purgatory expiry events from systemd logs"
  log_info "Service: $service"
  log_info "Output: $output_dir"
  log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"

  # Check if journalctl is available
  if ! command -v journalctl &>/dev/null; then
    log_error "journalctl not found. This script requires systemd."
    exit 1
  fi

  # Validate service exists (check if journalctl can find any logs for it)
  # Note: We don't require the service to be running, just that it has logs
  if ! journalctl --no-pager -u "$service" -n 1 &>/dev/null; then
    log_warn "Could not query logs for service: $service"
    log_warn "This may indicate the service doesn't exist or you lack permissions."
    log_warn ""
    log_warn "To list available ngit-grasp services:"
    log_warn "  systemctl list-units 'ngit-grasp*' --all"
    log_warn "  journalctl --list-boots  # Check if you have journal access"
    log_warn ""
    # Continue anyway - the service might exist but have no logs yet
  fi

  # Build the journalctl command as an argument vector: the service name and
  # the dates are passed as data and are never re-parsed by the shell.
  local -a journal_cmd=(journalctl -u "$service" --no-pager -o short-iso)
  if [[ -n "$since_date" ]]; then
    journal_cmd+=(--since "$since_date")
  fi
  if [[ -n "$until_date" ]]; then
    journal_cmd+=(--until "$until_date")
  fi

  log_info "Running: ${journal_cmd[*]} | grep '\\[PURGATORY_EXPIRED\\]'"

  if [[ "$dry_run" == true ]]; then
    log_info "[DRY RUN] Would extract to: $output_dir/purgatory-expired.txt"

    # Show sample of what would be extracted
    log_info "Checking for matching log entries..."
    local sample_count
    # grep -c already prints 0 when nothing matches; only its exit status
    # needs neutralising.
    sample_count=$("${journal_cmd[@]}" 2>/dev/null | grep -c '\[PURGATORY_EXPIRED\]' || true)
    sample_count="${sample_count//[^0-9]/}"
    sample_count="${sample_count:-0}"
    log_info "Found $sample_count matching log entries"

    if [[ "$sample_count" -eq 0 ]]; then
      log_warn "No [PURGATORY_EXPIRED] entries found in logs."
      log_warn "This is expected if ngit-grasp logging improvements are not yet deployed."
      log_warn "See: docs/how-to/migrate-to-ngit-grasp.md (Dependencies section)"
    fi

    exit 0
  fi

  # Create output directory
  mkdir -p "$output_dir"

  local output_file="$output_dir/purgatory-expired.txt"

  # Intermediate files live in one directory that is removed on every exit
  # path. The path is expanded now because tmp_dir is local to this function.
  local tmp_dir
  tmp_dir=$(mktemp -d)
  # shellcheck disable=SC2064 # expanding tmp_dir at definition time is intended
  trap "rm -rf -- $(printf '%q' "$tmp_dir")" EXIT

  local raw_file="$tmp_dir/raw.log"
  local temp_stderr="$tmp_dir/journal.stderr"

  # Stream matching lines to a file; stderr is captured separately so
  # journalctl problems can be reported without failing the run.
  log_info "Extracting log entries..."
  "${journal_cmd[@]}" 2>"$temp_stderr" | grep '\[PURGATORY_EXPIRED\]' >"$raw_file" || true

  # Report any journalctl errors (but don't fail - empty logs are valid)
  if [[ -s "$temp_stderr" ]]; then
    log_warn "journalctl reported: $(<"$temp_stderr")"
  fi

  if [[ ! -s "$raw_file" ]]; then
    log_warn "No [PURGATORY_EXPIRED] entries found in logs."
    log_warn ""
    log_warn "This is expected if ngit-grasp logging improvements are not yet deployed."
    log_warn "The structured log format required by this script:"
    log_warn ""
    log_warn "  [PURGATORY_EXPIRED] repo=myrepo npub=npub1... reason=\"...\""
    log_warn ""
    log_warn "See: docs/how-to/migrate-to-ngit-grasp.md (Dependencies section)"
    log_warn ""

    # Create empty output file with header comment
    {
      _write_purgatory_header "$service" "$since_date" "$until_date"
      echo "#"
      echo "# NOTE: No [PURGATORY_EXPIRED] entries found."
      echo "# This is expected if ngit-grasp logging improvements are not yet deployed."
    } >"$output_file"

    log_info "Created empty output file: $output_file"
    exit 0
  fi

  # Write header
  _write_purgatory_header "$service" "$since_date" "$until_date" >"$output_file"

  # Parse each line; rows are appended directly, without a subshell per line.
  local line
  while IFS= read -r line || [[ -n "$line" ]]; do
    parse_log_line "$line"
  done <"$raw_file" >>"$output_file"

  # Data rows are all lines that are not '#' header lines.
  local count
  count=$(grep -vc '^#' "$output_file" || true)
  count="${count:-0}"

  # Summary
  echo ""
  log_info "=== Extraction Summary ==="
  log_info "Service: $service"
  log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"
  log_success "Extracted $count purgatory expiry entries"
  echo ""
  log_info "Output file: $output_file"

  if [[ "$count" -gt 0 ]]; then
    echo ""
    log_info "Sample entries (first 5):"
    # Subshell + `|| true`: `head` closing the pipe early must not fail the script.
    (grep -v '^#' "$output_file" | head -5 | while IFS=$'\t' read -r repo npub timestamp _; do
      echo "  repo=$repo npub=${npub:0:20}... timestamp=$timestamp"
    done) || true

    # Show unique repos affected
    echo ""
    local unique_repos
    unique_repos=$(awk -F'\t' '!/^#/ && !seen[$1]++ { n++ } END { print n + 0 }' "$output_file")
    log_info "Unique repositories affected: $unique_repos"

    echo ""
    log_info "Repositories with purgatory expiry:"
    (grep -v '^#' "$output_file" | awk -F'\t' '{print $1}' | sort | uniq -c | sort -rn | head -10 | while read -r cnt repo; do
      echo "  $repo: $cnt expiry events"
    done) || true

    if [[ "$unique_repos" -gt 10 ]]; then
      echo "  ... and $((unique_repos - 10)) more repositories"
    fi
  fi

  # Explicit success exit
  exit 0
}
+# If archive has different/newer data than prod, it means: +# - A state event authorized those commits at some point +# - Archive is actually MORE up-to-date than prod +# - Migration should use archive data (it's already correct) +# +# Usage: ./40-classify-actions.sh +# +# Output format: repo | npub | prod_status | archive_status | context | action +# + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log_info() { echo -e "${BLUE}[INFO]${NC} $*"; } +log_success() { echo -e "${GREEN}[OK]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } + +# Check arguments +if [[ $# -lt 1 ]]; then + echo "Usage: $0 " + echo "Example: $0 work/migration-analysis-20260123-200701" + exit 1 +fi + +ANALYSIS_DIR="$1" + +# Validate analysis directory +if [[ ! -d "$ANALYSIS_DIR" ]]; then + log_error "Analysis directory not found: $ANALYSIS_DIR" + exit 1 +fi + +# Define paths +PROD_DIR="$ANALYSIS_DIR/prod" +ARCHIVE_DIR="$ANALYSIS_DIR/archive" +COMPARISON_DIR="$ANALYSIS_DIR/comparison" +LOGS_DIR="$ANALYSIS_DIR/logs" +RESULTS_DIR="$ANALYSIS_DIR/results" + +# Validate required directories +for dir in "$PROD_DIR" "$ARCHIVE_DIR" "$COMPARISON_DIR" "$LOGS_DIR"; do + if [[ ! 
-d "$dir" ]]; then + log_error "Required directory not found: $dir" + exit 1 + fi +done + +# Create results directory +mkdir -p "$RESULTS_DIR" + +# Output files +READY_FILE="$RESULTS_DIR/ready-for-migration.txt" +RESYNC_FILE="$RESULTS_DIR/needs-resync.txt" +REVIEW_FILE="$RESULTS_DIR/manual-review.txt" +SUMMARY_FILE="$RESULTS_DIR/summary.txt" + +# Temporary files for processing +TMP_DIR=$(mktemp -d) +trap 'rm -rf "$TMP_DIR"' EXIT + +log_info "Starting classification with revised system (Option B)" +log_info "Analysis directory: $ANALYSIS_DIR" + +# ============================================================================ +# Phase 1: Build lookup tables from source data +# ============================================================================ + +log_info "Building lookup tables..." + +# Build prod category lookup: repo|npub -> category +declare -A PROD_CAT +while IFS='|' read -r repo npub rest || [[ -n "$repo" ]]; do + repo="${repo// /}" # Remove all spaces + npub="${npub// /}" # Remove all spaces + [[ -z "$repo" || -z "$npub" ]] && continue + PROD_CAT["$repo|$npub"]="cat1" +done < "$PROD_DIR/category1-complete-match.txt" + +while IFS='|' read -r repo npub rest || [[ -n "$repo" ]]; do + repo="${repo// /}" + npub="${npub// /}" + [[ -z "$repo" || -z "$npub" ]] && continue + PROD_CAT["$repo|$npub"]="cat2" +done < "$PROD_DIR/category2-empty-blank.txt" + +while IFS='|' read -r repo npub rest || [[ -n "$repo" ]]; do + repo="${repo// /}" + npub="${npub// /}" + [[ -z "$repo" || -z "$npub" ]] && continue + PROD_CAT["$repo|$npub"]="cat3" +done < "$PROD_DIR/category3-partial-match.txt" + +while IFS='|' read -r repo npub rest || [[ -n "$repo" ]]; do + repo="${repo// /}" + npub="${npub// /}" + [[ -z "$repo" || -z "$npub" ]] && continue + PROD_CAT["$repo|$npub"]="cat4" +done < "$PROD_DIR/category4-no-match.txt" + +log_info "Loaded ${#PROD_CAT[@]} prod entries" + +# Build archive category lookup: repo|npub -> category +declare -A ARCHIVE_CAT +while IFS='|' read -r repo npub 
rest; do + repo="${repo// /}" + npub="${npub// /}" + [[ -z "$repo" || -z "$npub" ]] && continue + ARCHIVE_CAT["$repo|$npub"]="cat1" +done < "$ARCHIVE_DIR/category1-complete-match.txt" + +while IFS='|' read -r repo npub rest; do + repo="${repo// /}" + npub="${npub// /}" + [[ -z "$repo" || -z "$npub" ]] && continue + ARCHIVE_CAT["$repo|$npub"]="cat2" +done < "$ARCHIVE_DIR/category2-empty-blank.txt" + +while IFS='|' read -r repo npub rest; do + repo="${repo// /}" + npub="${npub// /}" + [[ -z "$repo" || -z "$npub" ]] && continue + ARCHIVE_CAT["$repo|$npub"]="cat3" +done < "$ARCHIVE_DIR/category3-partial-match.txt" + +while IFS='|' read -r repo npub rest; do + repo="${repo// /}" + npub="${npub// /}" + [[ -z "$repo" || -z "$npub" ]] && continue + ARCHIVE_CAT["$repo|$npub"]="cat4" +done < "$ARCHIVE_DIR/category4-no-match.txt" + +log_info "Loaded ${#ARCHIVE_CAT[@]} archive entries" + +# Build purgatory lookup: repo|npub -> 1 (if purgatory expired) +declare -A PURGATORY +PURGATORY_COUNT=0 +if [[ -f "$LOGS_DIR/purgatory-expired.txt" ]]; then + while IFS=$'\t' read -r repo npub timestamp reason || [[ -n "$repo" ]]; do + # Skip comments and empty lines + [[ "$repo" =~ ^# ]] && continue + [[ -z "$repo" || -z "$npub" ]] && continue + PURGATORY["$repo|$npub"]=1 + PURGATORY_COUNT=$((PURGATORY_COUNT + 1)) + done < "$LOGS_DIR/purgatory-expired.txt" +fi +log_info "Loaded $PURGATORY_COUNT purgatory entries" + +# Build parse failure lookup: repo|npub -> 1 (if parse failure logged) +# Parse failures file format: event_idkindreasonreponpub +declare -A PARSE_FAIL +PARSE_FAIL_COUNT=0 +if [[ -f "$LOGS_DIR/parse-failures.txt" ]]; then + while IFS=$'\t' read -r event_id kind reason repo npub || [[ -n "$event_id" ]]; do + # Skip comments and empty lines + [[ "$event_id" =~ ^# ]] && continue + [[ -z "$repo" || -z "$npub" ]] && continue + PARSE_FAIL["$repo|$npub"]=1 + PARSE_FAIL_COUNT=$((PARSE_FAIL_COUNT + 1)) + done < "$LOGS_DIR/parse-failures.txt" +fi +log_info "Loaded $PARSE_FAIL_COUNT parse 
failure entries" + +# Build deletion lookup: repo|npub -> 1 (if kind 5 deletion event) +# Deletions are in NDJSON format with "a" tags like "30617:pubkey_hex:repo" +# We need to convert hex pubkeys to npub format using nak +declare -A DELETED + +# Helper function to process deletion file (NDJSON format) +# Extracts unique pubkey_hex:repo pairs and converts to npub +process_deletions() { + local file="$1" + [[ ! -f "$file" ]] && return + + # Extract unique pubkey_hex|repo pairs from NDJSON + # Each line is a JSON object, extract "a" tags + local pairs + pairs=$(jq -r '.tags[] | select(.[0] == "a") | .[1]' "$file" 2>/dev/null | \ + sed 's/^30617://' | awk -F: '{print $1 "|" $2}' | sort -u) + + # Get unique hex pubkeys for batch conversion + local hex_keys + hex_keys=$(echo "$pairs" | cut -d'|' -f1 | sort -u) + + # Build hex->npub lookup via batch nak call + declare -A HEX_TO_NPUB + while read -r hex; do + [[ -z "$hex" ]] && continue + local npub + npub=$(nak encode npub "$hex" 2>/dev/null || echo "") + [[ -n "$npub" ]] && HEX_TO_NPUB["$hex"]="$npub" + done <<< "$hex_keys" + + # Now process pairs with cached npub values + while IFS='|' read -r pubkey_hex repo; do + [[ -z "$repo" || -z "$pubkey_hex" ]] && continue + local npub="${HEX_TO_NPUB[$pubkey_hex]:-}" + [[ -z "$npub" ]] && continue + DELETED["$repo|$npub"]=1 + done <<< "$pairs" +} + +# Process prod and archive deletions +process_deletions "$PROD_DIR/raw/deletions.json" +process_deletions "$ARCHIVE_DIR/raw/deletions.json" +DELETED_COUNT=0 +[[ ${#DELETED[@]} -gt 0 ]] && DELETED_COUNT=${#DELETED[@]} +log_info "Loaded $DELETED_COUNT deletion entries" + +# Build git ancestry lookup: repo|npub -> relationship (archive-ahead, prod-ahead, diverged, etc.) 
+# This data comes from 22-compare-git-data.sh which compares actual git commits +declare -A GIT_ANCESTRY +GIT_ANCESTRY_COUNT=0 +if [[ -f "$COMPARISON_DIR/git-ancestry.tsv" ]]; then + while IFS=$'\t' read -r repo npub relationship details || [[ -n "$repo" ]]; do + # Skip header and comments + [[ "$repo" == "repo" ]] && continue + [[ "$repo" =~ ^# ]] && continue + [[ -z "$repo" || -z "$npub" ]] && continue + GIT_ANCESTRY["$repo|$npub"]="$relationship" + GIT_ANCESTRY_COUNT=$((GIT_ANCESTRY_COUNT + 1)) + done < "$COMPARISON_DIR/git-ancestry.tsv" + log_info "Loaded $GIT_ANCESTRY_COUNT git ancestry entries" +else + log_warn "No git-ancestry.tsv found - will not check if archive is ahead of prod" + log_warn "Run 22-compare-git-data.sh to enable archive-ahead detection" +fi + +# ============================================================================ +# Phase 2: Build unique repo list from all sources +# ============================================================================ + +log_info "Building unique repo list..." + +declare -A ALL_REPOS +for key in "${!PROD_CAT[@]}"; do + ALL_REPOS["$key"]=1 +done +for key in "${!ARCHIVE_CAT[@]}"; do + ALL_REPOS["$key"]=1 +done +for key in "${!PURGATORY[@]}"; do + ALL_REPOS["$key"]=1 +done + +log_info "Total unique repos: ${#ALL_REPOS[@]}" + +# ============================================================================ +# Phase 3: Classify each repo according to revised decision tree +# ============================================================================ + +log_info "Classifying repos..." 

# Summary tallies: one counter per classification outcome, all starting at 0.
declare -A COUNTS=(
    [ready_complete_both]=0
    [ready_deleted]=0
    [ready_empty_prod]=0
    [ready_archive_only]=0
    [ready_not_in_prod]=0
    [ready_archive_ahead]=0
    [resync_missing_archive]=0
    [resync_incomplete_archive]=0
    [review_partial_prod]=0
    [review_nomatch_prod]=0
    [review_parse_failure]=0
    [review_conflicting]=0
    [review_diverged]=0
)

# Result rows, one array per output file
declare -a READY_LINES RESYNC_LINES REVIEW_LINES

#######################################
# Describe the extra context known about one repo.
# Globals:   PURGATORY, PARSE_FAIL (read)
# Arguments: $1 - lookup key "repo|npub"
#            $2 - prod status (human-readable)
#            $3 - archive status (human-readable)
# Outputs:   the applicable notes joined by ", ", or "none" when there are
#            no notes
#######################################
get_context() {
    local key="$1"
    local prod_status="$2"
    local archive_status="$3"
    local -a notes=()

    if [[ -n "${PURGATORY[$key]:-}" ]]; then
        notes+=("purgatory-expired")
    fi

    if [[ -n "${PARSE_FAIL[$key]:-}" ]]; then
        notes+=("parse-failure")
    fi

    # Prod is empty while the archive holds something: worth flagging.
    if [[ "$prod_status" == "empty" && "$archive_status" != "missing" && "$archive_status" != "empty" ]]; then
        notes+=("archive-has-data")
    fi

    if [[ ${#notes[@]} -eq 0 ]]; then
        echo "none"
        return
    fi

    local joined
    printf -v joined '%s, ' "${notes[@]}"
    echo "${joined%, }"
}

#######################################
# Translate a category code into its human-readable status.
# Arguments: $1 - category code (cat1..cat4, missing, or anything else)
# Outputs:   the status word; unrecognised codes are echoed unchanged
#######################################
cat_to_status() {
    local status="$1"
    case "$status" in
        cat1) status="complete" ;;
        cat2) status="empty" ;;
        cat3) status="partial" ;;
        cat4) status="no-match" ;;
    esac
    echo "$status"
}

# Walk every known repo, pick its tier / counter / reason, then record it once.
LOOP_COUNT=0
for key in "${!ALL_REPOS[@]}"; do
    LOOP_COUNT=$((LOOP_COUNT + 1))
    if (( LOOP_COUNT % 100 == 0 )); then
        log_info "Processed $LOOP_COUNT repos..."
    fi
    IFS='|' read -r repo npub <<< "$key"

    prod_cat="${PROD_CAT[$key]:-missing}"
    archive_cat="${ARCHIVE_CAT[$key]:-missing}"
    prod_status=$(cat_to_status "$prod_cat")
    archive_status=$(cat_to_status "$archive_cat")
    context=$(get_context "$key" "$prod_status" "$archive_status")

    tier=""      # ready | resync | review; empty means "nothing to record"
    bucket=""    # COUNTS key to increment
    reason=""    # last column of the output row

    if [[ -n "${DELETED[$key]:-}" ]]; then
        # 1. A kind 5 deletion event wins over everything else.
        tier="ready"
        bucket="ready_deleted"
        reason="deleted by user"

    elif [[ "$prod_cat" == "missing" ]]; then
        # 2a. Not in prod
        if [[ "$archive_cat" != "missing" ]]; then
            tier="ready"
            bucket="ready_archive_only"
            reason="archive-only (not in prod)"
        elif [[ -n "${PURGATORY[$key]:-}" ]]; then
            # Known only from purgatory logs; the context is fixed here.
            context="purgatory-expired"
            tier="ready"
            bucket="ready_not_in_prod"
            reason="purgatory-only (not in prod)"
        fi
        # Anything else has no data anywhere and is not recorded.

    elif [[ "$prod_cat" == "cat2" ]]; then
        # 2b. Empty in prod -> never needs action
        tier="ready"
        bucket="ready_empty_prod"
        reason="empty in prod (user never pushed)"

    elif [[ "$prod_cat" == "cat1" ]]; then
        # 2c. Complete in prod
        if [[ "$archive_cat" == "cat1" ]]; then
            tier="ready"
            bucket="ready_complete_both"
            reason="complete in both"
        elif [[ -n "${PARSE_FAIL[$key]:-}" ]]; then
            # Archive missing/incomplete and a parse failure was logged
            tier="review"
            bucket="review_parse_failure"
            reason="complete in prod with parse failure"
        else
            # Let the git comparison decide between ready / review / resync.
            git_relationship="${GIT_ANCESTRY[$key]:-unknown}"
            git_note="git=$git_relationship"

            case "$git_relationship" in
                archive-ahead|in-sync)
                    # Archive has newer/same git data, authorized by a state event
                    tier="ready"
                    bucket="ready_archive_ahead"
                    reason="archive ahead (use archive data)"
                    ;;
                diverged)
                    tier="review"
                    bucket="review_diverged"
                    reason="git histories diverged (manual review)"
                    ;;
                *)
                    # prod-ahead, archive-only, prod-only, both-empty, or unknown
                    tier="resync"
                    if [[ "$git_relationship" == "unknown" ]]; then
                        git_note=""
                    fi
                    if [[ "$archive_cat" == "missing" ]]; then
                        bucket="resync_missing_archive"
                        reason="trigger re-sync to archive"
                    else
                        bucket="resync_incomplete_archive"
                        reason="trigger re-sync (archive incomplete)"
                    fi
                    ;;
            esac

            # Append the git note to whatever context already exists.
            if [[ -n "$git_note" ]]; then
                if [[ -n "$context" && "$context" != "none" ]]; then
                    context="$context, $git_note"
                else
                    context="$git_note"
                fi
            fi
        fi

    elif [[ "$prod_cat" == "cat3" ]]; then
        # 2d. Partial in prod -> always manual investigation
        tier="review"
        bucket="review_partial_prod"
        reason="partial in prod (investigate git data)"

    elif [[ "$prod_cat" == "cat4" ]]; then
        # 2e. No-match in prod -> always manual investigation
        tier="review"
        bucket="review_nomatch_prod"
        reason="no-match in prod (git corruption)"
    fi

    entry="$repo | $npub | $prod_status | $archive_status | $context | $reason"
    case "$tier" in
        ready)  READY_LINES+=("$entry") ;;
        resync) RESYNC_LINES+=("$entry") ;;
        review) REVIEW_LINES+=("$entry") ;;
        *)      continue ;;
    esac
    COUNTS[$bucket]=$(( ${COUNTS[$bucket]} + 1 ))
done

# ============================================================================
# Phase 4: Write output files
# ============================================================================

log_info "Writing output files..."

# One UTC timestamp shared by every generated file.
TIMESTAMP="$(date -u '+%Y-%m-%dT%H:%M:%S+00:00')"

# ready-for-migration.txt: comment header followed by one row per repo
{
    printf '%s\n' \
        "# Ready for Migration - No action required" \
        "# Generated: $TIMESTAMP" \
        "# Format: repo | npub | prod_status | archive_status | context | reason" \
        "#"
    for entry in "${READY_LINES[@]}"; do
        printf '%s\n' "$entry"
    done
} > "$READY_FILE"

# needs-resync.txt: same layout, plus a legend for the context column
{
    printf '%s\n' \
        "# Needs Re-sync - Action required" \
        "# Generated: $TIMESTAMP" \
        "# Format: repo | npub | prod_status | archive_status | context | action" \
        "#" \
        "# Context meanings:" \
        "# purgatory-expired = archive tried to sync but failed (30min timeout)" \
        "# none = archive never tried or announcement missing" \
        "#"
    for entry in "${RESYNC_LINES[@]}"; do
        printf '%s\n' "$entry"
    done
} > "$RESYNC_FILE"

# manual-review.txt: comment header followed by one row per repo
{
    printf '%s\n' \
        "# Manual Review Required - Investigation needed" \
        "# Generated: $TIMESTAMP" \
        "# Format: repo | npub | prod_status | archive_status | context | reason" \
        "#"
    for entry in "${REVIEW_LINES[@]}"; do
        printf '%s\n' "$entry"
    done
} > "$REVIEW_FILE"

# ============================================================================
# Phase 5: Generate summary
# ============================================================================

log_info "Generating summary..."

#######################################
# Print part/whole as a percentage with one decimal place.
# Arguments: $1 - numerator, $2 - denominator
# Outputs:   e.g. "33.3"; "0.0" when the denominator is not positive
#######################################
_pct() {
    local part="$1" whole="$2"
    if [[ "$whole" -gt 0 ]]; then
        # Values go in through -v instead of being pasted into the awk program.
        awk -v part="$part" -v whole="$whole" 'BEGIN {printf "%.1f", (part / whole) * 100}'
    else
        printf '0.0'
    fi
}

#######################################
# Count the lines of a file that do not start with '#'.
# Arguments: $1 - file path
# Outputs:   the count; "0" when nothing matches or the file is unreadable
#######################################
_count_data_lines() {
    local file="$1" count
    # grep -c prints "0" AND exits 1 when no line matches, so a plain
    # "|| echo 0" fallback would print a second zero. Capture the output
    # first and fall back only when grep printed nothing at all.
    count=$(grep -c -v '^#' -- "$file" 2>/dev/null) || true
    printf '%s\n' "${count:-0}"
}

TOTAL_READY="${#READY_LINES[@]}"
TOTAL_RESYNC="${#RESYNC_LINES[@]}"
TOTAL_REVIEW="${#REVIEW_LINES[@]}"
TOTAL=$((TOTAL_READY + TOTAL_RESYNC + TOTAL_REVIEW))

# Calculate percentages (all "0.0" when nothing was classified)
PCT_READY=$(_pct "$TOTAL_READY" "$TOTAL")
PCT_RESYNC=$(_pct "$TOTAL_RESYNC" "$TOTAL")
PCT_REVIEW=$(_pct "$TOTAL_REVIEW" "$TOTAL")

# Markdown summary written to $SUMMARY_FILE
{
    echo "# Migration Classification Summary"
    echo "Generated: $TIMESTAMP"
    echo "Analysis Directory: $ANALYSIS_DIR"
    echo ""
    echo "## Overview"
    echo ""
    echo "| Category | Count | Percentage |"
    echo "|----------|-------|------------|"
    echo "| Ready for Migration | $TOTAL_READY | $PCT_READY% |"
    echo "| Needs Re-sync | $TOTAL_RESYNC | $PCT_RESYNC% |"
    echo "| Manual Review | $TOTAL_REVIEW | $PCT_REVIEW% |"
    echo "| **Total** | **$TOTAL** | **100%** |"
    echo ""
    echo "## Tier 1: Ready for Migration ($TOTAL_READY repos)"
    echo ""
    echo "These repositories are ready for migration or don't need migration:"
    echo ""
    echo "| Reason | Count |"
    echo "|--------|-------|"
    echo "| complete in both prod and archive | ${COUNTS[ready_complete_both]} |"
    echo "| archive ahead (has newer git data) | ${COUNTS[ready_archive_ahead]} |"
    echo "| deleted by user | ${COUNTS[ready_deleted]} |"
    echo "| empty in prod (user never pushed) | ${COUNTS[ready_empty_prod]} |"
    echo "| archive-only (not in prod) | ${COUNTS[ready_archive_only]} |"
    echo "| purgatory-only (not in prod) | ${COUNTS[ready_not_in_prod]} |"
    echo ""
    echo "## Tier 2: Needs Re-sync ($TOTAL_RESYNC repos)"
    echo ""
    echo "These repositories need re-sync to archive before migration:"
    echo ""
    echo "| Reason | Count | Action |"
    echo "|--------|-------|--------|"
    echo "| complete in prod, missing from archive | ${COUNTS[resync_missing_archive]} | trigger re-sync |"
    echo "| complete in prod, incomplete in archive | ${COUNTS[resync_incomplete_archive]} | trigger re-sync |"
    echo ""
    echo "### Purgatory Context"
    echo ""
    echo "Repos in needs-resync.txt include purgatory context:"
    echo "- **purgatory-expired**: Archive tried to sync but failed (30min timeout)"
    echo "- **none**: Archive never tried or announcement missing"
    echo ""
    echo "## Tier 3: Manual Review ($TOTAL_REVIEW repos)"
    echo ""
    echo "These repositories require human investigation:"
    echo ""
    echo "| Reason | Count |"
    echo "|--------|-------|"
    echo "| partial in prod (cat3) | ${COUNTS[review_partial_prod]} |"
    echo "| no-match in prod (cat4) | ${COUNTS[review_nomatch_prod]} |"
    echo "| complete in prod with parse failure | ${COUNTS[review_parse_failure]} |"
    echo "| git histories diverged | ${COUNTS[review_diverged]} |"
    echo ""
    echo "## Input Data Summary"
    echo ""
    echo "### Prod Categories"
    echo "- Category 1 (complete): $(wc -l < "$PROD_DIR/category1-complete-match.txt")"
    echo "- Category 2 (empty): $(wc -l < "$PROD_DIR/category2-empty-blank.txt")"
    echo "- Category 3 (partial): $(wc -l < "$PROD_DIR/category3-partial-match.txt")"
    echo "- Category 4 (no match): $(wc -l < "$PROD_DIR/category4-no-match.txt")"
    echo ""
    echo "### Archive Categories"
    echo "- Category 1 (complete): $(wc -l < "$ARCHIVE_DIR/category1-complete-match.txt")"
    echo "- Category 2 (empty): $(wc -l < "$ARCHIVE_DIR/category2-empty-blank.txt")"
    echo "- Category 3 (partial): $(wc -l < "$ARCHIVE_DIR/category3-partial-match.txt")"
    echo "- Category 4 (no match): $(wc -l < "$ARCHIVE_DIR/category4-no-match.txt")"
    echo ""
    echo "### Logs"
    echo "- Parse failures: $(_count_data_lines "$LOGS_DIR/parse-failures.txt")"
    echo "- Purgatory expired: $(_count_data_lines "$LOGS_DIR/purgatory-expired.txt")"
    echo ""
    echo "## Output Files"
    echo ""
    echo "- \`results/ready-for-migration.txt\` - $TOTAL_READY repos ready for migration"
    echo "- \`results/needs-resync.txt\` - $TOTAL_RESYNC repos needing re-sync"
    echo "- \`results/manual-review.txt\` - $TOTAL_REVIEW repos needing investigation"
    echo "- \`results/summary.txt\` - This summary file"
    echo ""
    echo "## Recommended Next Steps"
    echo ""
    echo "1. **Review needs-resync.txt** - Trigger re-sync for these repos"
    echo "2. **Review manual-review.txt** - Investigate unusual states"
    echo "3. **Verify ready-for-migration.txt** - Spot-check a few repos"
    echo "4. **Plan migration window** - Schedule cutover when action items resolved"
} > "$SUMMARY_FILE"

# ============================================================================
# Phase 6: Print summary to console
# ============================================================================

echo ""
log_success "Classification complete!"
echo ""
echo "=== Summary ==="
echo "Ready for Migration: $TOTAL_READY ($PCT_READY%)"
echo " - Complete in both: ${COUNTS[ready_complete_both]}"
echo " - Archive ahead: ${COUNTS[ready_archive_ahead]}"
echo " - Deleted by user: ${COUNTS[ready_deleted]}"
echo " - Empty in prod: ${COUNTS[ready_empty_prod]}"
echo " - Archive-only: ${COUNTS[ready_archive_only]}"
echo " - Purgatory-only: ${COUNTS[ready_not_in_prod]}"
echo ""
echo "Needs Re-sync: $TOTAL_RESYNC ($PCT_RESYNC%)"
echo " - Missing from archive: ${COUNTS[resync_missing_archive]}"
echo " - Incomplete in archive: ${COUNTS[resync_incomplete_archive]}"
echo ""
echo "Manual Review: $TOTAL_REVIEW ($PCT_REVIEW%)"
echo " - Partial in prod: ${COUNTS[review_partial_prod]}"
echo " - No-match in prod: ${COUNTS[review_nomatch_prod]}"
echo " - Parse failures: ${COUNTS[review_parse_failure]}"
echo " - Git diverged: ${COUNTS[review_diverged]}"
echo ""
echo "Total: $TOTAL repos"
echo ""
echo "Output files:"
echo " $READY_FILE"
echo " $RESYNC_FILE"
echo " $REVIEW_FILE"
echo " $SUMMARY_FILE"
diff --git
a/docs/archive/2026-01-relay-ngit-dev-migration/scripts/run-migration-analysis.sh b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/run-migration-analysis.sh new file mode 100755 index 0000000..acc5e44 --- /dev/null +++ b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/run-migration-analysis.sh @@ -0,0 +1,779 @@ +#!/usr/bin/env bash +# +# run-migration-analysis.sh - Orchestrate the complete GRASP relay to ngit-grasp migration analysis +# +# This script runs all 5 phases of the migration analysis pipeline in sequence, +# with proper error handling, progress reporting, and timing information. +# +# QUICK START: +# # Basic usage (local analysis only - Phases 1, 3, 5) +# ./run-migration-analysis.sh --prod-relay wss://relay.ngit.dev --archive-relay wss://archive.relay.ngit.dev +# +# # Full analysis including git sync check (requires VPS access) +# ./run-migration-analysis.sh \ +# --prod-relay wss://relay.ngit.dev \ +# --archive-relay wss://archive.relay.ngit.dev \ +# --prod-git /var/lib/grasp-relay/git \ +# --archive-git /var/lib/ngit-grasp/git +# +# USAGE: +# ./run-migration-analysis.sh [options] +# +# REQUIRED OPTIONS: +# --prod-relay Production relay WebSocket URL (e.g., wss://relay.ngit.dev) +# --archive-relay Archive relay WebSocket URL (e.g., wss://archive.relay.ngit.dev) +# +# OPTIONAL OPTIONS: +# --prod-git Git base directory for prod (enables Phase 2) +# --archive-git Git base directory for archive (enables Phase 2) +# --service Systemd service name for log extraction (enables Phase 4) +# --output Output directory (default: work/migration-analysis-YYYYMMDD-HHMM) +# --since Start date for log extraction (default: 30 days ago) +# --until End date for log extraction (default: now) +# +# PHASE CONTROL: +# --skip-phase-1 Skip event fetching (use existing data) +# --skip-phase-2 Skip git sync check (use existing data) +# --skip-phase-3 Skip categorization (use existing data) +# --skip-phase-4 Skip log extraction (use existing data) +# --skip-phase-5 Skip 
final classification +# --only-phase-N Run only phase N (1-5) +# --from-phase-N Start from phase N (skip earlier phases) +# +# OTHER OPTIONS: +# --dry-run Show what would be executed without running +# --continue-on-error Continue to next phase even if current phase fails +# --help Show this help message +# +# PHASES: +# Phase 1: Fetch events from both relays (~30s each, local) +# Phase 2: Check git sync status (~20 min each, requires VPS) +# Phase 3: Categorize and compare results (fast, local) +# Phase 4: Extract logs from systemd (requires VPS) +# Phase 5: Final classification (fast, local) +# +# EXAMPLES: +# # Dry run to see what would happen +# ./run-migration-analysis.sh --prod-relay wss://relay.ngit.dev --archive-relay wss://archive.relay.ngit.dev --dry-run +# +# # Run only Phase 1 (fetch events) +# ./run-migration-analysis.sh --prod-relay wss://relay.ngit.dev --archive-relay wss://archive.relay.ngit.dev --only-phase-1 +# +# # Resume from Phase 3 using existing Phase 1-2 data +# ./run-migration-analysis.sh --prod-relay wss://relay.ngit.dev --archive-relay wss://archive.relay.ngit.dev --from-phase-3 --output work/migration-analysis-20260122-1430 +# +# # Full analysis on VPS with all features +# ./run-migration-analysis.sh \ +# --prod-relay wss://relay.ngit.dev \ +# --archive-relay wss://archive.relay.ngit.dev \ +# --prod-git /var/lib/grasp-relay/git \ +# --archive-git /var/lib/ngit-grasp/git \ +# --service ngit-grasp.service +# +# SEE ALSO: +# docs/how-to/migrate-to-ngit-grasp.md - Full migration guide +# + +set -euo pipefail + +# Get script directory for finding other scripts +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Colors for output (disabled if not a terminal) +if [[ -t 1 ]]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + CYAN='\033[0;36m' + BOLD='\033[1m' + NC='\033[0m' +else + RED='' + GREEN='' + YELLOW='' + BLUE='' + CYAN='' + BOLD='' + NC='' +fi + +# Logging functions +log_header() { + 
echo "" + echo -e "${BOLD}${CYAN}════════════════════════════════════════════════════════════════${NC}" + echo -e "${BOLD}${CYAN} $*${NC}" + echo -e "${BOLD}${CYAN}════════════════════════════════════════════════════════════════${NC}" + echo "" +} + +log_phase() { + echo "" + echo -e "${BOLD}${BLUE}┌──────────────────────────────────────────────────────────────┐${NC}" + echo -e "${BOLD}${BLUE}│ $*${NC}" + echo -e "${BOLD}${BLUE}└──────────────────────────────────────────────────────────────┘${NC}" +} + +log_info() { + echo -e "${BLUE}[INFO]${NC} $*" >&2 +} + +log_success() { + echo -e "${GREEN}[OK]${NC} $*" >&2 +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $*" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $*" >&2 +} + +log_step() { + echo -e "${CYAN} →${NC} $*" >&2 +} + +# Default values +PROD_RELAY="" +ARCHIVE_RELAY="" +PROD_GIT="" +ARCHIVE_GIT="" +SERVICE_NAME="" +OUTPUT_DIR="" +DRY_RUN=false +CONTINUE_ON_ERROR=false +LOG_SINCE="" +LOG_UNTIL="" + +# Phase control +SKIP_PHASE_1=false +SKIP_PHASE_2=false +SKIP_PHASE_3=false +SKIP_PHASE_4=false +SKIP_PHASE_5=false +ONLY_PHASE="" +FROM_PHASE="" + +# Timing +declare -A PHASE_TIMES + +usage() { + head -73 "$0" | tail -n +3 | sed 's/^# //' | sed 's/^#//' + exit 0 +} + +# Parse command line arguments +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --prod-relay) + PROD_RELAY="$2" + shift 2 + ;; + --archive-relay) + ARCHIVE_RELAY="$2" + shift 2 + ;; + --prod-git) + PROD_GIT="$2" + shift 2 + ;; + --archive-git) + ARCHIVE_GIT="$2" + shift 2 + ;; + --service) + SERVICE_NAME="$2" + shift 2 + ;; + --output) + OUTPUT_DIR="$2" + shift 2 + ;; + --skip-phase-1) + SKIP_PHASE_1=true + shift + ;; + --skip-phase-2) + SKIP_PHASE_2=true + shift + ;; + --skip-phase-3) + SKIP_PHASE_3=true + shift + ;; + --skip-phase-4) + SKIP_PHASE_4=true + shift + ;; + --skip-phase-5) + SKIP_PHASE_5=true + shift + ;; + --only-phase-1|--only-phase-2|--only-phase-3|--only-phase-4|--only-phase-5) + ONLY_PHASE="${1#--only-phase-}" + 
shift + ;; + --from-phase-1|--from-phase-2|--from-phase-3|--from-phase-4|--from-phase-5) + FROM_PHASE="${1#--from-phase-}" + shift + ;; + --dry-run) + DRY_RUN=true + shift + ;; + --continue-on-error) + CONTINUE_ON_ERROR=true + shift + ;; + --since) + LOG_SINCE="$2" + shift 2 + ;; + --until) + LOG_UNTIL="$2" + shift 2 + ;; + --help|-h) + usage + ;; + *) + log_error "Unknown option: $1" + echo "Use --help for usage information." + exit 1 + ;; + esac + done +} + +# Validate required arguments +validate_args() { + local errors=0 + + if [[ -z "$PROD_RELAY" ]]; then + log_error "Missing required option: --prod-relay" + errors=1 + fi + + if [[ -z "$ARCHIVE_RELAY" ]]; then + log_error "Missing required option: --archive-relay" + errors=1 + fi + + # Validate relay URLs + if [[ -n "$PROD_RELAY" && ! "$PROD_RELAY" =~ ^wss?:// ]]; then + log_error "Invalid prod relay URL: $PROD_RELAY (must start with ws:// or wss://)" + errors=1 + fi + + if [[ -n "$ARCHIVE_RELAY" && ! "$ARCHIVE_RELAY" =~ ^wss?:// ]]; then + log_error "Invalid archive relay URL: $ARCHIVE_RELAY (must start with ws:// or wss://)" + errors=1 + fi + + # Validate git paths if provided + if [[ -n "$PROD_GIT" && ! -d "$PROD_GIT" ]]; then + log_warn "Prod git directory not found: $PROD_GIT" + log_warn "Phase 2 will fail unless running on VPS with access to this path." + fi + + if [[ -n "$ARCHIVE_GIT" && ! -d "$ARCHIVE_GIT" ]]; then + log_warn "Archive git directory not found: $ARCHIVE_GIT" + log_warn "Phase 2 will fail unless running on VPS with access to this path." + fi + + if [[ $errors -eq 1 ]]; then + echo "" + echo "Use --help for usage information." + exit 1 + fi +} + +# Check prerequisites +check_prerequisites() { + local missing=0 + + log_info "Checking prerequisites..." 
+ + # Required tools + for tool in git nak jq awk sort; do + if command -v "$tool" &> /dev/null; then + log_step "$tool: found" + else + log_error "$tool: NOT FOUND" + missing=1 + fi + done + + # Optional tools + if command -v journalctl &> /dev/null; then + log_step "journalctl: found (Phase 4 available)" + else + log_step "journalctl: not found (Phase 4 will be skipped)" + SKIP_PHASE_4=true + fi + + if [[ $missing -eq 1 ]]; then + log_error "Missing required tools. Install them and try again." + exit 1 + fi + + # Check scripts exist + for script in 01-fetch-events.sh 10-check-git-sync.sh 20-categorize.sh 21-compare-relays.sh 22-compare-git-data.sh 30-extract-parse-failures.sh 31-extract-purgatory-expiry.sh 40-classify-actions.sh; do + if [[ ! -x "$SCRIPT_DIR/$script" ]]; then + log_error "Script not found or not executable: $SCRIPT_DIR/$script" + missing=1 + fi + done + + if [[ $missing -eq 1 ]]; then + exit 1 + fi + + log_success "All prerequisites satisfied" +} + +# Determine which phases to run +determine_phases() { + # Handle --only-phase-N + if [[ -n "$ONLY_PHASE" ]]; then + for i in 1 2 3 4 5; do + if [[ "$i" != "$ONLY_PHASE" ]]; then + eval "SKIP_PHASE_$i=true" + fi + done + fi + + # Handle --from-phase-N + if [[ -n "$FROM_PHASE" ]]; then + for i in 1 2 3 4 5; do + if [[ "$i" -lt "$FROM_PHASE" ]]; then + eval "SKIP_PHASE_$i=true" + fi + done + fi + + # Auto-skip Phase 2 if git paths not provided + if [[ -z "$PROD_GIT" && -z "$ARCHIVE_GIT" ]]; then + if [[ "$SKIP_PHASE_2" != "true" ]]; then + log_warn "No git paths provided. Phase 2 (git sync check) will be skipped." + log_warn "Use --prod-git and --archive-git to enable Phase 2." + SKIP_PHASE_2=true + fi + fi + + # Auto-skip Phase 4 if service not provided + if [[ -z "$SERVICE_NAME" ]]; then + if [[ "$SKIP_PHASE_4" != "true" ]]; then + log_warn "No service name provided. Phase 4 (log extraction) will be skipped." + log_warn "Use --service to enable Phase 4." 
+ SKIP_PHASE_4=true + fi + fi +} + +# Setup output directory +setup_output_dir() { + if [[ -z "$OUTPUT_DIR" ]]; then + OUTPUT_DIR="work/migration-analysis-$(date +%Y%m%d-%H%M)" + fi + + log_info "Output directory: $OUTPUT_DIR" + + if [[ "$DRY_RUN" == "true" ]]; then + log_info "[DRY RUN] Would create directory structure" + return + fi + + mkdir -p "$OUTPUT_DIR"/{prod/raw,archive/raw,comparison,logs,results} + + # Save configuration + cat > "$OUTPUT_DIR/config.txt" << EOF +# Migration Analysis Configuration +# Generated: $(date -Iseconds) + +PROD_RELAY=$PROD_RELAY +ARCHIVE_RELAY=$ARCHIVE_RELAY +PROD_GIT=$PROD_GIT +ARCHIVE_GIT=$ARCHIVE_GIT +SERVICE_NAME=$SERVICE_NAME +OUTPUT_DIR=$OUTPUT_DIR +EOF + + log_success "Created output directory structure" +} + +# Run a phase with timing and error handling +run_phase() { + local phase_num="$1" + local phase_name="$2" + shift 2 + local cmd=("$@") + + local skip_var="SKIP_PHASE_$phase_num" + if [[ "${!skip_var}" == "true" ]]; then + log_phase "Phase $phase_num: $phase_name [SKIPPED]" + return 0 + fi + + log_phase "Phase $phase_num: $phase_name" + + if [[ "$DRY_RUN" == "true" ]]; then + log_info "[DRY RUN] Would execute:" + for c in "${cmd[@]}"; do + echo " $c" + done + return 0 + fi + + local start_time + start_time=$(date +%s) + + local exit_code=0 + + # Execute the command(s) + for c in "${cmd[@]}"; do + log_step "Running: $c" + if ! 
eval "$c"; then + exit_code=1 + if [[ "$CONTINUE_ON_ERROR" == "true" ]]; then + log_warn "Command failed, continuing due to --continue-on-error" + else + log_error "Command failed" + break + fi + fi + done + + local end_time + end_time=$(date +%s) + local duration=$((end_time - start_time)) + PHASE_TIMES[$phase_num]=$duration + + if [[ $exit_code -eq 0 ]]; then + log_success "Phase $phase_num completed in ${duration}s" + else + log_error "Phase $phase_num failed after ${duration}s" + if [[ "$CONTINUE_ON_ERROR" != "true" ]]; then + return 1 + fi + fi + + return $exit_code +} + +# Phase 1: Fetch events +run_phase_1() { + local cmds=() + + # Fetch from prod relay + cmds+=("'$SCRIPT_DIR/01-fetch-events.sh' '$PROD_RELAY' '$OUTPUT_DIR/prod'") + + # Fetch from archive relay + cmds+=("'$SCRIPT_DIR/01-fetch-events.sh' '$ARCHIVE_RELAY' '$OUTPUT_DIR/archive'") + + run_phase 1 "Fetch Events (~30s each)" "${cmds[@]}" +} + +# Phase 2: Git sync check +run_phase_2() { + local cmds=() + + if [[ -n "$PROD_GIT" ]]; then + cmds+=("'$SCRIPT_DIR/10-check-git-sync.sh' '$OUTPUT_DIR/prod/raw/state-events.json' '$PROD_GIT' '$OUTPUT_DIR/prod' --categorize") + else + log_warn "Skipping prod git sync check (no --prod-git provided)" + fi + + if [[ -n "$ARCHIVE_GIT" ]]; then + cmds+=("'$SCRIPT_DIR/10-check-git-sync.sh' '$OUTPUT_DIR/archive/raw/state-events.json' '$ARCHIVE_GIT' '$OUTPUT_DIR/archive' --categorize") + else + log_warn "Skipping archive git sync check (no --archive-git provided)" + fi + + if [[ ${#cmds[@]} -eq 0 ]]; then + log_warn "No git paths provided, skipping Phase 2" + return 0 + fi + + run_phase 2 "Git Sync Check (~20 min each)" "${cmds[@]}" +} + +# Phase 3: Categorize and compare +run_phase_3() { + local cmds=() + + # Check if we have git-sync-status.tsv files (from Phase 2) + # If not, we can't run categorization + local has_prod_sync=false + local has_archive_sync=false + + if [[ -f "$OUTPUT_DIR/prod/git-sync-status.tsv" ]]; then + has_prod_sync=true + fi + + if [[ -f 
"$OUTPUT_DIR/archive/git-sync-status.tsv" ]]; then + has_archive_sync=true + fi + + # Run categorization if we have sync data but no category files + if [[ "$has_prod_sync" == "true" && ! -f "$OUTPUT_DIR/prod/category1-complete-match.txt" ]]; then + cmds+=("'$SCRIPT_DIR/20-categorize.sh' '$OUTPUT_DIR/prod/git-sync-status.tsv' '$OUTPUT_DIR/prod'") + fi + + if [[ "$has_archive_sync" == "true" && ! -f "$OUTPUT_DIR/archive/category1-complete-match.txt" ]]; then + cmds+=("'$SCRIPT_DIR/20-categorize.sh' '$OUTPUT_DIR/archive/git-sync-status.tsv' '$OUTPUT_DIR/archive'") + fi + + # Run comparison if we have category files + if [[ -f "$OUTPUT_DIR/prod/category1-complete-match.txt" && -f "$OUTPUT_DIR/archive/category1-complete-match.txt" ]]; then + cmds+=("'$SCRIPT_DIR/21-compare-relays.sh' '$OUTPUT_DIR/prod' '$OUTPUT_DIR/archive' '$OUTPUT_DIR/comparison'") + else + log_warn "Missing category files for comparison." + log_warn "Phase 2 must complete successfully before Phase 3 can compare relays." + + # Create placeholder comparison files if they don't exist + if [[ "$DRY_RUN" != "true" ]]; then + mkdir -p "$OUTPUT_DIR/comparison" + for f in complete-in-both.txt complete-prod-missing-archive.txt complete-prod-incomplete-archive.txt incomplete-in-both.txt in-archive-not-prod.txt; do + if [[ ! 
-f "$OUTPUT_DIR/comparison/$f" ]]; then + echo "# Placeholder - Phase 2 data not available" > "$OUTPUT_DIR/comparison/$f" + fi + done + echo "# Comparison not available - Phase 2 data missing" > "$OUTPUT_DIR/comparison/summary.txt" + fi + fi + + if [[ ${#cmds[@]} -eq 0 ]]; then + log_warn "No categorization or comparison needed (already done or missing input)" + return 0 + fi + + run_phase 3 "Categorize & Compare (fast)" "${cmds[@]}" + + # Phase 3c: Compare git data between relays (requires git paths) + # This determines if archive is ahead of prod for repos with mismatched state + if [[ -n "$PROD_GIT" && -n "$ARCHIVE_GIT" ]]; then + # Build list of repos to compare: those where prod=complete but archive is not + local repos_to_compare="$OUTPUT_DIR/comparison/complete-prod-incomplete-archive.txt" + if [[ -f "$repos_to_compare" ]] && [[ ! -f "$OUTPUT_DIR/comparison/git-ancestry.tsv" ]]; then + log_info "Running git ancestry comparison (Phase 3c)..." + run_phase 3 "Git Ancestry Comparison" "'$SCRIPT_DIR/22-compare-git-data.sh' '$PROD_GIT' '$ARCHIVE_GIT' '$repos_to_compare' '$OUTPUT_DIR/comparison'" + fi + else + log_warn "Git paths not provided - skipping git ancestry comparison" + log_warn "Without git comparison, repos where archive is ahead will be incorrectly flagged as needing re-sync" + fi +} + +# Phase 4: Extract logs +run_phase_4() { + if [[ -z "$SERVICE_NAME" ]]; then + log_warn "No service name provided, skipping Phase 4" + return 0 + fi + + # Validate service name before running Phase 4 + # Structured logging only exists in ngit-grasp, not ngit-relay + if [[ "$SERVICE_NAME" == *"ngit-relay"* ]]; then + log_error "SERVICE_NAME appears to be ngit-relay: $SERVICE_NAME" + log_error "" + log_error "Phase 4 requires an ngit-grasp service with structured logging." + log_error "Structured logging ([PARSE_FAIL], [PURGATORY_EXPIRED]) only exists" + log_error "in ngit-grasp services, NOT in ngit-relay services." 
+ log_error "" + log_error "Please update --service to use the ngit-grasp archive service." + log_error "" + log_error "To find the correct service name:" + log_error " systemctl list-units 'ngit-grasp*' --all" + log_error "" + log_error "Common ngit-grasp service names:" + log_error " - ngit-grasp.service" + log_error " - ngit-grasp-relay-ngit-dev.service (NixOS multi-instance)" + log_error " - ngit-grasp-archive.service" + return 1 + fi + + # Warn if service name doesn't look like ngit-grasp + if [[ "$SERVICE_NAME" != *"ngit-grasp"* && "$SERVICE_NAME" != *"grasp"* ]]; then + log_warn "SERVICE_NAME doesn't contain 'ngit-grasp': $SERVICE_NAME" + log_warn "Structured logging only exists in ngit-grasp services." + log_warn "If this is not an ngit-grasp service, Phase 4 will find no logs." + fi + + local cmds=() + + # Build log extraction options + local log_opts="" + if [[ -n "$LOG_SINCE" ]]; then + log_opts="$log_opts --since '$LOG_SINCE'" + fi + if [[ -n "$LOG_UNTIL" ]]; then + log_opts="$log_opts --until '$LOG_UNTIL'" + fi + + cmds+=("'$SCRIPT_DIR/30-extract-parse-failures.sh' '$SERVICE_NAME' '$OUTPUT_DIR/logs' $log_opts") + cmds+=("'$SCRIPT_DIR/31-extract-purgatory-expiry.sh' '$SERVICE_NAME' '$OUTPUT_DIR/logs' $log_opts") + + run_phase 4 "Extract Logs (VPS required)" "${cmds[@]}" +} + +# Phase 5: Final classification +run_phase_5() { + # Check if we have the minimum required files + local can_run=true + + if [[ ! -d "$OUTPUT_DIR/prod" ]]; then + log_warn "Missing prod directory" + can_run=false + fi + + if [[ ! -d "$OUTPUT_DIR/archive" ]]; then + log_warn "Missing archive directory" + can_run=false + fi + + if [[ ! -d "$OUTPUT_DIR/comparison" ]]; then + log_warn "Missing comparison directory" + can_run=false + fi + + # Create logs directory with empty files if missing + if [[ "$DRY_RUN" != "true" ]]; then + mkdir -p "$OUTPUT_DIR/logs" + for f in parse-failures.txt purgatory-expired.txt; do + if [[ ! 
-f "$OUTPUT_DIR/logs/$f" ]]; then + echo "# No data - Phase 4 not run" > "$OUTPUT_DIR/logs/$f" + fi + done + fi + + if [[ "$can_run" == "false" ]]; then + log_error "Cannot run Phase 5 - missing required input directories" + return 1 + fi + + run_phase 5 "Final Classification (fast)" "'$SCRIPT_DIR/40-classify-actions.sh' '$OUTPUT_DIR'" +} + +# Display summary +display_summary() { + log_header "Migration Analysis Complete" + + echo "Output Directory: $OUTPUT_DIR" + echo "" + + # Phase timing summary + echo "Phase Timing:" + local total_time=0 + for phase in 1 2 3 4 5; do + local skip_var="SKIP_PHASE_$phase" + if [[ "${!skip_var}" == "true" ]]; then + echo " Phase $phase: SKIPPED" + elif [[ -n "${PHASE_TIMES[$phase]:-}" ]]; then + local t="${PHASE_TIMES[$phase]}" + echo " Phase $phase: ${t}s" + total_time=$((total_time + t)) + else + echo " Phase $phase: N/A" + fi + done + echo " ─────────────" + echo " Total: ${total_time}s" + echo "" + + # Results summary + if [[ -f "$OUTPUT_DIR/results/summary.txt" ]]; then + echo "Results Summary:" + echo "" + # Extract key metrics from summary + if grep -q "No Action Required" "$OUTPUT_DIR/results/summary.txt"; then + grep -A1 "No Action Required" "$OUTPUT_DIR/results/summary.txt" | head -2 + fi + if grep -q "Action Required" "$OUTPUT_DIR/results/summary.txt"; then + grep -A1 "Action Required" "$OUTPUT_DIR/results/summary.txt" | head -2 + fi + if grep -q "Manual Investigation" "$OUTPUT_DIR/results/summary.txt"; then + grep -A1 "Manual Investigation" "$OUTPUT_DIR/results/summary.txt" | head -2 + fi + echo "" + fi + + # Output files + echo "Output Files:" + echo " $OUTPUT_DIR/results/no-action-required.txt" + echo " $OUTPUT_DIR/results/action-required.txt" + echo " $OUTPUT_DIR/results/manual-investigation.txt" + echo " $OUTPUT_DIR/results/summary.txt" + echo "" + + # Next steps + echo "Next Steps:" + echo " 1. Review results/summary.txt for overview" + echo " 2. Address items in results/action-required.txt" + echo " 3. 
Investigate items in results/manual-investigation.txt" + echo " 4. Plan migration window when action items are resolved" + echo "" +} + +# Main +main() { + parse_args "$@" + + log_header "GRASP Relay to ngit-grasp Migration Analysis" + + validate_args + check_prerequisites + determine_phases + setup_output_dir + + # Show configuration + log_info "Configuration:" + log_step "Prod relay: $PROD_RELAY" + log_step "Archive relay: $ARCHIVE_RELAY" + [[ -n "$PROD_GIT" ]] && log_step "Prod git: $PROD_GIT" + [[ -n "$ARCHIVE_GIT" ]] && log_step "Archive git: $ARCHIVE_GIT" + [[ -n "$SERVICE_NAME" ]] && log_step "Service: $SERVICE_NAME" + log_step "Output: $OUTPUT_DIR" + echo "" + + # Show phase plan + log_info "Phase Plan:" + for phase in 1 2 3 4 5; do + local skip_var="SKIP_PHASE_$phase" + if [[ "${!skip_var}" == "true" ]]; then + log_step "Phase $phase: SKIP" + else + log_step "Phase $phase: RUN" + fi + done + echo "" + + if [[ "$DRY_RUN" == "true" ]]; then + log_warn "DRY RUN MODE - No changes will be made" + echo "" + fi + + # Run phases + local overall_exit=0 + + run_phase_1 || overall_exit=1 + run_phase_2 || overall_exit=1 + run_phase_3 || overall_exit=1 + run_phase_4 || overall_exit=1 + run_phase_5 || overall_exit=1 + + # Display summary + if [[ "$DRY_RUN" != "true" ]]; then + display_summary + fi + + if [[ $overall_exit -ne 0 ]]; then + log_warn "Some phases failed. Review output for details." + fi + + exit $overall_exit +} + +main "$@" diff --git a/docs/archive/2026-01-relay-ngit-dev-migration/scripts/validate-service.sh b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/validate-service.sh new file mode 100755 index 0000000..6988af3 --- /dev/null +++ b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/validate-service.sh @@ -0,0 +1,151 @@ +#!/usr/bin/env bash +# +# validate-service.sh - Validate service name for structured logging +# +# This helper script validates that a service name is appropriate for +# Phase 4 log extraction. 
# Structured logging ([PARSE_FAIL], [PURGATORY_EXPIRED])
# only exists in ngit-grasp services, NOT in ngit-relay services.
#
# USAGE:
#   Source this script and call the validation function:
#
#     source validate-service.sh
#     validate_service_for_structured_logging "$SERVICE_NAME" || exit 1
#
# BACKGROUND:
#   Phase 4 of the migration analysis extracts structured log entries from
#   journald. These log entries only exist in ngit-grasp services. If you
#   accidentally specify an ngit-relay service, Phase 4 will find no logs
#   and produce empty results.
#
#   This validation prevents that common mistake by:
#     1. Checking if the service name contains "ngit-relay" (error)
#     2. Warning if the service name doesn't contain "ngit-grasp"
#     3. Optionally checking if structured logs actually exist
#
# SEE ALSO:
#   docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
#   30-extract-parse-failures.sh - Uses this validation
#   31-extract-purgatory-expiry.sh - Uses this validation
#

# Colors for output. Every message below is written to stderr, so the
# terminal check is made against file descriptor 2, not stdout.
if [[ -t 2 ]]; then
    _VS_RED='\033[0;31m'
    _VS_YELLOW='\033[0;33m'
    _VS_NC='\033[0m'
else
    _VS_RED=''
    _VS_YELLOW=''
    _VS_NC=''
fi

# Private helper: decide whether to carry on after a warning.
#
# Arguments:
#   $1 - interactive: "true" to prompt on stdin, anything else to proceed
#
# Returns:
#   0 - proceed (non-interactive mode, or the user answered y/Y)
#   1 - stop (any other answer, or stdin hit EOF before an answer was read)
_vs_confirm_continue() {
    local interactive="${1:-true}"
    local reply=""

    if [[ "$interactive" != "true" ]]; then
        echo "Non-interactive mode: proceeding despite warning" >&2
        return 0
    fi

    # A failed read (EOF / closed stdin) leaves reply empty, which is
    # treated as "no" rather than as a hard error.
    read -p "Continue anyway? (y/N) " -n 1 -r reply || true
    echo >&2
    [[ "$reply" =~ ^[Yy]$ ]]
}

# Validates that the service name is appropriate for structured logging
#
# Arguments:
#   $1 - service_name: The systemd service name to validate (required)
#   $2 - check_logs: Whether to check if logs actually exist (default: "true")
#   $3 - interactive: Whether to prompt for confirmation (default: "true")
#
# Outputs:
#   All diagnostics are written to stderr.
#
# Returns:
#   0 - Service is valid for structured logging (or a warning was accepted)
#   1 - Service is invalid/empty or user declined to continue
#
# Example:
#   validate_service_for_structured_logging "ngit-grasp.service" || exit 1
#   validate_service_for_structured_logging "ngit-grasp.service" "false"          # Skip log check
#   validate_service_for_structured_logging "ngit-grasp.service" "true" "false"   # Non-interactive
#
validate_service_for_structured_logging() {
    local service_name="${1:-}"
    local check_logs="${2:-true}"
    local interactive="${3:-true}"

    # printf with %s (instead of echo -e) keeps backslashes in the service
    # name literal; ${VAR:-} keeps this usable under `set -u` and in child
    # shells where only the function, not the colour variables, is exported.
    if [[ -z "$service_name" ]]; then
        printf '%b%s%b\n' "${_VS_RED:-}" \
            "ERROR: Service name is required" "${_VS_NC:-}" >&2
        return 1
    fi

    # Check if service name looks like ngit-relay (ERROR - wrong service type)
    if [[ "$service_name" == *"ngit-relay"* ]]; then
        printf '%b%s%b\n' "${_VS_RED:-}" \
            "ERROR: Service name appears to be ngit-relay: $service_name" \
            "${_VS_NC:-}" >&2
        printf '%s\n' \
            "" \
            "Structured logging ([PARSE_FAIL], [PURGATORY_EXPIRED]) only exists in" \
            "ngit-grasp services, NOT in ngit-relay services." \
            "" \
            "Please use the ngit-grasp archive service instead." \
            "" \
            "To find the correct service name:" \
            " systemctl list-units 'ngit-grasp*' --all" \
            "" \
            "Common ngit-grasp service names:" \
            " - ngit-grasp.service" \
            " - ngit-grasp-relay-ngit-dev.service (NixOS multi-instance)" \
            " - ngit-grasp-archive.service" >&2
        return 1
    fi

    # Check if service name looks like ngit-grasp (WARNING if not).
    # Any name containing "ngit-grasp" also contains "grasp", so one
    # pattern covers both cases.
    if [[ "$service_name" != *"grasp"* ]]; then
        printf '%b%s%b\n' "${_VS_YELLOW:-}" \
            "WARNING: Service name doesn't contain 'ngit-grasp': $service_name" \
            "${_VS_NC:-}" >&2
        printf '%s\n' \
            "" \
            "Structured logging ([PARSE_FAIL], [PURGATORY_EXPIRED]) only exists in" \
            "ngit-grasp services." \
            "" >&2

        _vs_confirm_continue "$interactive" || return 1
    fi

    # Optionally check if structured logs actually exist
    if [[ "$check_logs" == "true" ]]; then
        # Check if journalctl is available
        if ! command -v journalctl >/dev/null 2>&1; then
            printf '%b%s%b\n' "${_VS_YELLOW:-}" \
                "WARNING: journalctl not available, cannot verify logs exist" \
                "${_VS_NC:-}" >&2
            return 0
        fi

        # Count structured log entries in a single pass over the journal.
        # IMPORTANT: Use --no-pager to prevent hanging when run
        # non-interactively (e.g., via SSH).
        # grep -c prints "0" AND exits 1 when nothing matches, so the
        # failure is swallowed with `|| true` rather than echoing a second 0.
        local structured_count
        structured_count=$(
            journalctl --no-pager -u "$service_name" --since "7 days ago" 2>/dev/null \
                | grep -cE '\[(PARSE_FAIL|PURGATORY_EXPIRED)\]' || true
        )
        # Defensive: treat anything that is not a plain number as "none".
        [[ "$structured_count" =~ ^[0-9]+$ ]] || structured_count=0

        if (( structured_count == 0 )); then
            printf '%b%s%b\n' "${_VS_YELLOW:-}" \
                "WARNING: No structured logs found in $service_name (last 7 days)" \
                "${_VS_NC:-}" >&2
            printf '%s\n' \
                "" \
                "This may indicate:" \
                " 1. Wrong service (should be ngit-grasp archive service, not ngit-relay)" \
                " 2. Structured logging not yet deployed to this ngit-grasp instance" \
                " 3. No parse failures or purgatory expiry events in the time window" \
                "" \
                "To verify you have the right service:" \
                " systemctl list-units 'ngit-grasp*' --all" \
                " journalctl -u $service_name | grep -E '\\[PARSE_FAIL\\]|\\[PURGATORY_EXPIRED\\]' | head -5" \
                "" >&2

            _vs_confirm_continue "$interactive" || return 1
        fi
    fi

    return 0
}

# Export the functions so they can be used after sourcing (the helper is
# exported too, because the public function calls it).
export -f _vs_confirm_continue validate_service_for_structured_logging