From 92a9a3bfe0bc522e8ae411991a366a3a6310d525 Mon Sep 17 00:00:00 2001 From: DanConwayDev Date: Tue, 3 Feb 2026 14:41:46 +0000 Subject: docs: archive relay.ngit.dev migration materials for reference Move migration guide and scripts to docs/archive/2026-01-relay-ngit-dev-migration/ with clear warnings that these are reference-only materials from a specific migration context, not general-purpose tools. These materials document the relay.ngit.dev migration from ngit-relay to ngit-grasp in January 2026. The scripts were developed iteratively during the migration and are specific to that context. They are preserved for: - Historical reference - Context for production fixes in this branch - Inspiration for future migrations (not direct reuse) The migration uncovered critical bugs now fixed in this branch: - Git protocol error handling - Naughty list false positives - Purgatory event tracking - Sync startup issues - Configuration management --- docs/how-to/migration-scripts/01-fetch-events.sh | 206 ------ docs/how-to/migration-scripts/10-check-git-sync.sh | 564 --------------- docs/how-to/migration-scripts/20-categorize.sh | 212 ------ docs/how-to/migration-scripts/21-compare-relays.sh | 294 -------- .../migration-scripts/22-compare-git-data.sh | 390 ----------- .../migration-scripts/30-extract-parse-failures.sh | 774 -------------------- .../31-extract-purgatory-expiry.sh | 408 ----------- .../migration-scripts/40-classify-actions.sh | 662 ----------------- .../migration-scripts/run-migration-analysis.sh | 779 --------------------- docs/how-to/migration-scripts/validate-service.sh | 151 ---- 10 files changed, 4440 deletions(-) delete mode 100755 docs/how-to/migration-scripts/01-fetch-events.sh delete mode 100755 docs/how-to/migration-scripts/10-check-git-sync.sh delete mode 100755 docs/how-to/migration-scripts/20-categorize.sh delete mode 100755 docs/how-to/migration-scripts/21-compare-relays.sh delete mode 100755 docs/how-to/migration-scripts/22-compare-git-data.sh 
delete mode 100755 docs/how-to/migration-scripts/30-extract-parse-failures.sh delete mode 100755 docs/how-to/migration-scripts/31-extract-purgatory-expiry.sh delete mode 100755 docs/how-to/migration-scripts/40-classify-actions.sh delete mode 100755 docs/how-to/migration-scripts/run-migration-analysis.sh delete mode 100755 docs/how-to/migration-scripts/validate-service.sh (limited to 'docs/how-to/migration-scripts') diff --git a/docs/how-to/migration-scripts/01-fetch-events.sh b/docs/how-to/migration-scripts/01-fetch-events.sh deleted file mode 100755 index e0d6f26..0000000 --- a/docs/how-to/migration-scripts/01-fetch-events.sh +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/env bash -# -# 01-fetch-events.sh - Fetch nostr events from a relay for migration analysis -# -# PHASE 1 of the GRASP relay to ngit-grasp migration analysis pipeline. -# Fetches kind 30618 (state), 30617 (announcement), and 5 (deletion) events. -# -# USAGE: -# ./01-fetch-events.sh -# -# EXAMPLES: -# # Fetch from production relay -# ./01-fetch-events.sh wss://relay.ngit.dev output/prod -# -# # Fetch from archive relay -# ./01-fetch-events.sh wss://archive.relay.ngit.dev output/archive -# -# # Full migration analysis setup -# mkdir -p work/migration-analysis-$(date +%Y%m%d-%H%M) -# ./01-fetch-events.sh wss://relay.ngit.dev work/migration-analysis-*/prod -# ./01-fetch-events.sh wss://archive.relay.ngit.dev work/migration-analysis-*/archive -# -# OUTPUT: -# /raw/state-events.json - kind 30618 events (one per line, JSONL) -# /raw/announcements.json - kind 30617 events (one per line, JSONL) -# /raw/deletions.json - kind 5 events (one per line, JSONL) -# -# OUTPUT FORMAT: -# Each file contains one JSON event per line (JSONL format). -# Events are the raw nostr event objects as returned by the relay. 
-# -# PREREQUISITES: -# - nak (Nostr Army Knife) - https://github.com/fiatjaf/nak -# - jq (for counting/validation) -# -# RUNTIME: ~30 seconds per relay (depends on network and event count) -# -# NOTES: -# - Uses --paginate to ensure all events are fetched (not just first page) -# - If event counts are exact multiples of 250, pagination may have failed -# - Run Phase 1 and Phase 2 back-to-back for accurate snapshot -# -# SEE ALSO: -# docs/how-to/migrate-to-ngit-grasp.md - Full migration guide -# - -set -euo pipefail - -# Colors for output (disabled if not a terminal) -if [[ -t 1 ]]; then - RED='\033[0;31m' - GREEN='\033[0;32m' - YELLOW='\033[0;33m' - BLUE='\033[0;34m' - NC='\033[0m' # No Color -else - RED='' - GREEN='' - YELLOW='' - BLUE='' - NC='' -fi - -log_info() { - echo -e "${BLUE}[INFO]${NC} $*" >&2 -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $*" >&2 -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $*" >&2 -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $*" >&2 -} - -usage() { - echo "Usage: $0 " - echo "" - echo "Arguments:" - echo " relay-url WebSocket URL of the relay (e.g., wss://relay.ngit.dev)" - echo " output-dir Directory to store fetched events (e.g., output/prod)" - echo "" - echo "Examples:" - echo " $0 wss://relay.ngit.dev output/prod" - echo " $0 wss://archive.relay.ngit.dev output/archive" - exit 1 -} - -# Check prerequisites -check_prerequisites() { - local missing=0 - - if ! command -v nak &> /dev/null; then - log_error "nak not found. Install from: https://github.com/fiatjaf/nak" - missing=1 - fi - - if ! command -v jq &> /dev/null; then - log_error "jq not found. Install with your package manager." - missing=1 - fi - - if [[ $missing -eq 1 ]]; then - exit 1 - fi -} - -# Fetch events of a specific kind -# Args: $1=relay, $2=kind, $3=output_file, $4=description -fetch_kind() { - local relay="$1" - local kind="$2" - local output_file="$3" - local description="$4" - - log_info "Fetching $description (kind $kind) from $relay..." 
- - local start_time - start_time=$(date +%s) - - # Use --paginate to ensure we get all events, not just first page - # nak outputs one event per line (JSONL format) - if ! nak req -k "$kind" --paginate "$relay" > "$output_file" 2>/dev/null; then - log_error "Failed to fetch $description from $relay" - return 1 - fi - - local end_time - end_time=$(date +%s) - local duration=$((end_time - start_time)) - - # Count events - local count - count=$(wc -l < "$output_file" | tr -d ' ') - - # Warn if count is suspicious (exact multiple of 250 suggests pagination issue) - if [[ $count -gt 0 ]] && [[ $((count % 250)) -eq 0 ]]; then - log_warn "$description count ($count) is exact multiple of 250 - pagination may have failed!" - fi - - log_success "Fetched $count $description in ${duration}s -> $output_file" - - echo "$count" -} - -# Main -main() { - if [[ $# -ne 2 ]]; then - usage - fi - - local relay="$1" - local output_dir="$2" - - # Validate relay URL - if [[ ! "$relay" =~ ^wss?:// ]]; then - log_error "Invalid relay URL: $relay (must start with ws:// or wss://)" - exit 1 - fi - - check_prerequisites - - log_info "Starting event fetch from $relay" - log_info "Output directory: $output_dir" - - # Create output directory structure - local raw_dir="$output_dir/raw" - mkdir -p "$raw_dir" - - local total_start - total_start=$(date +%s) - - # Fetch each event type - local state_count announcement_count deletion_count - - state_count=$(fetch_kind "$relay" 30618 "$raw_dir/state-events.json" "state events") - announcement_count=$(fetch_kind "$relay" 30617 "$raw_dir/announcements.json" "announcements") - deletion_count=$(fetch_kind "$relay" 5 "$raw_dir/deletions.json" "deletion requests") - - local total_end - total_end=$(date +%s) - local total_duration=$((total_end - total_start)) - - # Summary - echo "" - log_info "=== Fetch Summary ===" - log_info "Relay: $relay" - log_info "Output: $output_dir" - log_info "State events (30618): $state_count" - log_info "Announcements (30617): 
$announcement_count" - log_info "Deletions (5): $deletion_count" - log_info "Total time: ${total_duration}s" - echo "" - - # Output file listing for easy copy/paste - log_info "Output files:" - echo " $raw_dir/state-events.json" - echo " $raw_dir/announcements.json" - echo " $raw_dir/deletions.json" -} - -main "$@" diff --git a/docs/how-to/migration-scripts/10-check-git-sync.sh b/docs/how-to/migration-scripts/10-check-git-sync.sh deleted file mode 100755 index b4536cb..0000000 --- a/docs/how-to/migration-scripts/10-check-git-sync.sh +++ /dev/null @@ -1,564 +0,0 @@ -#!/usr/bin/env bash -# -# 10-check-git-sync.sh - Compare state events to actual git data on disk -# -# PHASE 2 of the GRASP relay to ngit-grasp migration analysis pipeline. -# Compares kind 30618 state events against actual git refs on disk. -# -# USAGE: -# ./10-check-git-sync.sh [--categorize] -# -# EXAMPLES: -# # Check source relay against source git data -# ./10-check-git-sync.sh output/prod/raw/state-events.json /var/lib/grasp-relay/git output/prod -# -# # Check target relay against target git data -# ./10-check-git-sync.sh output/archive/raw/state-events.json /var/lib/ngit-grasp/git output/archive -# -# # Check and categorize in one step (convenience mode) -# ./10-check-git-sync.sh output/prod/raw/state-events.json /var/lib/grasp-relay/git output/prod --categorize -# -# INPUT: -# state-events.json - JSONL file from Phase 1 (01-fetch-events.sh) -# One kind 30618 event per line -# git-base-dir - Base directory containing git repos -# Structure: //.git/ -# -# OUTPUT: -# /git-sync-status.tsv - Tab-separated values: -# reponpubstate_refsgit_refsmatchesreason -# -# With --categorize flag, also outputs: -# /category1-complete-match.txt -# /category2-empty-blank.txt -# /category3-partial-match.txt -# /category4-no-match.txt -# -# CATEGORIES: -# 1. Complete Match - All refs in state event match git data perfectly -# 2. Empty/Blank - No git data available (directory missing or empty) -# 3. 
Partial Match - Some refs match, some don't -# 4. No Match - Git data exists but commit hashes don't match -# -# PREREQUISITES: -# - nak (for npub encoding) - https://github.com/fiatjaf/nak -# - jq (for JSON parsing) -# - Read access to git directories (may need sudo) -# -# RUNTIME: ~20 minutes on VPS (git operations are slow) -# -# NOTES: -# - Must run on VPS with access to git directories -# - Progress indicator updates every 10 events -# - Handles packed refs (git show-ref) and loose refs -# -# SEE ALSO: -# docs/how-to/migrate-to-ngit-grasp.md - Full migration guide -# 01-fetch-events.sh - Phase 1 script that produces input for this script -# 20-categorize.sh - Phase 3a script that consumes output from this script -# - -set -euo pipefail - -# Colors for output (disabled if not a terminal) -if [[ -t 1 ]]; then - RED='\033[0;31m' - GREEN='\033[0;32m' - YELLOW='\033[0;33m' - BLUE='\033[0;34m' - NC='\033[0m' -else - RED='' - GREEN='' - YELLOW='' - BLUE='' - NC='' -fi - -log_info() { - echo -e "${BLUE}[INFO]${NC} $*" >&2 -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $*" >&2 -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $*" >&2 -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $*" >&2 -} - -log_progress() { - # Overwrite current line for progress updates - echo -ne "\r${BLUE}[PROGRESS]${NC} $*" >&2 -} - -usage() { - echo "Usage: $0 [--categorize]" - echo "" - echo "Arguments:" - echo " state-events.json JSONL file from Phase 1 (kind 30618 events)" - echo " git-base-dir Base directory for git repos (e.g., /var/lib/grasp-relay/git)" - echo " output-dir Directory to store output files" - echo " --categorize Optional: also output category files (like Phase 3)" - echo "" - echo "Examples:" - echo " $0 output/prod/raw/state-events.json /var/lib/grasp-relay/git output/prod" - echo " $0 output/archive/raw/state-events.json /var/lib/ngit-grasp/git output/archive" - echo "" - echo "Output:" - echo " git-sync-status.tsv - TSV with: repo, npub, state_refs, git_refs, 
matches, reason" - exit 1 -} - -# Check prerequisites -check_prerequisites() { - local missing=0 - - if ! command -v git &> /dev/null; then - log_error "git not found. Install with your package manager." - missing=1 - fi - - if ! command -v nak &> /dev/null; then - log_error "nak not found. Install from: https://github.com/fiatjaf/nak" - log_error "Or run: nix-shell -p nak jq --run \"$0 $*\"" - missing=1 - fi - - if ! command -v jq &> /dev/null; then - log_error "jq not found. Install with your package manager." - missing=1 - fi - - if [[ $missing -eq 1 ]]; then - exit 1 - fi -} - -# Convert hex pubkey to npub -# Args: $1=hex_pubkey -# Returns: npub string or empty on error -hex_to_npub() { - local hex="$1" - nak encode npub "$hex" 2>/dev/null || echo "" -} - -# Count refs in state event (only refs/heads/) -# Args: $1=event_json -# Returns: count -count_state_refs() { - local event="$1" - echo "$event" | jq '[.tags[] | select(.[0] | startswith("refs/heads/"))] | length' 2>/dev/null || echo "0" -} - -# Get git refs from disk -# Args: $1=git_dir -# Returns: count of refs/heads/ refs -count_git_refs() { - local git_dir="$1" - - if [[ ! 
-d "$git_dir" ]]; then - echo "0" - return - fi - - # Try git show-ref first (handles packed refs correctly) - # Note: We capture output separately to avoid pipefail issues - local count - if count=$(git --git-dir="$git_dir" show-ref --heads 2>/dev/null | wc -l); then - echo "$count" | tr -d ' ' - return - fi - - # Fallback: count loose refs (when git is not available or fails) - if [[ -d "$git_dir/refs/heads" ]]; then - find "$git_dir/refs/heads" -type f 2>/dev/null | wc -l | tr -d ' ' - else - echo "0" - fi -} - -# Get ref hash from git directory -# Args: $1=git_dir, $2=ref_path (e.g., refs/heads/main) -# Returns: commit hash or empty -get_git_ref_hash() { - local git_dir="$1" - local ref_path="$2" - - # Try git show-ref first (handles packed refs) - local hash - hash=$(git --git-dir="$git_dir" show-ref --hash "$ref_path" 2>/dev/null | head -1 || echo "") - - if [[ -n "$hash" ]]; then - echo "$hash" - return - fi - - # Fallback: read loose ref file - local ref_file="$git_dir/$ref_path" - if [[ -f "$ref_file" ]]; then - cat "$ref_file" 2>/dev/null | tr -d '\n' || echo "" - else - echo "" - fi -} - -# Compare state event refs to git refs -# Args: $1=event_json, $2=git_dir -# Returns: count of matching refs -count_matching_refs() { - local event="$1" - local git_dir="$2" - local matching=0 - - # Extract refs/heads/ tags and compare - while IFS= read -r ref_tag; do - [[ -z "$ref_tag" ]] && continue - - local ref_path expected_hash - ref_path=$(echo "$ref_tag" | jq -r '.[0]' 2>/dev/null || echo "") - expected_hash=$(echo "$ref_tag" | jq -r '.[1]' 2>/dev/null || echo "") - - # Skip if not a heads ref or hash is missing - [[ ! 
"$ref_path" =~ ^refs/heads/ ]] && continue - [[ -z "$expected_hash" || "$expected_hash" == "null" ]] && continue - - # Get actual hash from git - local actual_hash - actual_hash=$(get_git_ref_hash "$git_dir" "$ref_path") - - if [[ "$expected_hash" == "$actual_hash" ]]; then - matching=$((matching + 1)) - fi - done < <(echo "$event" | jq -c '.tags[] | select(.[0] | startswith("refs/heads/"))' 2>/dev/null) - - echo "$matching" -} - -# Categorize a single entry -# Args: $1=state_refs, $2=git_refs, $3=matches, $4=reason -# Returns: category number (1-4) -categorize_entry() { - local state_refs="$1" - local git_refs="$2" - local matches="$3" - local reason="$4" - - # Category 2: Empty/Blank - if [[ -n "$reason" ]] || [[ "$git_refs" -eq 0 ]]; then - echo "2" - return - fi - - # Category 1: Complete Match - if [[ "$state_refs" -gt 0 ]] && [[ "$state_refs" -eq "$git_refs" ]] && [[ "$matches" -eq "$state_refs" ]]; then - echo "1" - return - fi - - # Category 4: No Match - if [[ "$git_refs" -gt 0 ]] && [[ "$matches" -eq 0 ]]; then - echo "4" - return - fi - - # Category 3: Partial Match (default for anything else with matches > 0) - if [[ "$matches" -gt 0 ]]; then - echo "3" - return - fi - - # Fallback to category 2 - echo "2" -} - -# Format entry for category file -# Args: $1=repo, $2=npub, $3=state_refs, $4=git_refs, $5=matches, $6=reason -format_category_line() { - local repo="$1" - local npub="$2" - local state_refs="$3" - local git_refs="$4" - local matches="$5" - local reason="$6" - - if [[ -n "$reason" ]]; then - echo "$repo | $npub | state_refs=$state_refs | git_refs=$git_refs | matches=$matches | reason=$reason" - else - echo "$repo | $npub | state_refs=$state_refs | git_refs=$git_refs | matches=$matches" - fi -} - -# Process a single state event -# Args: $1=event_json, $2=git_base -# Outputs: TSV line to stdout -process_event() { - local event="$1" - local git_base="$2" - - # Extract repository identifier (d tag) - local identifier - identifier=$(echo "$event" | 
jq -r '.tags[] | select(.[0] == "d") | .[1]' 2>/dev/null | head -1 || echo "") - - if [[ -z "$identifier" ]]; then - return 1 - fi - - # Extract maintainer pubkey (hex) - local hex_pubkey - hex_pubkey=$(echo "$event" | jq -r '.pubkey' 2>/dev/null || echo "") - - if [[ -z "$hex_pubkey" ]]; then - return 1 - fi - - # Convert to npub - local npub - npub=$(hex_to_npub "$hex_pubkey") - - if [[ -z "$npub" ]]; then - return 1 - fi - - # Count state refs - local state_refs - state_refs=$(count_state_refs "$event") - - # Find git directory - local git_dir="$git_base/${npub}/${identifier}.git" - - # Check git directory status - local git_refs=0 - local matches=0 - local reason="" - - if [[ ! -d "$git_dir" ]]; then - reason="no_git_dir" - elif [[ ! -d "$git_dir/refs/heads" ]] && [[ ! -f "$git_dir/packed-refs" ]]; then - reason="empty_refs" - else - git_refs=$(count_git_refs "$git_dir") - - if [[ "$git_refs" -eq 0 ]]; then - reason="empty_refs" - elif [[ "$state_refs" -eq 0 ]]; then - reason="no_state_refs" - else - matches=$(count_matching_refs "$event" "$git_dir") - fi - fi - - # Output TSV line: repo, npub, state_refs, git_refs, matches, reason - printf '%s\t%s\t%s\t%s\t%s\t%s\n' "$identifier" "$npub" "$state_refs" "$git_refs" "$matches" "$reason" -} - -# Main -main() { - local do_categorize=0 - local args=() - - # Parse arguments - for arg in "$@"; do - if [[ "$arg" == "--categorize" ]]; then - do_categorize=1 - else - args+=("$arg") - fi - done - - if [[ ${#args[@]} -ne 3 ]]; then - usage - fi - - local state_events_file="${args[0]}" - local git_base="${args[1]}" - local output_dir="${args[2]}" - - # Validate inputs - if [[ ! -f "$state_events_file" ]]; then - log_error "State events file not found: $state_events_file" - exit 1 - fi - - if [[ ! -d "$git_base" ]]; then - log_error "Git base directory not found: $git_base" - log_error "This script must run on the VPS with access to git directories." - exit 1 - fi - - # Check read permissions - if ! 
ls "$git_base" >/dev/null 2>&1; then - log_error "Cannot read git base directory (permission denied): $git_base" - log_error "Try running with sudo or grant read permissions." - exit 1 - fi - - check_prerequisites - - log_info "=== Git State Synchronization Check ===" - log_info "State events: $state_events_file" - log_info "Git base: $git_base" - log_info "Output: $output_dir" - if [[ $do_categorize -eq 1 ]]; then - log_info "Mode: TSV + categorization" - else - log_info "Mode: TSV only (use 20-categorize.sh for categories)" - fi - log_info "Started: $(date)" - echo "" - - # Create output directory - mkdir -p "$output_dir" - - # Output files - local tsv_file="$output_dir/git-sync-status.tsv" - - # Initialize TSV with header - echo -e "repo\tnpub\tstate_refs\tgit_refs\tmatches\treason" > "$tsv_file" - - # Initialize category files if categorizing - local cat1="" cat2="" cat3="" cat4="" - if [[ $do_categorize -eq 1 ]]; then - cat1="$output_dir/category1-complete-match.txt" - cat2="$output_dir/category2-empty-blank.txt" - cat3="$output_dir/category3-partial-match.txt" - cat4="$output_dir/category4-no-match.txt" - > "$cat1" - > "$cat2" - > "$cat3" - > "$cat4" - fi - - # Count total events - local total_events - total_events=$(wc -l < "$state_events_file" | tr -d ' ') - log_info "Processing $total_events state events..." 
- echo "" - - # Process each event - local count=0 - local processed=0 - local skipped=0 - local count_cat1=0 count_cat2=0 count_cat3=0 count_cat4=0 - local start_time - start_time=$(date +%s) - - while IFS= read -r event; do - count=$((count + 1)) - - # Skip empty lines - [[ -z "$event" ]] && continue - - # Process event - local result - if result=$(process_event "$event" "$git_base"); then - processed=$((processed + 1)) - - # Write to TSV (skip header line) - echo "$result" >> "$tsv_file" - - # Categorize if requested - if [[ $do_categorize -eq 1 ]]; then - # Parse result - IFS=$'\t' read -r repo npub state_refs git_refs matches reason <<< "$result" - - local category - category=$(categorize_entry "$state_refs" "$git_refs" "$matches" "$reason") - - local cat_line - cat_line=$(format_category_line "$repo" "$npub" "$state_refs" "$git_refs" "$matches" "$reason") - - case "$category" in - 1) echo "$cat_line" >> "$cat1"; count_cat1=$((count_cat1 + 1)) ;; - 2) echo "$cat_line" >> "$cat2"; count_cat2=$((count_cat2 + 1)) ;; - 3) echo "$cat_line" >> "$cat3"; count_cat3=$((count_cat3 + 1)) ;; - 4) echo "$cat_line" >> "$cat4"; count_cat4=$((count_cat4 + 1)) ;; - esac - fi - else - skipped=$((skipped + 1)) - fi - - # Progress indicator every 10 events - if [[ $((count % 10)) -eq 0 ]]; then - local elapsed=$(($(date +%s) - start_time)) - local rate=0 - if [[ $elapsed -gt 0 ]]; then - rate=$((count / elapsed)) - fi - local eta="?" - if [[ $rate -gt 0 ]]; then - eta=$(( (total_events - count) / rate )) - fi - log_progress "Processed $count/$total_events events (~${rate}/s, ETA: ${eta}s)..." 
- fi - done < "$state_events_file" - - # Clear progress line - echo "" >&2 - - local end_time - end_time=$(date +%s) - local duration=$((end_time - start_time)) - - # Summary - echo "" - log_info "=== Analysis Complete ===" - log_info "Finished: $(date)" - log_info "Duration: ${duration}s" - log_info "Processed: $processed events" - if [[ $skipped -gt 0 ]]; then - log_warn "Skipped: $skipped events (missing identifier or pubkey)" - fi - echo "" - - if [[ $do_categorize -eq 1 ]]; then - # Calculate percentages - local total=$((count_cat1 + count_cat2 + count_cat3 + count_cat4)) - local pct1=0 pct2=0 pct3=0 pct4=0 - if [[ $total -gt 0 ]]; then - pct1=$(awk "BEGIN {printf \"%.1f\", ($count_cat1/$total)*100}") - pct2=$(awk "BEGIN {printf \"%.1f\", ($count_cat2/$total)*100}") - pct3=$(awk "BEGIN {printf \"%.1f\", ($count_cat3/$total)*100}") - pct4=$(awk "BEGIN {printf \"%.1f\", ($count_cat4/$total)*100}") - fi - - log_info "=== Category Summary ===" - log_success "Category 1 (Complete Match): $count_cat1 ($pct1%)" - log_warn "Category 2 (Empty/Blank): $count_cat2 ($pct2%)" - log_warn "Category 3 (Partial Match): $count_cat3 ($pct3%)" - log_error "Category 4 (No Match): $count_cat4 ($pct4%)" - echo "" - - # Validation warning - if [[ $count_cat2 -eq $total ]] && [[ $total -gt 0 ]]; then - log_error "WARNING: 100% of repos categorized as Empty/Blank" - log_error "This usually indicates a permission or path issue." - echo "" - log_info "Troubleshooting:" - echo " 1. Verify git data exists: sudo ls -la $git_base | head -10" - echo " 2. Check sample repo: sudo find $git_base -name '*.git' -type d | head -1" - echo " 3. 
Re-run with sudo if not already using it" - echo "" - fi - fi - - log_info "Output files:" - echo " $tsv_file" - if [[ $do_categorize -eq 1 ]]; then - echo " $cat1" - echo " $cat2" - echo " $cat3" - echo " $cat4" - else - echo "" - log_info "Next step: Run 20-categorize.sh to categorize results" - echo " ./20-categorize.sh $tsv_file $output_dir" - fi -} - -main "$@" diff --git a/docs/how-to/migration-scripts/20-categorize.sh b/docs/how-to/migration-scripts/20-categorize.sh deleted file mode 100755 index b38dc00..0000000 --- a/docs/how-to/migration-scripts/20-categorize.sh +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env bash -# -# 20-categorize.sh - Categorize git sync status into 4 categories -# -# PHASE 3a of the GRASP relay to ngit-grasp migration analysis pipeline. -# Takes git-sync-status.tsv from Phase 2 and categorizes into 4 files. -# -# USAGE: -# ./20-categorize.sh -# -# EXAMPLES: -# ./20-categorize.sh output/prod/git-sync-status.tsv output/prod -# ./20-categorize.sh output/archive/git-sync-status.tsv output/archive -# -# INPUT FORMAT (git-sync-status.tsv): -# Tab-separated values with columns: -# reponpubstate_refsgit_refsmatchesreason -# -# Where reason is optional and can be: no_git_dir, empty_refs, no_state_refs -# -# OUTPUT: -# /category1-complete-match.txt - All refs match perfectly -# /category2-empty-blank.txt - No git data available -# /category3-partial-match.txt - Some refs match -# /category4-no-match.txt - Git exists but refs don't match -# -# OUTPUT FORMAT: -# repo | npub | state_refs=N | git_refs=N | matches=N [| reason=X] -# -# CATEGORIES: -# 1. Complete Match: state_refs == git_refs == matches (all > 0) -# 2. Empty/Blank: git_refs == 0 OR reason in (no_git_dir, empty_refs, no_state_refs) -# 3. Partial Match: matches > 0 AND matches < state_refs -# 4. 
No Match: git_refs > 0 AND matches == 0 -# -# PREREQUISITES: -# - awk (standard Unix tool) -# -# RUNTIME: < 1 second (local processing only) -# -# SEE ALSO: -# docs/how-to/migrate-to-ngit-grasp.md - Full migration guide -# 10-check-git-sync.sh - Phase 2 script that produces input for this script -# - -set -euo pipefail - -# Colors for output (disabled if not a terminal) -if [[ -t 1 ]]; then - RED='\033[0;31m' - GREEN='\033[0;32m' - YELLOW='\033[0;33m' - BLUE='\033[0;34m' - NC='\033[0m' -else - RED='' - GREEN='' - YELLOW='' - BLUE='' - NC='' -fi - -log_info() { - echo -e "${BLUE}[INFO]${NC} $*" >&2 -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $*" >&2 -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $*" >&2 -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $*" >&2 -} - -usage() { - echo "Usage: $0 " - echo "" - echo "Arguments:" - echo " git-sync-status.tsv TSV file from Phase 2 (10-check-git-sync.sh)" - echo " output-dir Directory to store categorized output" - echo "" - echo "Examples:" - echo " $0 output/prod/git-sync-status.tsv output/prod" - echo " $0 output/archive/git-sync-status.tsv output/archive" - echo "" - echo "Input format (TSV):" - echo " reponpubstate_refsgit_refsmatchesreason" - echo "" - echo "Output files:" - echo " category1-complete-match.txt - All refs match" - echo " category2-empty-blank.txt - No git data" - echo " category3-partial-match.txt - Some refs match" - echo " category4-no-match.txt - Git exists, refs don't match" - exit 1 -} - -# Main -main() { - if [[ $# -ne 2 ]]; then - usage - fi - - local input_file="$1" - local output_dir="$2" - - # Validate input file - if [[ ! 
-f "$input_file" ]]; then - log_error "Input file not found: $input_file" - exit 1 - fi - - log_info "Categorizing git sync status" - log_info "Input: $input_file" - log_info "Output: $output_dir" - - # Create output directory - mkdir -p "$output_dir" - - # Output files - local cat1="$output_dir/category1-complete-match.txt" - local cat2="$output_dir/category2-empty-blank.txt" - local cat3="$output_dir/category3-partial-match.txt" - local cat4="$output_dir/category4-no-match.txt" - - # Clear previous results - > "$cat1" - > "$cat2" - > "$cat3" - > "$cat4" - - # Process input file with awk - # Input: reponpubstate_refsgit_refsmatchesreason - awk -F'\t' -v cat1="$cat1" -v cat2="$cat2" -v cat3="$cat3" -v cat4="$cat4" ' - BEGIN { - count1 = 0; count2 = 0; count3 = 0; count4 = 0 - } - NR == 1 && /^repo/ { next } # Skip header if present - NF >= 5 { - repo = $1 - npub = $2 - state_refs = int($3) - git_refs = int($4) - matches = int($5) - reason = (NF >= 6) ? $6 : "" - - # Format output line - if (reason != "") { - line = repo " | " npub " | state_refs=" state_refs " | git_refs=" git_refs " | matches=" matches " | reason=" reason - } else { - line = repo " | " npub " | state_refs=" state_refs " | git_refs=" git_refs " | matches=" matches - } - - # Categorize - if (reason == "no_git_dir" || reason == "empty_refs" || reason == "no_state_refs" || git_refs == 0) { - # Category 2: Empty/Blank - print line >> cat2 - count2++ - } else if (state_refs > 0 && state_refs == git_refs && matches == state_refs) { - # Category 1: Complete Match - print line >> cat1 - count1++ - } else if (matches > 0 && matches < state_refs) { - # Category 3: Partial Match - print line >> cat3 - count3++ - } else if (git_refs > 0 && matches == 0) { - # Category 4: No Match - print line >> cat4 - count4++ - } else if (matches > 0) { - # Edge case: matches > 0 but does not fit other categories - # This can happen when git_refs > state_refs but all state refs match - # Treat as partial match - print line 
>> cat3 - count3++ - } else { - # Fallback: treat as category 2 (empty/blank) - print line >> cat2 - count2++ - } - } - END { - total = count1 + count2 + count3 + count4 - print "COUNTS:" count1 ":" count2 ":" count3 ":" count4 ":" total - } - ' "$input_file" 2>&1 | while IFS= read -r line; do - if [[ "$line" =~ ^COUNTS: ]]; then - # Parse counts from awk output - IFS=':' read -r _ c1 c2 c3 c4 total <<< "$line" - - echo "" - log_info "=== Categorization Summary ===" - log_info "Total entries: $total" - log_success "Category 1 (Complete Match): $c1" - log_warn "Category 2 (Empty/Blank): $c2" - log_warn "Category 3 (Partial Match): $c3" - log_error "Category 4 (No Match): $c4" - echo "" - log_info "Output files:" - echo " $cat1" - echo " $cat2" - echo " $cat3" - echo " $cat4" - fi - done -} - -main "$@" diff --git a/docs/how-to/migration-scripts/21-compare-relays.sh b/docs/how-to/migration-scripts/21-compare-relays.sh deleted file mode 100755 index b9c0d30..0000000 --- a/docs/how-to/migration-scripts/21-compare-relays.sh +++ /dev/null @@ -1,294 +0,0 @@ -#!/usr/bin/env bash -# -# 21-compare-relays.sh - Compare prod vs archive category files to find gaps -# -# PHASE 3b of the GRASP relay to ngit-grasp migration analysis pipeline. 
# Compares categorized output from prod and archive to identify:
#   - Repos complete in prod but missing/incomplete in archive
#   - Repos in archive but not in prod
#   - Status differences between relays
#
# USAGE:
#   ./21-compare-relays.sh <prod-dir> <archive-dir> <output-dir>
#
# EXAMPLES:
#   ./21-compare-relays.sh output/prod output/archive output/comparison
#
# INPUT:
#   Both prod-dir and archive-dir must contain:
#     - category1-complete-match.txt
#     - category2-empty-blank.txt
#     - category3-partial-match.txt
#     - category4-no-match.txt
#
# OUTPUT:
#   <output-dir>/complete-in-both.txt                 - Repos complete in both relays (no action)
#   <output-dir>/complete-prod-missing-archive.txt    - Complete in prod, not in archive cat1
#   <output-dir>/complete-prod-incomplete-archive.txt - Complete in prod, incomplete in archive
#   <output-dir>/incomplete-in-both.txt               - Incomplete in both relays
#   <output-dir>/in-archive-not-prod.txt              - In archive but not in prod
#   <output-dir>/summary.txt                          - Human-readable summary
#
# OUTPUT FORMAT:
#   Each file contains lines in the format:
#     repo | npub | prod_status | archive_status
#
# PREREQUISITES:
#   - awk, sort, comm (standard Unix tools)
#
# RUNTIME: < 1 second (local processing only)
#
# SEE ALSO:
#   docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
#   20-categorize.sh - Phase 3a script that produces input for this script
#

set -euo pipefail

# Colors for log output. Every log_* helper writes to stderr, so colour is
# enabled only when stderr (fd 2) is a terminal; this keeps escape codes out
# of redirected log files.
if [[ -t 2 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BLUE=''
  NC=''
fi

# Logging helpers: one tagged line on stderr each.
log_info() {
  echo -e "${BLUE}[INFO]${NC} $*" >&2
}

log_success() {
  echo -e "${GREEN}[OK]${NC} $*" >&2
}

log_warn() {
  echo -e "${YELLOW}[WARN]${NC} $*" >&2
}

log_error() {
  echo -e "${RED}[ERROR]${NC} $*" >&2
}

# Print the command-line help on stdout and exit with status 1.
usage() {
  echo "Usage: $0 <prod-dir> <archive-dir> <output-dir>"
  echo ""
  echo "Arguments:"
  echo " prod-dir Directory containing prod category files"
  echo " archive-dir Directory containing archive category files"
  echo " output-dir Directory to store comparison results"
  echo ""
  echo "Examples:"
  echo " $0 output/prod output/archive output/comparison"
  echo ""
  echo "Required input files in each directory:"
  echo " category1-complete-match.txt"
  echo " category2-empty-blank.txt"
  echo " category3-partial-match.txt"
  echo " category4-no-match.txt"
  exit 1
}

# Extract repo|npub key from category line
# Input (stdin): "repo | npub | state_refs=N | ..."
# Output:        "repo|npub"
extract_key() {
  awk -F' \\| ' '{print $1 "|" $2}'
}

# Build lookup table from category files
# Args: $1=directory, $2=output_file
# Output file lines: repo|npub|catN|<original category line>, sorted on the
# first two '|'-separated fields.
build_lookup() {
  local dir="$1"
  local output="$2"
  local n f
  local -a files

  # Process all 4 category files
  for n in 1 2 3 4; do
    files=("$dir"/category"${n}"-*.txt)
    for f in "${files[@]}"; do
      # An unmatched glob stays literal; skip it instead of parsing `ls`.
      [[ -f "$f" ]] || continue
      # One awk pass per file replaces a per-line `echo | awk` subshell.
      awk -F' \\| ' -v c="cat${n}" '{ print $1 "|" $2 "|" c "|" $0 }' "$f"
    done
  done | sort -t'|' -k1,2 > "$output"
}

# Print the category (third field) of the first lookup entry whose repo and
# npub fields are exactly equal to the given values; print nothing if absent.
# Args: $1=lookup_file, $2=repo, $3=npub
# The comparison is a plain string comparison, so characters such as "." in a
# repo name are never interpreted as regular-expression operators.
lookup_category() {
  local lookup_file="$1"
  LOOKUP_REPO="$2" LOOKUP_NPUB="$3" awk -F'|' '
    $1 == ENVIRON["LOOKUP_REPO"] && $2 == ENVIRON["LOOKUP_NPUB"] { print $3; exit }
  ' "$lookup_file"
}

# Main
main() {
  if [[ $# -ne 3 ]]; then
    usage
  fi

  local prod_dir="$1"
  local archive_dir="$2"
  local output_dir="$3"
  local dir

  # Validate input directories
  for dir in "$prod_dir" "$archive_dir"; do
    if [[ ! -d "$dir" ]]; then
      log_error "Directory not found: $dir"
      exit 1
    fi
    if [[ ! -f "$dir/category1-complete-match.txt" ]]; then
      log_error "Missing category1-complete-match.txt in $dir"
      exit 1
    fi
  done

  log_info "Comparing relay categories"
  log_info "Prod: $prod_dir"
  log_info "Archive: $archive_dir"
  log_info "Output: $output_dir"

  # Create output directory
  mkdir -p "$output_dir"

  # Create temp files for processing
  local tmp_dir
  tmp_dir=$(mktemp -d)
  # Expand $tmp_dir now: it is a local and is out of scope when EXIT fires.
  # shellcheck disable=SC2064
  trap "rm -rf '$tmp_dir'" EXIT

  log_info "Building lookup tables..."

  # Build lookup tables: key|category|full_line
  local prod_lookup="$tmp_dir/prod_lookup.txt"
  local archive_lookup="$tmp_dir/archive_lookup.txt"
  build_lookup "$prod_dir" "$prod_lookup"
  build_lookup "$archive_dir" "$archive_lookup"

  # Extract just keys for comparison
  cut -d'|' -f1,2 "$prod_lookup" | sort -u > "$tmp_dir/prod_keys.txt"
  cut -d'|' -f1,2 "$archive_lookup" | sort -u > "$tmp_dir/archive_keys.txt"

  log_info "Comparing categories..."

  # Initialize output files
  : > "$output_dir/complete-in-both.txt"
  : > "$output_dir/complete-prod-missing-archive.txt"
  : > "$output_dir/complete-prod-incomplete-archive.txt"
  : > "$output_dir/incomplete-in-both.txt"
  : > "$output_dir/in-archive-not-prod.txt"

  local repo npub archive_cat prod_cat

  # Process prod category 1 (complete) entries
  while IFS='|' read -r repo npub _; do
    archive_cat=$(lookup_category "$archive_lookup" "$repo" "$npub")

    if [[ -z "$archive_cat" ]]; then
      # Not in archive at all
      printf '%s | %s | prod=complete | archive=missing\n' "$repo" "$npub" \
        >> "$output_dir/complete-prod-missing-archive.txt"
    elif [[ "$archive_cat" == "cat1" ]]; then
      # Complete in both
      printf '%s | %s | prod=complete | archive=complete\n' "$repo" "$npub" \
        >> "$output_dir/complete-in-both.txt"
    else
      # Complete in prod, incomplete in archive
      printf '%s | %s | prod=complete | archive=%s\n' "$repo" "$npub" "$archive_cat" \
        >> "$output_dir/complete-prod-incomplete-archive.txt"
    fi
  done < <(awk -F'|' '$3 == "cat1"' "$prod_lookup")

  # Process prod categories 2-4 (incomplete) entries, one category at a time
  # so the output keeps its cat2, cat3, cat4 ordering.
  for prod_cat in cat2 cat3 cat4; do
    while IFS='|' read -r repo npub _; do
      archive_cat=$(lookup_category "$archive_lookup" "$repo" "$npub")

      if [[ -z "$archive_cat" ]]; then
        # Incomplete in prod, missing in archive
        printf '%s | %s | prod=%s | archive=missing\n' "$repo" "$npub" "$prod_cat" \
          >> "$output_dir/incomplete-in-both.txt"
      elif [[ "$archive_cat" != "cat1" ]]; then
        # Incomplete in both
        printf '%s | %s | prod=%s | archive=%s\n' "$repo" "$npub" "$prod_cat" "$archive_cat" \
          >> "$output_dir/incomplete-in-both.txt"
      fi
      # If archive is complete but prod is not, that's unusual but not an
      # error; such repos are deliberately written to no output file.
    done < <(awk -F'|' -v c="$prod_cat" '$3 == c' "$prod_lookup")
  done

  # Find entries in archive but not in prod
  while IFS='|' read -r repo npub; do
    archive_cat=$(lookup_category "$archive_lookup" "$repo" "$npub")
    printf '%s | %s | prod=missing | archive=%s\n' "$repo" "$npub" "$archive_cat" \
      >> "$output_dir/in-archive-not-prod.txt"
  done < <(comm -23 "$tmp_dir/archive_keys.txt" "$tmp_dir/prod_keys.txt")

  # Count results
  local count_both count_missing count_incomplete count_both_incomplete count_archive_only
  count_both=$(wc -l < "$output_dir/complete-in-both.txt" | tr -d ' ')
  count_missing=$(wc -l < "$output_dir/complete-prod-missing-archive.txt" | tr -d ' ')
  count_incomplete=$(wc -l < "$output_dir/complete-prod-incomplete-archive.txt" | tr -d ' ')
  count_both_incomplete=$(wc -l < "$output_dir/incomplete-in-both.txt" | tr -d ' ')
  count_archive_only=$(wc -l < "$output_dir/in-archive-not-prod.txt" | tr -d ' ')

  # Generate summary
  cat > "$output_dir/summary.txt" << EOF
# Relay Comparison Summary
Generated: $(date -Iseconds)

## Input
- Prod: $prod_dir
- Archive: $archive_dir

## Results

### No Action Required
- Complete in both relays: $count_both

### Action/Decision Required
- Complete in prod, MISSING from archive: $count_missing
- Complete in prod, INCOMPLETE in archive: $count_incomplete
- Incomplete in BOTH relays: $count_both_incomplete

### For Reference
- In archive but not in prod: $count_archive_only

## Files
- complete-in-both.txt: Repos successfully migrated (no action)
- complete-prod-missing-archive.txt: Need investigation - why not in archive?
- complete-prod-incomplete-archive.txt: Archive sync may still be in progress
- incomplete-in-both.txt: Git data incomplete on both relays
- in-archive-not-prod.txt: May be deleted from prod or new to archive

## Next Steps
1. Review complete-prod-missing-archive.txt - these repos need attention
2. Check if archive sync is still running for incomplete entries
3. Cross-reference with deletion events (kind 5) from Phase 1
4. Use Phase 4 logs to understand parse failures and purgatory expiry
EOF

  # Display summary
  echo ""
  log_info "=== Comparison Summary ==="
  log_success "Complete in both: $count_both (no action needed)"
  log_error "Complete in prod, MISSING from archive: $count_missing"
  log_warn "Complete in prod, incomplete in archive: $count_incomplete"
  log_warn "Incomplete in both: $count_both_incomplete"
  log_info "In archive only: $count_archive_only"
  echo ""
  log_info "Output files:"
  echo " $output_dir/complete-in-both.txt"
  echo " $output_dir/complete-prod-missing-archive.txt"
  echo " $output_dir/complete-prod-incomplete-archive.txt"
  echo " $output_dir/incomplete-in-both.txt"
  echo " $output_dir/in-archive-not-prod.txt"
  echo " $output_dir/summary.txt"
}

main "$@"

# ---------------------------------------------------------------------------
# Patch file boundary (metadata kept from the enclosing patch):
#   diff --git a/docs/how-to/migration-scripts/22-compare-git-data.sh b/docs/how-to/migration-scripts/22-compare-git-data.sh
#   deleted file mode 100755
#   index 76521d4..0000000
#   --- a/docs/how-to/migration-scripts/22-compare-git-data.sh
#   +++ /dev/null
#   @@ -1,390 +0,0 @@
# ---------------------------------------------------------------------------
#!/usr/bin/env bash
#
# 22-compare-git-data.sh - Compare actual git data between prod and archive relays
#
# PHASE 3c of the GRASP relay to ngit-grasp migration analysis pipeline.
# Compares actual git commits between prod and archive to determine which is ahead.
#
# KEY INSIGHT:
#   Archive (ngit-grasp) enforces GRASP - git data ALWAYS matches a state event.
#   If archive has different/newer data than prod, it means:
#   - A state event authorized those commits at some point
#   - Archive is actually MORE up-to-date than prod
#   - Migration should use archive data (it's already correct)
#
# USAGE:
#   ./22-compare-git-data.sh <prod-git-base> <archive-git-base> <repo-list> <output-dir>
#
# EXAMPLES:
#   ./22-compare-git-data.sh /var/lib/grasp-relay/git /var/lib/ngit-grasp/git \
#     output/comparison/complete-prod-incomplete-archive.txt output/comparison
#
# INPUT:
#   prod-git-base     Base directory for prod git repos (e.g., /var/lib/grasp-relay/git)
#   archive-git-base  Base directory for archive git repos (e.g., /var/lib/ngit-grasp/git)
#   repo-list         File with repos to compare (format: "repo | npub | ...")
#
# OUTPUT:
#   <output-dir>/git-ancestry.tsv - Tab-separated values:
#     repo<TAB>npub<TAB>relationship<TAB>details
#
#   Relationship values:
#     archive-ahead - Archive has all prod commits plus more (GOOD - use archive)
#     in-sync       - Both have identical commits
#     prod-ahead    - Prod has commits archive is missing (needs re-sync)
#     diverged      - Both have unique commits (manual review)
#     archive-only  - Only archive has git data
#     prod-only     - Only prod has git data
#     both-empty    - Neither has git data
#
# PREREQUISITES:
#   - git (for ref comparison)
#   - Read access to both git directories (may need sudo)
#
# RUNTIME: Depends on number of repos to compare
#
# SEE ALSO:
#   docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
#   21-compare-relays.sh - Phase 3b script that identifies repos to compare
#

set -euo pipefail

# Colors for log output. Every log_* helper writes to stderr, so colour is
# enabled only when stderr (fd 2) is a terminal.
if [[ -t 2 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BLUE=''
  NC=''
fi

# Logging helpers: one tagged line on stderr each.
log_info() {
  echo -e "${BLUE}[INFO]${NC} $*" >&2
}

log_success() {
  echo -e "${GREEN}[OK]${NC} $*" >&2
}

log_warn() {
  echo -e "${YELLOW}[WARN]${NC} $*" >&2
}

log_error() {
  echo -e "${RED}[ERROR]${NC} $*" >&2
}

# Rewrite the current stderr line in place (carriage return, no newline).
log_progress() {
  echo -ne "\r${BLUE}[PROGRESS]${NC} $*" >&2
}

# Print the command-line help on stdout and exit with status 1.
usage() {
  echo "Usage: $0 <prod-git-base> <archive-git-base> <repo-list> <output-dir>"
  echo ""
  echo "Arguments:"
  echo " prod-git-base Base directory for prod git repos"
  echo " archive-git-base Base directory for archive git repos"
  echo " repo-list File with repos to compare (format: 'repo | npub | ...')"
  echo " output-dir Directory to store output files"
  echo ""
  echo "Examples:"
  echo " $0 /var/lib/grasp-relay/git /var/lib/ngit-grasp/git \\"
  echo " output/comparison/complete-prod-incomplete-archive.txt output/comparison"
  echo ""
  echo "Output:"
  echo " git-ancestry.tsv - TSV with: repo, npub, relationship, details"
  exit 1
}

# Get all branch refs from a git directory
# Args: $1=git_dir
# Returns: sorted "commit_hash ref_name" lines (git show-ref order of fields);
#          nothing if the directory is missing or git fails.
get_git_refs() {
  local git_dir="$1"

  if [[ ! -d "$git_dir" ]]; then
    return
  fi

  git --git-dir="$git_dir" show-ref --heads 2>/dev/null | sort || true
}

# Check if commit A is ancestor of commit B
# Args: $1=git_dir, $2=commit_a, $3=commit_b
# Returns: 0 if A is ancestor of B; non-zero otherwise, including when either
#          commit does not exist in git_dir.
is_ancestor() {
  local git_dir="$1"
  local commit_a="$2"
  local commit_b="$3"

  git --git-dir="$git_dir" merge-base --is-ancestor "$commit_a" "$commit_b" 2>/dev/null
}

# Check ancestry in whichever of the two repositories can answer.
# Args: $1=first_git_dir, $2=second_git_dir, $3=commit_a, $4=commit_b
# Returns: 0 if A is an ancestor of B according to either repository.
# The newer commit only exists in the repository that is ahead, so a check in
# a single fixed repository cannot detect both directions.
is_ancestor_in_either() {
  is_ancestor "$1" "$3" "$4" || is_ancestor "$2" "$3" "$4"
}

# Print the commit hash recorded for an exact ref name in show-ref output.
# Args: $1=refs ("hash ref" lines), $2=ref_name
# Uses string equality, so "." or other regex characters in a branch name
# cannot match a different branch.
ref_hash_for() {
  local refs="$1"
  REF_NAME="$2" awk '$2 == ENVIRON["REF_NAME"] { print $1; exit }' <<< "$refs"
}

# Compare git data between prod and archive for a single repo
# Args: $1=prod_git_dir, $2=archive_git_dir
# Returns (stdout): one relationship string, see the file header for values
compare_repo_git() {
  local prod_git="$1"
  local archive_git="$2"

  local prod_exists=false
  local archive_exists=false

  if [[ -d "$prod_git" ]]; then
    prod_exists=true
  fi
  if [[ -d "$archive_git" ]]; then
    archive_exists=true
  fi

  # Handle cases where one or both don't exist
  if [[ "$prod_exists" == "false" && "$archive_exists" == "false" ]]; then
    echo "both-empty"
    return
  fi

  if [[ "$prod_exists" == "false" ]]; then
    echo "archive-only"
    return
  fi

  if [[ "$archive_exists" == "false" ]]; then
    echo "prod-only"
    return
  fi

  # Both exist - get refs
  local prod_refs archive_refs
  prod_refs=$(get_git_refs "$prod_git")
  archive_refs=$(get_git_refs "$archive_git")

  # Handle empty refs
  if [[ -z "$prod_refs" && -z "$archive_refs" ]]; then
    echo "both-empty"
    return
  fi

  if [[ -z "$prod_refs" ]]; then
    echo "archive-only"
    return
  fi

  if [[ -z "$archive_refs" ]]; then
    echo "prod-only"
    return
  fi

  # Compare refs - check if they're identical
  if [[ "$prod_refs" == "$archive_refs" ]]; then
    echo "in-sync"
    return
  fi

  # Refs differ - need to check ancestry
  # Strategy: For each branch, check if one is ancestor of the other
  # If all archive branches are ahead of or equal to prod branches, archive is ahead
  # If all prod branches are ahead of or equal to archive branches, prod is ahead
  # Otherwise, they've diverged

  local archive_ahead=true
  local prod_ahead=true
  local has_common_branch=false
  local prod_hash prod_ref archive_hash archive_ref

  # Check each prod branch against archive
  while read -r prod_hash prod_ref; do
    [[ -z "$prod_hash" ]] && continue

    # Get the same branch from archive
    archive_hash=$(ref_hash_for "$archive_refs" "$prod_ref")

    if [[ -z "$archive_hash" ]]; then
      # Branch exists in prod but not archive - prod has something archive doesn't
      # But this could be a deleted branch, so don't immediately say prod is ahead
      continue
    fi

    has_common_branch=true

    if [[ "$prod_hash" == "$archive_hash" ]]; then
      # Same commit - neither ahead for this branch
      continue
    fi

    # Different commits - check ancestry in both object stores. When prod is
    # ahead, its tip commit is absent from the archive repository, so the
    # question has to be asked of the prod repository as well.
    if is_ancestor_in_either "$archive_git" "$prod_git" "$prod_hash" "$archive_hash"; then
      # Prod commit is ancestor of archive commit - archive is ahead for this branch
      prod_ahead=false
    elif is_ancestor_in_either "$prod_git" "$archive_git" "$archive_hash" "$prod_hash"; then
      # Archive commit is ancestor of prod commit - prod is ahead for this branch
      archive_ahead=false
    else
      # Neither is ancestor - diverged
      archive_ahead=false
      prod_ahead=false
    fi
  done <<< "$prod_refs"

  # Also check for branches only in archive (archive has extra branches)
  while read -r archive_hash archive_ref; do
    [[ -z "$archive_hash" ]] && continue

    prod_hash=$(ref_hash_for "$prod_refs" "$archive_ref")

    if [[ -z "$prod_hash" ]]; then
      # Branch exists in archive but not prod - archive has something prod doesn't
      # This means archive is ahead (has extra branches)
      prod_ahead=false
    fi
  done <<< "$archive_refs"

  # Determine final relationship
  if [[ "$has_common_branch" == "false" ]]; then
    # No common branches - completely different
    echo "diverged"
    return
  fi

  if [[ "$archive_ahead" == "true" && "$prod_ahead" == "false" ]]; then
    echo "archive-ahead"
  elif [[ "$prod_ahead" == "true" && "$archive_ahead" == "false" ]]; then
    echo "prod-ahead"
  elif [[ "$archive_ahead" == "true" && "$prod_ahead" == "true" ]]; then
    # Both true means all common branches are identical
    # But one might have extra branches
    echo "in-sync"
  else
    echo "diverged"
  fi
}

# Main
main() {
  if [[ $# -ne 4 ]]; then
    usage
  fi

  local prod_git_base="$1"
  local archive_git_base="$2"
  local repo_list="$3"
  local output_dir="$4"

  # Validate inputs
  if [[ ! -d "$prod_git_base" ]]; then
    log_error "Prod git base directory not found: $prod_git_base"
    exit 1
  fi

  if [[ ! -d "$archive_git_base" ]]; then
    log_error "Archive git base directory not found: $archive_git_base"
    exit 1
  fi

  if [[ ! -f "$repo_list" ]]; then
    log_error "Repo list file not found: $repo_list"
    exit 1
  fi

  log_info "=== Git Data Comparison ==="
  log_info "Prod git base: $prod_git_base"
  log_info "Archive git base: $archive_git_base"
  log_info "Repo list: $repo_list"
  log_info "Output: $output_dir"
  log_info "Started: $(date)"
  echo ""

  # Create output directory
  mkdir -p "$output_dir"

  # Output file
  local tsv_file="$output_dir/git-ancestry.tsv"

  # Initialize TSV with header
  printf 'repo\tnpub\trelationship\tdetails\n' > "$tsv_file"

  # Count repos. grep -c prints "0" AND exits 1 when nothing matches, so the
  # fallback must not print a second "0".
  local total_repos
  total_repos=$(grep -c -v '^#' "$repo_list" 2>/dev/null) || true
  total_repos="${total_repos:-0}"
  log_info "Processing $total_repos repos..."
  echo ""

  # Counters
  local count=0
  local count_archive_ahead=0
  local count_in_sync=0
  local count_prod_ahead=0
  local count_diverged=0
  local count_archive_only=0
  local count_prod_only=0
  local count_both_empty=0

  local repo npub prod_git archive_git relationship
  local details=""

  # Process each repo (the || clause keeps a final line without a newline)
  while IFS='|' read -r repo npub _ || [[ -n "$repo" ]]; do
    # Skip comments and empty lines
    [[ "$repo" =~ ^# ]] && continue
    [[ -z "$repo" ]] && continue

    # Clean up whitespace
    repo="${repo// /}"
    npub="${npub// /}"

    [[ -z "$repo" || -z "$npub" ]] && continue

    count=$((count + 1))

    # Build git paths
    prod_git="$prod_git_base/${npub}/${repo}.git"
    archive_git="$archive_git_base/${npub}/${repo}.git"

    # Compare
    relationship=$(compare_repo_git "$prod_git" "$archive_git")

    # Count by relationship
    case "$relationship" in
      archive-ahead) count_archive_ahead=$((count_archive_ahead + 1)) ;;
      in-sync) count_in_sync=$((count_in_sync + 1)) ;;
      prod-ahead) count_prod_ahead=$((count_prod_ahead + 1)) ;;
      diverged) count_diverged=$((count_diverged + 1)) ;;
      archive-only) count_archive_only=$((count_archive_only + 1)) ;;
      prod-only) count_prod_only=$((count_prod_only + 1)) ;;
      both-empty) count_both_empty=$((count_both_empty + 1)) ;;
    esac

    # Output TSV line
    printf '%s\t%s\t%s\t%s\n' "$repo" "$npub" "$relationship" "$details" >> "$tsv_file"

    # Progress indicator every 10 repos
    if [[ $((count % 10)) -eq 0 ]]; then
      log_progress "Processed $count/$total_repos repos..."
    fi
  done < "$repo_list"

  # Clear progress line
  echo "" >&2

  # Summary
  echo ""
  log_info "=== Comparison Summary ==="
  log_success "Archive ahead (use archive data): $count_archive_ahead"
  log_success "In sync: $count_in_sync"
  log_warn "Prod ahead (needs re-sync): $count_prod_ahead"
  log_error "Diverged (manual review): $count_diverged"
  log_info "Archive only: $count_archive_only"
  log_info "Prod only: $count_prod_only"
  log_info "Both empty: $count_both_empty"
  echo ""
  log_info "Total: $count repos"
  log_info "Output: $tsv_file"
}

main "$@"

# ---------------------------------------------------------------------------
# Patch file boundary (metadata kept from the enclosing patch):
#   diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
#   deleted file mode 100755
#   index d762aae..0000000
#   --- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh
#   +++ /dev/null
#   @@ -1,774 +0,0 @@
# ---------------------------------------------------------------------------
#!/usr/bin/env bash
#
# 30-extract-parse-failures.sh - Extract parse failure events from systemd logs
#
# PHASE 4a of the GRASP relay to ngit-grasp migration analysis pipeline.
# Extracts structured [PARSE_FAIL] log entries AND "Invalid announcement"
# rejections from journalctl.
#
# USAGE:
#   ./30-extract-parse-failures.sh <service-name> <output-dir> [options]
#
# EXAMPLES:
#   # Extract from ngit-grasp service (last 30 days, default)
#   ./30-extract-parse-failures.sh ngit-grasp.service output/logs
#
#   # Extract with custom time range
#   ./30-extract-parse-failures.sh ngit-grasp.service output/logs --since "2026-01-01"
#
#   # Extract from specific time window
#   ./30-extract-parse-failures.sh ngit-grasp.service output/logs --since "2026-01-15" --until "2026-01-22"
#
# OPTIONS:
#   --since <date>   Start date for log extraction (default: 30 days ago)
#   --until <date>   End date for log extraction (default: now)
#   --dry-run        Show what would be extracted without writing files
#
# ENRICHMENT:
#   The script automatically enriches parse failures with repo/npub information
#   by extracting from "Added rejected announcement" log entries which include
#   pubkey and identifier fields. Hex pubkeys are converted to npub format using
#   `nak encode npub <hex>` if the nak tool is available.
#
# OUTPUT:
#   <output-dir>/parse-failures.txt
#
# OUTPUT FORMAT (TSV):
#   event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
#
# EXPECTED LOG FORMATS:
#   The script looks for three types of log entries:
#
#   1. Structured [PARSE_FAIL] entries:
#      2026-01-22T10:30:45Z ngit-grasp[1234]: [PARSE_FAIL] kind=30618 event_id=abc123... reason="invalid refs format" repo=myrepo npub=npub1...
#
#   2. "Invalid announcement" rejections (write policy):
#      Event rejected by write policy event_id=abc123... relay=wss://... kind=30617 reason=Invalid announcement: multiple clone tags found...
#
#   3. "Added rejected announcement" entries (for enrichment):
#      Added rejected announcement to two-tier index event_id=abc123... kind=30617 identifier=myrepo pubkey=hex...
#      These entries provide pubkey and identifier for enriching write policy rejections.
#
#   NOTE: Builder logs ("Rejected repository announcement note1xxx:") are NOT extracted
#   because they use bech32 (note1) IDs while write policy logs use hex IDs. Extracting
#   both would cause double-counting since deduplication only works within each format.
#   Write policy logs contain the same events, so we don't lose any data.
#
#   Required fields: kind, event_id, reason
#   Enrichment fields: repo (identifier), npub (converted from hex pubkey)
#
# DEPENDENCY:
#   This script requires logging improvements in ngit-grasp to emit structured
#   [PARSE_FAIL] log entries. Until those are implemented, this script will
#   find no matching entries (which is handled gracefully).
#
#   "Invalid announcement" rejections are logged by the write policy and
#   should be present in any ngit-grasp deployment.
#
#   See: docs/how-to/migrate-to-ngit-grasp.md (Dependencies section)
#
#   Expected Rust logging code for [PARSE_FAIL]:
#     tracing::warn!(
#       target: "migration",
#       "[PARSE_FAIL] kind={} event_id={} reason=\"{}\" repo={} npub={}",
#       event.kind, event.id, reason, identifier, npub
#     );
#
# PREREQUISITES:
#   - journalctl (systemd)
#   - grep, awk, sed (standard Unix tools)
#   - Access to systemd journal (may require sudo or journal group membership)
#
# RUNTIME: Depends on log volume, typically < 30 seconds
#
# SEE ALSO:
#   docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
#   31-extract-purgatory-expiry.sh - Companion script for purgatory expiry logs
#

set -euo pipefail

# Get script directory for sourcing helpers
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Source the service validation helper. It is optional: when the file is not
# next to this script, nothing is sourced here.
if [[ -f "$SCRIPT_DIR/validate-service.sh" ]]; then
  # shellcheck source=/dev/null
  source "$SCRIPT_DIR/validate-service.sh"
fi

# Colors for log output. Every log_* helper writes to stderr, so colour is
# enabled only when stderr (fd 2) is a terminal; this keeps escape codes out
# of redirected log files.
if [[ -t 2 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BLUE=''
  NC=''
fi

# Logging helpers: one tagged line on stderr each.
log_info() {
  echo -e "${BLUE}[INFO]${NC} $*" >&2
}

log_success() {
  echo -e "${GREEN}[OK]${NC} $*" >&2
}

log_warn() {
  echo -e "${YELLOW}[WARN]${NC} $*" >&2
}

log_error() {
  echo -e "${RED}[ERROR]${NC} $*" >&2
}

# Print the command-line help on stdout and exit with status 1.
usage() {
  echo "Usage: $0 <service-name> <output-dir> [options]"
  echo ""
  echo "Arguments:"
  echo " service-name Systemd service name (e.g., ngit-grasp.service)"
  echo " output-dir Directory to store extracted log data"
  echo ""
  echo "Options:"
  echo " --since <date> Start date (default: 30 days ago)"
  echo " --until <date> End date (default: now)"
  echo " --dry-run Show what would be extracted without writing"
  echo ""
  echo "Examples:"
  echo " $0 ngit-grasp.service output/logs"
  echo " $0 ngit-grasp.service output/logs --since '2026-01-01'"
  echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'"
  echo ""
  echo "Expected log formats:"
  echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..."
  echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..."
  echo ""
  echo "Enrichment:"
  echo " Parse failures are automatically enriched with repo/npub from"
  echo " 'Added rejected announcement' log entries. Hex pubkeys are converted"
  echo " to npub format using 'nak encode npub' if available."
  exit 1
}

# =============================================================================
# AWK-BASED BATCH PARSING FUNCTIONS
# =============================================================================
# These functions use awk for efficient batch processing instead of per-line
# grep calls. This provides ~400x speedup for large log files.
#
# NOTE: parse_builder_rejection_line() was removed to fix double-counting bug.
# Builder logs use bech32 (note1) IDs while write policy logs use hex IDs.
# Since deduplication only works within each format, extracting both caused
# the same event to be counted twice. Write policy logs contain the same
# events, so we don't lose any data by only extracting from that source.

# Parse [PARSE_FAIL] log lines in batch using awk
# Input: file containing log lines with [PARSE_FAIL]
# Output: TSV lines: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
# Lines missing kind, event_id or a non-empty quoted reason are dropped.
# Uses only POSIX awk (match/RSTART/RLENGTH), so it runs on gawk, mawk and
# BSD awk alike.
parse_parse_fail_batch() {
  local input_file="$1"
  awk '
    # Text matched by re, minus its first "skip" characters (the key= prefix)
    function grab(line, re, skip) {
      if (match(line, re)) return substr(line, RSTART + skip, RLENGTH - skip)
      return ""
    }
    {
      # Extract kind=VALUE
      kind = grab($0, "kind=[0-9]+", 5)

      # Extract event_id=VALUE (hex string)
      event_id = grab($0, "event_id=[a-f0-9]+", 9)

      # Extract reason="VALUE" (quoted string, quotes removed)
      reason = ""
      if (match($0, /reason="[^"]*"/)) reason = substr($0, RSTART + 8, RLENGTH - 9)

      # Extract repo=VALUE (optional)
      repo = grab($0, "repo=[^ ]+", 5)

      # Extract npub=VALUE (optional)
      npub = grab($0, "npub=[^ ]+", 5)

      # Output if we have required fields
      if (kind != "" && event_id != "" && reason != "") {
        print event_id "\t" kind "\t" reason "\t" repo "\t" npub
      }
    }
  ' "$input_file"
}

# Parse "Invalid announcement" rejection log lines in batch using awk
# Input: file containing "Event rejected by write policy" log lines
# Output: TSV lines: event_id<TAB>kind<TAB>reason<TAB><TAB>
#         (repo and npub columns are left empty for later enrichment)
parse_write_policy_rejection_batch() {
  local input_file="$1"
  awk '
    # Text matched by re, minus its first "skip" characters (the key= prefix)
    function grab(line, re, skip) {
      if (match(line, re)) return substr(line, RSTART + skip, RLENGTH - skip)
      return ""
    }
    {
      # Extract event_id=VALUE (hex string)
      event_id = grab($0, "event_id=[a-f0-9]+", 9)

      # Extract kind=VALUE
      kind = grab($0, "kind=[0-9]+", 5)

      # Extract reason=VALUE (everything after the first "reason=")
      reason = ""
      if (match($0, /reason=.*$/)) reason = substr($0, RSTART + 7)

      # Output if we have required fields (repo and npub are empty)
      if (kind != "" && event_id != "" && reason != "") {
        print event_id "\t" kind "\t" reason "\t\t"
      }
    }
  ' "$input_file"
}

# Parse "Added rejected announcement" log lines in batch using awk
# Input: file containing "Added rejected announcement to two-tier index" log lines
# Output: TSV lines: event_id<TAB>identifier<TAB>pubkey_hex
parse_rejected_announcement_batch() {
  local input_file="$1"
  awk '
    # Text matched by re, minus its first "skip" characters (the key= prefix)
    function grab(line, re, skip) {
      if (match(line, re)) return substr(line, RSTART + skip, RLENGTH - skip)
      return ""
    }
    {
      # Extract event_id=VALUE (hex string)
      event_id = grab($0, "event_id=[a-f0-9]+", 9)

      # Extract identifier=VALUE (repo name)
      identifier = grab($0, "identifier=[^ ]+", 11)

      # Extract pubkey=VALUE (hex string)
      pubkey = grab($0, "pubkey=[a-f0-9]+", 7)

      # Output if we have all required fields
      if (event_id != "" && identifier != "" && pubkey != "") {
        print event_id "\t" identifier "\t" pubkey
      }
    }
  ' "$input_file"
}

# Enrich parse failures with repo/npub by looking up event_id in "Added rejected announcement" log entries
# This is critical because "Invalid announcement" rejections only log event_id and kind,
# not the repo name or npub. Without enrichment, Phase 5 shows event_id|kind instead
# of repo|npub in action-required.txt, making the output unusable.
#
# Arguments:
#   $1 - parse failures file to enrich (modified in place)
#   $2 - lookup file containing event_id<TAB>identifier<TAB>pubkey_hex rows
#
# The function:
#   1. Uses the lookup table built from "Added rejected announcement" log entries
#   2. For each parse failure with empty repo/npub, looks up the event_id
#   3. Populates repo and npub columns from the lookup
#   4. Converts hex pubkeys to npub format using `nak encode npub` if available
#
# Fallbacks: if nak is missing, exits non-zero, or returns a different number
# of lines than pubkeys supplied, the hex pubkey is written in the npub column.
#
# Calls log_info/log_warn/log_success, which are defined elsewhere in this
# script.
#
# OPTIMIZATION: one awk pass joins all three inputs and all pubkeys are
# converted in a single nak call.
enrich_with_repo_npub() {
  local parse_failures_file="$1"
  local lookup_file="$2"

  # Validate lookup file exists and has content
  if [[ ! -f "$lookup_file" ]] || [[ ! -s "$lookup_file" ]]; then
    log_warn "No enrichment data available - repo/npub columns will remain empty"
    return 0
  fi

  log_info "Enriching parse failures with repo/npub from log entries..."

  # Check if we have nak for pubkey->npub conversion
  local can_convert_npub=false
  if command -v nak &> /dev/null; then
    can_convert_npub=true
    log_info " Using 'nak' for pubkey->npub conversion"
  else
    log_warn " 'nak' not found - will use hex pubkeys instead of npub"
  fi

  local lookup_count
  lookup_count=$(wc -l < "$lookup_file")
  lookup_count="${lookup_count//[^0-9]/}"
  log_info " Lookup table has $lookup_count entries"

  # STEP 1: Extract unique pubkeys that need conversion
  # Get pubkeys from lookup file (column 3), deduplicate
  local unique_pubkeys_file npub_list_file npub_map_file enriched_file
  unique_pubkeys_file=$(mktemp)
  npub_list_file=$(mktemp)
  npub_map_file=$(mktemp)
  enriched_file=$(mktemp)

  cut -f3 "$lookup_file" | sort -u > "$unique_pubkeys_file"
  local unique_pubkey_count
  unique_pubkey_count=$(wc -l < "$unique_pubkeys_file")
  unique_pubkey_count="${unique_pubkey_count//[^0-9]/}"
  log_info " Converting $unique_pubkey_count unique pubkeys to npub format..."

  # STEP 2: Batch convert all pubkeys to npub in a single nak call.
  # nak runs as a plain command (not inside a process substitution) so that
  # its exit status is visible, and the output must be one npub per pubkey
  # before the two lists are pasted together.
  local converted=false
  if [[ "$can_convert_npub" == true && "$unique_pubkey_count" -gt 0 ]]; then
    local npub_count=0
    if nak encode npub < "$unique_pubkeys_file" > "$npub_list_file" 2>/dev/null; then
      npub_count=$(wc -l < "$npub_list_file")
      npub_count="${npub_count//[^0-9]/}"
    fi
    if [[ "${npub_count:-0}" -eq "$unique_pubkey_count" ]]; then
      # Create mapping file: pubkey_hex<TAB>npub
      paste "$unique_pubkeys_file" "$npub_list_file" > "$npub_map_file"
      converted=true
    else
      log_warn " Batch npub conversion failed, using hex pubkeys"
    fi
  fi
  if [[ "$converted" != true ]]; then
    # No usable conversion: map every hex pubkey to itself
    awk '{print $1 "\t" $1}' "$unique_pubkeys_file" > "$npub_map_file"
  fi

  rm -f "$unique_pubkeys_file" "$npub_list_file"

  # STEP 3: Join parse_failures with lookup_file on event_id, then with
  # npub_map on pubkey.

  # Copy header lines
  grep '^#' "$parse_failures_file" > "$enriched_file" 2>/dev/null || true

  # Input files, identified by name so an empty file cannot shift the roles:
  #   1. npub_map_file:       pubkey_hex<TAB>npub
  #   2. lookup_file:         event_id<TAB>identifier<TAB>pubkey_hex
  #   3. parse_failures_file: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
  awk -F'\t' -v OFS='\t' '
    # First file: npub_map (pubkey_hex -> npub)
    FILENAME == ARGV[1] {
      npub_map[$1] = $2
      next
    }
    # Second file: lookup (event_id -> identifier, pubkey_hex)
    FILENAME == ARGV[2] {
      lookup_repo[$1] = $2
      lookup_pubkey[$1] = $3
      next
    }
    # Third file: parse_failures
    /^#/ { next } # Skip headers (already copied)
    {
      event_id = $1
      kind = $2
      reason = $3
      repo = $4
      npub = $5

      # If repo/npub empty, try to enrich from lookup
      if (repo == "" && event_id in lookup_repo) {
        repo = lookup_repo[event_id]
      }
      if (npub == "" && event_id in lookup_pubkey) {
        pubkey = lookup_pubkey[event_id]
        if (pubkey in npub_map && npub_map[pubkey] != "") {
          npub = npub_map[pubkey]
        } else {
          npub = pubkey # Fallback to hex
        }
      }

      print event_id, kind, reason, repo, npub
    }
  ' "$npub_map_file" "$lookup_file" "$parse_failures_file" >> "$enriched_file"

  rm -f "$npub_map_file"

  # Count entries with awk: a grep -v pipeline exits non-zero when every line
  # is a header, which would abort the caller under `set -euo pipefail`.
  local enriched_count total_count
  total_count=$(awk '!/^#/ { n++ } END { print n + 0 }' "$parse_failures_file")
  # Count entries that have non-empty repo AND npub after enrichment
  enriched_count=$(awk -F'\t' '!/^#/ && $4 != "" && $5 != "" { n++ } END { print n + 0 }' "$enriched_file")

  # Replace original with enriched version
  mv "$enriched_file" "$parse_failures_file"

  log_info " Enriched $enriched_count of $total_count parse failures with repo/npub"
  log_success "Enrichment complete"
}

# Parse "Added rejected announcement" log entries to build enrichment lookup table
# Input: log line containing "Added rejected announcement to two-tier index"
# Output: TSV line:
event_ididentifierpubkey_hex -parse_rejected_announcement_line() { - local line="$1" - - local event_id identifier pubkey_hex - - # Extract event_id=VALUE (hex string) - event_id=$(echo "$line" | grep -oP 'event_id=\K[a-f0-9]+' || echo "") - - # Extract identifier=VALUE (repo name) - identifier=$(echo "$line" | grep -oP 'identifier=\K[^ ]+' || echo "") - - # Extract pubkey=VALUE (hex string) - pubkey_hex=$(echo "$line" | grep -oP 'pubkey=\K[a-f0-9]+' || echo "") - - # Only output if we have all required fields - if [[ -n "$event_id" && -n "$identifier" && -n "$pubkey_hex" ]]; then - printf '%s\t%s\t%s\n' "$event_id" "$identifier" "$pubkey_hex" - fi -} - -# Main -main() { - if [[ $# -lt 2 ]]; then - usage - fi - - local service="$1" - local output_dir="$2" - shift 2 - - # Default time range: last 30 days - local since_date - since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "") - local until_date="" - local dry_run=false - - # Parse options - while [[ $# -gt 0 ]]; do - case "$1" in - --since) - since_date="$2" - shift 2 - ;; - --until) - until_date="$2" - shift 2 - ;; - --dry-run) - dry_run=true - shift - ;; - *) - log_error "Unknown option: $1" - usage - ;; - esac - done - - # Validate service name format - if [[ ! "$service" =~ \.service$ ]]; then - service="${service}.service" - fi - - # Validate service is appropriate for structured logging - # This prevents the common mistake of using ngit-relay instead of ngit-grasp - if type validate_service_for_structured_logging &>/dev/null; then - # Use non-interactive mode if not a terminal, skip log check (we'll do our own) - local interactive="true" - [[ ! -t 0 ]] && interactive="false" - - if ! validate_service_for_structured_logging "$service" "false" "$interactive"; then - log_error "Service validation failed. Use an ngit-grasp service for structured logging." 
# Print the comment header written at the top of parse-failures.txt.
# Arguments: $1 - service name
#            $2 - since date (may be empty)
#            $3 - until date (may be empty)
# Outputs:   header lines, every one starting with '#', on stdout
_write_parse_failures_header() {
  local service="$1" since_date="$2" until_date="$3"
  cat <<EOF
# Parse failures and invalid announcements extracted from $service
# Time range: ${since_date:-beginning} to ${until_date:-now}
# Extracted: $(date -Iseconds)
#
# Includes:
#   - [PARSE_FAIL] structured log entries
#   - "Invalid announcement" rejections
#
# Format: event_id TAB kind TAB reason TAB repo TAB npub
# Note: repo and npub may be empty for some entries
EOF
}

# Main
#
# Extract [PARSE_FAIL] entries and "Invalid announcement" write-policy
# rejections from a service's journal into OUTPUT_DIR/parse-failures.txt,
# deduplicate them by event id and enrich them with repo/npub.
#
# Arguments:
#   $1 - systemd service name (".service" is appended when missing)
#   $2 - output directory
#   --since DATE / --until DATE / --dry-run
# Exits:
#   0 on success (including "nothing found"), 1 on usage/validation errors.
main() {
  if [[ $# -lt 2 ]]; then
    usage
  fi

  local service="$1"
  local output_dir="$2"
  shift 2

  # Default time range: last 30 days (GNU date first, BSD date as fallback)
  local since_date
  since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null \
    || date -v-30d "+%Y-%m-%d" 2>/dev/null \
    || echo "")
  local until_date=""
  local dry_run=false

  # Parse options
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --since | --until)
        # A trailing "--since" with no value would otherwise hit an unbound
        # $2 or a failing "shift 2".
        if [[ $# -lt 2 ]]; then
          log_error "Option $1 requires a date argument"
          usage
        fi
        if [[ "$1" == "--since" ]]; then
          since_date="$2"
        else
          until_date="$2"
        fi
        shift 2
        ;;
      --dry-run)
        dry_run=true
        shift
        ;;
      *)
        log_error "Unknown option: $1"
        usage
        ;;
    esac
  done

  # Validate service name format
  if [[ ! "$service" =~ \.service$ ]]; then
    service="${service}.service"
  fi

  # Validate service is appropriate for structured logging
  # This prevents the common mistake of using ngit-relay instead of ngit-grasp
  if type validate_service_for_structured_logging &>/dev/null; then
    # Use non-interactive mode if not a terminal, skip log check (we'll do our own)
    local interactive="true"
    if [[ ! -t 0 ]]; then
      interactive="false"
    fi

    if ! validate_service_for_structured_logging "$service" "false" "$interactive"; then
      log_error "Service validation failed. Use an ngit-grasp service for structured logging."
      exit 1
    fi
  else
    # Fallback validation if helper not available
    if [[ "$service" == *"ngit-relay"* ]]; then
      log_error "Service name appears to be ngit-relay: $service"
      log_error "Structured logging ([PARSE_FAIL]) only exists in ngit-grasp services."
      log_error "Please use the ngit-grasp archive service instead."
      log_error ""
      log_error "To find the correct service:"
      log_error "  systemctl list-units 'ngit-grasp*' --all"
      exit 1
    fi
  fi

  log_info "Extracting parse failures from systemd logs"
  log_info "Service: $service"
  log_info "Output: $output_dir"
  log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"

  # Check if journalctl is available
  if ! command -v journalctl &>/dev/null; then
    log_error "journalctl not found. This script requires systemd."
    exit 1
  fi

  # Validate service exists (check if journalctl can find any logs for it)
  # Note: We don't require the service to be running, just that it has logs
  if ! journalctl --no-pager -u "$service" -n 1 &>/dev/null; then
    log_warn "Could not query logs for service: $service"
    log_warn "This may indicate the service doesn't exist or you lack permissions."
    log_warn ""
    log_warn "To list available ngit-grasp services:"
    log_warn "  systemctl list-units 'ngit-grasp*' --all"
    log_warn "  journalctl --list-boots  # Check if you have journal access"
    log_warn ""
    # Continue anyway - the service might exist but have no logs yet
  fi

  # Build the journalctl command as an array: every argument stays one word,
  # so nothing in the service name or dates is re-parsed by the shell (the
  # previous string + eval form broke on quotes and allowed injection).
  local -a journal_cmd=(journalctl -u "$service" --no-pager -o short-iso)
  if [[ -n "$since_date" ]]; then
    journal_cmd+=(--since "$since_date")
  fi
  if [[ -n "$until_date" ]]; then
    journal_cmd+=(--until "$until_date")
  fi

  log_info "Running: ${journal_cmd[*]} | grep '[PARSE_FAIL]' or 'Invalid announcement'"

  if [[ "$dry_run" == true ]]; then
    log_info "[DRY RUN] Would extract to: $output_dir/parse-failures.txt"

    # Count both entry types in a single pass over the journal. awk prints
    # the totals even for an empty stream, so no "|| echo 0" fallback is
    # needed (that fallback used to yield "00" after grep -c printed "0").
    log_info "Checking for matching log entries..."
    local counts parse_fail_count invalid_announcement_count
    counts=$("${journal_cmd[@]}" 2>/dev/null | awk '
      /\[PARSE_FAIL\]/ { pf++ }
      /Event rejected by write policy/ && /Invalid announcement/ { ia++ }
      END { printf "%d %d\n", pf, ia }
    ' || true)
    read -r parse_fail_count invalid_announcement_count <<<"${counts:-0 0}"

    log_info "Found $parse_fail_count [PARSE_FAIL] entries"
    log_info "Found $invalid_announcement_count 'Invalid announcement' rejections"

    if [[ "$parse_fail_count" -eq 0 && "$invalid_announcement_count" -eq 0 ]]; then
      log_warn "No matching entries found in logs."
      log_warn "This is expected if ngit-grasp logging improvements are not yet deployed."
      log_warn "See: docs/how-to/migrate-to-ngit-grasp.md (Dependencies section)"
    fi

    exit 0
  fi

  # Create output directory
  mkdir -p "$output_dir"
  local output_file="$output_dir/parse-failures.txt"

  # All intermediate files live in one private directory that is removed on
  # every exit path. The path is expanded now so the trap does not rely on
  # function locals at exit time.
  # NOTE(review): this replaces any EXIT trap installed earlier in the
  # script - none is visible in this part of the file; confirm.
  local work_dir
  work_dir=$(mktemp -d)
  # shellcheck disable=SC2064 # expanding work_dir now is intentional
  trap "$(printf 'rm -rf -- %q' "$work_dir")" EXIT

  local journal_errors="$work_dir/journalctl.stderr"
  local matched_lines="$work_dir/matched.log"
  local temp_parse_fail="$work_dir/parse-fail.log"
  local temp_write_policy_rejection="$work_dir/write-policy-rejection.log"
  local temp_rejected_announcement="$work_dir/rejected-announcement.log"

  # Read the journal ONCE, keeping only lines any later step cares about,
  # then split that (much smaller) file per entry type. "|| true" because
  # grep exits 1 when nothing matches, which is a valid outcome.
  log_info "Extracting log entries..."
  "${journal_cmd[@]}" 2>"$journal_errors" \
    | grep -E '\[PARSE_FAIL\]|Event rejected by write policy|Added rejected announcement to two-tier index' \
      >"$matched_lines" || true

  if [[ -s "$journal_errors" ]]; then
    log_warn "journalctl reported: $(<"$journal_errors")"
  fi

  log_info "  Searching for [PARSE_FAIL] entries..."
  grep '\[PARSE_FAIL\]' "$matched_lines" >"$temp_parse_fail" || true

  # NOTE: We only extract from write policy logs (hex IDs), not builder logs
  # (note1 IDs) to avoid double-counting. Both log sources contain the same events.
  log_info "  Searching for write policy rejections..."
  grep 'Event rejected by write policy' "$matched_lines" \
    | grep 'Invalid announcement' >"$temp_write_policy_rejection" || true

  # These contain pubkey and identifier which we use to enrich write policy rejections
  log_info "  Searching for rejected announcement entries (for enrichment)..."
  grep 'Added rejected announcement to two-tier index' "$matched_lines" \
    >"$temp_rejected_announcement" || true

  # grep -c '' counts lines without the padding some wc implementations add.
  local parse_fail_line_count write_policy_line_count rejected_announcement_line_count
  parse_fail_line_count=$(grep -c '' "$temp_parse_fail" || true)
  write_policy_line_count=$(grep -c '' "$temp_write_policy_rejection" || true)
  rejected_announcement_line_count=$(grep -c '' "$temp_rejected_announcement" || true)

  log_info "  Found $parse_fail_line_count [PARSE_FAIL] log lines"
  log_info "  Found $write_policy_line_count write policy rejection log lines"
  log_info "  Found $rejected_announcement_line_count rejected announcement log lines (for enrichment)"

  if [[ "$parse_fail_line_count" -eq 0 && "$write_policy_line_count" -eq 0 ]]; then
    log_warn "No matching entries found in logs."
    log_warn ""
    log_warn "This is expected if ngit-grasp logging improvements are not yet deployed."
    log_warn "The script looks for:"
    log_warn ""
    log_warn "  1. [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..."
    log_warn "  2. Event rejected by write policy event_id=... kind=30617 reason=Invalid announcement: ..."
    log_warn ""
    log_warn "See: docs/how-to/migrate-to-ngit-grasp.md (Dependencies section)"
    log_warn ""

    # Create empty output file with header comment
    {
      _write_parse_failures_header "$service" "$since_date" "$until_date"
      echo "#"
      echo "# NOTE: No matching entries found."
      echo "# This is expected if ngit-grasp logging improvements are not yet deployed."
    } >"$output_file"

    log_info "Created empty output file: $output_file"
    exit 0
  fi

  # Write header
  _write_parse_failures_header "$service" "$since_date" "$until_date" >"$output_file"

  # Row counts use "grep -vc '^#' ... || true": grep exits 1 when the count
  # is 0 (batch parser rejected every line), which must not abort the run.
  log_info "  Parsing [PARSE_FAIL] entries..."
  local parse_fail_count=0
  if [[ "$parse_fail_line_count" -gt 0 ]]; then
    parse_parse_fail_batch "$temp_parse_fail" >>"$output_file"
    parse_fail_count=$(grep -vc '^#' "$output_file" || true)
  fi

  log_info "  Parsing write policy rejection entries..."
  local write_policy_count=0
  if [[ "$write_policy_line_count" -gt 0 ]]; then
    parse_write_policy_rejection_batch "$temp_write_policy_rejection" >>"$output_file"
    local rows_after
    rows_after=$(grep -vc '^#' "$output_file" || true)
    # Rows present before this step are exactly the [PARSE_FAIL] rows.
    write_policy_count=$((rows_after - parse_fail_count))
  fi

  local invalid_announcement_count=$write_policy_count

  # Build enrichment lookup table from "Added rejected announcement" entries
  local enrichment_lookup_file="$work_dir/enrichment-lookup.tsv"
  : >"$enrichment_lookup_file"

  log_info "  Building enrichment lookup table..."
  if [[ "$rejected_announcement_line_count" -gt 0 ]]; then
    parse_rejected_announcement_batch "$temp_rejected_announcement" >"$enrichment_lookup_file"
  fi

  # Deduplicate by event_id (first column); header lines are kept in place.
  log_info "  Deduplicating entries..."
  local deduped_file="$work_dir/deduped.tsv"
  grep '^#' "$output_file" >"$deduped_file"
  { grep -v '^#' "$output_file" || true; } \
    | sort -t$'\t' -k1,1 -u >>"$deduped_file"
  mv "$deduped_file" "$output_file"

  # Deduplicate enrichment lookup table by event_id
  if [[ -s "$enrichment_lookup_file" ]]; then
    sort -t$'\t' -k1,1 -u "$enrichment_lookup_file" >"$enrichment_lookup_file.deduped"
    mv "$enrichment_lookup_file.deduped" "$enrichment_lookup_file"
  fi

  # Enrich with repo/npub from "Added rejected announcement" log entries
  # This is critical for usability - without it, action-required.txt shows
  # event_id|kind instead of repo|npub, making parse failures unidentifiable
  enrich_with_repo_npub "$output_file" "$enrichment_lookup_file"

  # Count final entries (excluding header lines)
  local count
  count=$(grep -vc '^#' "$output_file" || true)
  count="${count:-0}"

  # Summary
  echo ""
  log_info "=== Extraction Summary ==="
  log_info "Service: $service"
  log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"
  log_success "Extracted $count total entries"
  log_info "  - [PARSE_FAIL] entries: $parse_fail_count"
  log_info "  - Invalid announcement rejections: $invalid_announcement_count"
  echo ""
  log_info "Output file: $output_file"

  if [[ "$count" -gt 0 ]]; then
    local event_id kind reason repo npub cnt

    echo ""
    log_info "Sample entries (first 5):"
    # Subshell + "|| true": head closing the pipe early must not trip set -e
    (grep -v '^#' "$output_file" | head -5 | while IFS=$'\t' read -r event_id kind reason repo npub; do
      echo "  kind=$kind event_id=${event_id:0:16}... reason=\"${reason:0:60}...\""
    done) || true

    echo ""
    log_info "Breakdown by event kind:"
    # kind is column 2
    (grep -v '^#' "$output_file" | awk -F'\t' '{print $2}' | sort | uniq -c | sort -rn | while read -r cnt kind; do
      echo "  kind $kind: $cnt failures"
    done) || true
  fi

  # Breakdown by reason pattern (for invalid announcements)
  if [[ "$invalid_announcement_count" -gt 0 ]]; then
    local cnt reason
    echo ""
    log_info "Breakdown by reason pattern:"
    # Extract the main reason type (before the colon details)
    (grep -v '^#' "$output_file" | awk -F'\t' '{print $3}' | sed 's/:.*//' | sort | uniq -c | sort -rn | head -10 | while read -r cnt reason; do
      echo "  $reason: $cnt"
    done) || true
  fi

  # Explicit success exit
  exit 0
}

main "$@"
-# -# USAGE: -# ./31-extract-purgatory-expiry.sh SERVICE-NAME OUTPUT-DIR [options] -# -# EXAMPLES: -# # Extract from ngit-grasp service (last 30 days, default) -# ./31-extract-purgatory-expiry.sh ngit-grasp.service output/logs -# -# # Extract with custom time range -# ./31-extract-purgatory-expiry.sh ngit-grasp.service output/logs --since "2026-01-01" -# -# # Extract from specific time window -# ./31-extract-purgatory-expiry.sh ngit-grasp.service output/logs --since "2026-01-15" --until "2026-01-22" -# -# OPTIONS: -# --since DATE Start date for log extraction (default: 30 days ago) -# --until DATE End date for log extraction (default: now) -# --dry-run Show what would be extracted without writing files -# -# OUTPUT: -# OUTPUT-DIR/purgatory-expired.txt -# -# OUTPUT FORMAT (TSV): -# repo TAB npub TAB timestamp TAB reason -# -# EXPECTED LOG FORMAT: -# The script looks for structured log entries in this format: -# -# 2026-01-22T10:30:45+0000 ngit-grasp[1234]: [PURGATORY_EXPIRED] repo=myrepo npub=npub1... reason="clone URL unreachable after 7 days" -# -# Required fields: repo, npub -# Optional fields: reason (explains why purgatory expired) -# -# BACKGROUND: -# "Purgatory" is the state where ngit-grasp has received an announcement event -# but cannot yet sync the git data (e.g., clone URL unreachable, git server down). -# After a configurable timeout (default 7 days), the repository is marked as -# expired and removed from purgatory. -# -# Purgatory expiry during migration analysis indicates repositories that: -# - Had valid announcements on the production relay -# - Could not be synced to the archive relay -# - May need manual intervention or investigation -# -# DEPENDENCY: -# This script requires logging improvements in ngit-grasp to emit structured -# [PURGATORY_EXPIRED] log entries. Until those are implemented, this script -# will find no matching entries (which is handled gracefully).
#
# See: docs/how-to/migrate-to-ngit-grasp.md (Dependencies section)
#
# Expected Rust logging code:
#   tracing::warn!(
#     target: "migration",
#     "[PURGATORY_EXPIRED] repo={} npub={} reason=\"{}\"",
#     identifier, npub, reason
#   );
#
# PREREQUISITES:
#   - journalctl (systemd)
#   - grep, awk (standard Unix tools)
#   - Access to systemd journal (may require sudo or journal group membership)
#
# RUNTIME: Depends on log volume, typically < 30 seconds
#
# SEE ALSO:
#   docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
#   30-extract-parse-failures.sh - Companion script for parse failure logs
#

set -euo pipefail

# Get script directory for sourcing helpers
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Source the service validation helper (optional; main falls back to a
# simple name check when it is absent)
if [[ -f "$SCRIPT_DIR/validate-service.sh" ]]; then
  # shellcheck source=/dev/null
  source "$SCRIPT_DIR/validate-service.sh"
fi

# Colors for output (disabled if not a terminal).
# $'...' stores the real escape bytes, so the variables print correctly
# with plain printf/echo and still work with "echo -e".
if [[ -t 1 ]]; then
  RED=$'\033[0;31m'
  GREEN=$'\033[0;32m'
  YELLOW=$'\033[0;33m'
  BLUE=$'\033[0;34m'
  NC=$'\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BLUE=''
  NC=''
fi

# Logging helpers: write "[LEVEL] message" to stderr so stdout stays clean.
# The message goes through %s, so backslashes in it (paths, regexes, journal
# error text) are printed literally instead of being interpreted.
log_info() {
  printf '%s[INFO]%s %s\n' "$BLUE" "$NC" "$*" >&2
}

log_success() {
  printf '%s[OK]%s %s\n' "$GREEN" "$NC" "$*" >&2
}

log_warn() {
  printf '%s[WARN]%s %s\n' "$YELLOW" "$NC" "$*" >&2
}

log_error() {
  printf '%s[ERROR]%s %s\n' "$RED" "$NC" "$*" >&2
}

# Print the command-line synopsis to stdout and exit with status 1.
usage() {
  cat <<EOF
Usage: $0 service-name output-dir [options]

Arguments:
  service-name    Systemd service name (e.g., ngit-grasp.service)
  output-dir      Directory to store extracted log data

Options:
  --since DATE    Start date (default: 30 days ago)
  --until DATE    End date (default: now)
  --dry-run       Show what would be extracted without writing

Examples:
  $0 ngit-grasp.service output/logs
  $0 ngit-grasp.service output/logs --since '2026-01-01'
  $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'

Expected log format:
  [PURGATORY_EXPIRED] repo=myrepo npub=npub1... reason="..."
EOF
  exit 1
}
# Parse a single [PURGATORY_EXPIRED] log line into a TSV row.
#
# Arguments:
#   $1 - journal line (short-iso format) containing [PURGATORY_EXPIRED]
# Outputs:
#   One row on stdout: repo, npub, timestamp, reason (tab-separated).
#   timestamp and reason may be empty. Prints nothing when repo or npub
#   is missing.
# Returns:
#   0 always.
parse_log_line() {
  local line="$1"
  local timestamp="" repo="" npub="" reason=""
  local fields="$line"

  # Patterns are kept in variables so the regex needs no shell quoting.
  # Keys must be preceded by start-of-line or a non-identifier character.
  # [[ =~ ]] yields the leftmost match only, so every value stays on one
  # line (grep -o would emit one line per match and break the row).
  local re_timestamp='^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}'
  local re_reason='(^|[^[:alnum:]_])reason="([^"]*)"'
  local re_repo='(^|[^[:alnum:]_])repo=([^[:space:]]+)'
  local re_npub='(^|[^[:alnum:]_])npub=([^[:space:]]+)'

  # ISO timestamp at the beginning of the line (seconds precision)
  if [[ "$line" =~ $re_timestamp ]]; then
    timestamp="${BASH_REMATCH[0]}"
  fi

  # reason="VALUE" (quoted string, optional)
  if [[ "$line" =~ $re_reason ]]; then
    reason="${BASH_REMATCH[2]}"
    # Remove the quoted reason before looking for repo/npub, so free text
    # such as reason="... repo=x ..." cannot be mistaken for a field.
    local quoted_reason="reason=\"${reason}\""
    fields=${line/"$quoted_reason"/}
    # A literal tab inside the reason would add a column to the TSV row.
    reason="${reason//$'\t'/ }"
  fi

  # repo=VALUE (unquoted identifier)
  if [[ "$fields" =~ $re_repo ]]; then
    repo="${BASH_REMATCH[2]}"
  fi

  # npub=VALUE (npub1... format)
  if [[ "$fields" =~ $re_npub ]]; then
    npub="${BASH_REMATCH[2]}"
  fi

  # Only output if we have the required fields
  if [[ -n "$repo" && -n "$npub" ]]; then
    printf '%s\t%s\t%s\t%s\n' "$repo" "$npub" "$timestamp" "$reason"
  fi
}
# Print the comment header written at the top of purgatory-expired.txt.
# Arguments: $1 - service name
#            $2 - since date (may be empty)
#            $3 - until date (may be empty)
# Outputs:   header lines, every one starting with '#', on stdout
_write_purgatory_header() {
  local service="$1" since_date="$2" until_date="$3"
  cat <<EOF
# Purgatory expiry events extracted from $service
# Time range: ${since_date:-beginning} to ${until_date:-now}
# Extracted: $(date -Iseconds)
# Format: repo TAB npub TAB timestamp TAB reason
EOF
}

# Main
#
# Extract [PURGATORY_EXPIRED] entries from a service's journal into
# OUTPUT_DIR/purgatory-expired.txt and print a short summary.
#
# Arguments:
#   $1 - systemd service name (".service" is appended when missing)
#   $2 - output directory
#   --since DATE / --until DATE / --dry-run
# Exits:
#   0 on success (including "nothing found"), 1 on usage/validation errors.
main() {
  if [[ $# -lt 2 ]]; then
    usage
  fi

  local service="$1"
  local output_dir="$2"
  shift 2

  # Default time range: last 30 days (GNU date first, BSD date as fallback)
  local since_date
  since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null \
    || date -v-30d "+%Y-%m-%d" 2>/dev/null \
    || echo "")
  local until_date=""
  local dry_run=false

  # Parse options
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --since | --until)
        # Under set -u a trailing "--since" would die on an unbound $2
        # with no hint about what went wrong.
        if [[ $# -lt 2 ]]; then
          log_error "Option $1 requires a date argument"
          usage
        fi
        if [[ "$1" == "--since" ]]; then
          since_date="$2"
        else
          until_date="$2"
        fi
        shift 2
        ;;
      --dry-run)
        dry_run=true
        shift
        ;;
      *)
        log_error "Unknown option: $1"
        usage
        ;;
    esac
  done

  # Validate service name format
  if [[ ! "$service" =~ \.service$ ]]; then
    service="${service}.service"
  fi

  # Validate service is appropriate for structured logging
  # This prevents the common mistake of using ngit-relay instead of ngit-grasp
  if type validate_service_for_structured_logging &>/dev/null; then
    # Use non-interactive mode if not a terminal, skip log check (we'll do our own)
    local interactive="true"
    if [[ ! -t 0 ]]; then
      interactive="false"
    fi

    if ! validate_service_for_structured_logging "$service" "false" "$interactive"; then
      log_error "Service validation failed. Use an ngit-grasp service for structured logging."
      exit 1
    fi
  else
    # Fallback validation if helper not available
    if [[ "$service" == *"ngit-relay"* ]]; then
      log_error "Service name appears to be ngit-relay: $service"
      log_error "Structured logging ([PURGATORY_EXPIRED]) only exists in ngit-grasp services."
      log_error "Please use the ngit-grasp archive service instead."
      log_error ""
      log_error "To find the correct service:"
      log_error "  systemctl list-units 'ngit-grasp*' --all"
      exit 1
    fi
  fi

  log_info "Extracting purgatory expiry events from systemd logs"
  log_info "Service: $service"
  log_info "Output: $output_dir"
  log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"

  # Check if journalctl is available
  if ! command -v journalctl &>/dev/null; then
    log_error "journalctl not found. This script requires systemd."
    exit 1
  fi

  # Validate service exists (check if journalctl can find any logs for it)
  # Note: We don't require the service to be running, just that it has logs
  if ! journalctl --no-pager -u "$service" -n 1 &>/dev/null; then
    log_warn "Could not query logs for service: $service"
    log_warn "This may indicate the service doesn't exist or you lack permissions."
    log_warn ""
    log_warn "To list available ngit-grasp services:"
    log_warn "  systemctl list-units 'ngit-grasp*' --all"
    log_warn "  journalctl --list-boots  # Check if you have journal access"
    log_warn ""
    # Continue anyway - the service might exist but have no logs yet
  fi

  # Build the journalctl command as an array: every argument stays one word,
  # so nothing in the service name or dates is re-parsed by the shell (the
  # previous string + eval form broke on quotes and allowed injection).
  local -a journal_cmd=(journalctl -u "$service" --no-pager -o short-iso)
  if [[ -n "$since_date" ]]; then
    journal_cmd+=(--since "$since_date")
  fi
  if [[ -n "$until_date" ]]; then
    journal_cmd+=(--until "$until_date")
  fi

  log_info "Running: ${journal_cmd[*]} | grep '\\[PURGATORY_EXPIRED\\]'"

  if [[ "$dry_run" == true ]]; then
    log_info "[DRY RUN] Would extract to: $output_dir/purgatory-expired.txt"

    # grep -c already prints "0" (and exits 1) when nothing matches, so the
    # fallback must not print a second "0" - "|| true" only absorbs the status.
    log_info "Checking for matching log entries..."
    local sample_count
    sample_count=$("${journal_cmd[@]}" 2>/dev/null | grep -c '\[PURGATORY_EXPIRED\]' || true)
    sample_count="${sample_count:-0}"
    log_info "Found $sample_count matching log entries"

    if [[ "$sample_count" -eq 0 ]]; then
      log_warn "No [PURGATORY_EXPIRED] entries found in logs."
      log_warn "This is expected if ngit-grasp logging improvements are not yet deployed."
      log_warn "See: docs/how-to/migrate-to-ngit-grasp.md (Dependencies section)"
    fi

    exit 0
  fi

  # Create output directory
  mkdir -p "$output_dir"
  local output_file="$output_dir/purgatory-expired.txt"

  # Intermediate files live in one private directory that is removed on
  # every exit path. The path is expanded now so the trap does not rely on
  # function locals at exit time.
  local work_dir
  work_dir=$(mktemp -d)
  # shellcheck disable=SC2064 # expanding work_dir now is intentional
  trap "$(printf 'rm -rf -- %q' "$work_dir")" EXIT

  local matched_lines="$work_dir/matched.log"
  local journal_errors="$work_dir/journalctl.stderr"

  # Stream matching lines to a file instead of holding them in a variable.
  # "|| true": grep exits 1 when nothing matches, which is a valid outcome.
  log_info "Extracting log entries..."
  "${journal_cmd[@]}" 2>"$journal_errors" \
    | grep '\[PURGATORY_EXPIRED\]' >"$matched_lines" || true

  # Report any journalctl errors (but don't fail - empty logs are valid)
  if [[ -s "$journal_errors" ]]; then
    log_warn "journalctl reported: $(<"$journal_errors")"
  fi

  if [[ ! -s "$matched_lines" ]]; then
    log_warn "No [PURGATORY_EXPIRED] entries found in logs."
    log_warn ""
    log_warn "This is expected if ngit-grasp logging improvements are not yet deployed."
    log_warn "The structured log format required by this script:"
    log_warn ""
    log_warn "  [PURGATORY_EXPIRED] repo=myrepo npub=npub1... reason=\"...\""
    log_warn ""
    log_warn "See: docs/how-to/migrate-to-ngit-grasp.md (Dependencies section)"
    log_warn ""

    # Create empty output file with header comment
    {
      _write_purgatory_header "$service" "$since_date" "$until_date"
      echo "#"
      echo "# NOTE: No [PURGATORY_EXPIRED] entries found."
      echo "# This is expected if ngit-grasp logging improvements are not yet deployed."
    } >"$output_file"

    log_info "Created empty output file: $output_file"
    exit 0
  fi

  # Write header
  _write_purgatory_header "$service" "$since_date" "$until_date" >"$output_file"

  # Parse each line. parse_log_line prints a row only for usable lines, so
  # its output is appended directly instead of captured per line.
  local line
  while IFS= read -r line; do
    parse_log_line "$line"
  done <"$matched_lines" >>"$output_file"

  # Data rows are the lines that are not header comments; counting them this
  # way does not depend on how many header lines were written.
  local count
  count=$(grep -vc '^#' "$output_file" || true)
  count="${count:-0}"

  # Summary
  echo ""
  log_info "=== Extraction Summary ==="
  log_info "Service: $service"
  log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"
  log_success "Extracted $count purgatory expiry entries"
  echo ""
  log_info "Output file: $output_file"

  if [[ "$count" -gt 0 ]]; then
    local repo npub timestamp reason cnt

    echo ""
    log_info "Sample entries (first 5):"
    # Subshell + "|| true": head closing the pipe early must not trip set -e
    (grep -v '^#' "$output_file" | head -5 | while IFS=$'\t' read -r repo npub timestamp reason; do
      echo "  repo=$repo npub=${npub:0:20}... timestamp=$timestamp"
    done) || true

    # One "count repo" line per distinct repo, most affected first; the
    # number of such lines is the number of unique repositories.
    local repo_counts unique_repos
    repo_counts=$(grep -v '^#' "$output_file" | cut -f1 | sort | uniq -c | sort -rn)
    unique_repos=$(grep -c '' <<<"$repo_counts" || true)

    echo ""
    log_info "Unique repositories affected: $unique_repos"

    echo ""
    log_info "Repositories with purgatory expiry:"
    head -10 <<<"$repo_counts" | while read -r cnt repo; do
      echo "  $repo: $cnt expiry events"
    done

    if [[ "$unique_repos" -gt 10 ]]; then
      echo "  ... and $((unique_repos - 10)) more repositories"
    fi
  fi

  # Explicit success exit
  exit 0
}

main "$@"
# If archive has different/newer data than prod, it means:
#   - A state event authorized those commits at some point
#   - Archive is actually MORE up-to-date than prod
#   - Migration should use archive data (it's already correct)
#
# Usage: ./40-classify-actions.sh analysis-dir
#
# Output format: repo | npub | prod_status | archive_status | context | action
#

set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Logging helpers. Only the color variables go through %b; the message is
# printed with %s so backslashes in it are never interpreted.
log_info() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$*"; }
log_success() { printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$*"; }
log_warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"; }
log_error() { printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$*" >&2; }

# Check arguments
if [[ $# -lt 1 ]]; then
  echo "Usage: $0 analysis-dir"
  echo "Example: $0 work/migration-analysis-20260123-200701"
  exit 1
fi

ANALYSIS_DIR="$1"

# Validate analysis directory
if [[ ! -d "$ANALYSIS_DIR" ]]; then
  log_error "Analysis directory not found: $ANALYSIS_DIR"
  exit 1
fi

# Define paths
PROD_DIR="$ANALYSIS_DIR/prod"
ARCHIVE_DIR="$ANALYSIS_DIR/archive"
COMPARISON_DIR="$ANALYSIS_DIR/comparison"
LOGS_DIR="$ANALYSIS_DIR/logs"
RESULTS_DIR="$ANALYSIS_DIR/results"

# Validate required directories
for dir in "$PROD_DIR" "$ARCHIVE_DIR" "$COMPARISON_DIR" "$LOGS_DIR"; do
  if [[ ! -d "$dir" ]]; then
    log_error "Required directory not found: $dir"
    exit 1
  fi
done

# Create results directory
mkdir -p "$RESULTS_DIR"

# Output files
READY_FILE="$RESULTS_DIR/ready-for-migration.txt"
RESYNC_FILE="$RESULTS_DIR/needs-resync.txt"
REVIEW_FILE="$RESULTS_DIR/manual-review.txt"
SUMMARY_FILE="$RESULTS_DIR/summary.txt"

# Temporary files for processing
TMP_DIR=$(mktemp -d)
trap 'rm -rf "$TMP_DIR"' EXIT

log_info "Starting classification with revised system (Option B)"
log_info "Analysis directory: $ANALYSIS_DIR"

# ============================================================================
# Phase 1: Build lookup tables from source data
# ============================================================================

log_info "Building lookup tables..."

# Load one category file ("repo | npub | ..." per line) into a lookup table.
#
# Arguments:
#   $1 - name of the associative array to fill (key "repo|npub")
#   $2 - category label stored as the value (cat1..cat4)
#   $3 - category file to read
# A key already present is overwritten, so files loaded later win.
load_category_file() {
  local -n category_table="$1"
  local category="$2" file="$3"
  local repo npub rest

  if [[ ! -f "$file" ]]; then
    log_error "Required category file not found: $file"
    exit 1
  fi

  # "|| [[ -n "$repo" ]]" keeps a final line that has no trailing newline.
  while IFS='|' read -r repo npub rest || [[ -n "$repo" ]]; do
    repo="${repo// /}" # Remove all spaces
    npub="${npub// /}" # Remove all spaces
    [[ -z "$repo" || -z "$npub" ]] && continue
    category_table["$repo|$npub"]="$category"
  done <"$file"
}

# Build prod category lookup: repo|npub -> category
declare -A PROD_CAT
load_category_file PROD_CAT cat1 "$PROD_DIR/category1-complete-match.txt"
load_category_file PROD_CAT cat2 "$PROD_DIR/category2-empty-blank.txt"
load_category_file PROD_CAT cat3 "$PROD_DIR/category3-partial-match.txt"
load_category_file PROD_CAT cat4 "$PROD_DIR/category4-no-match.txt"
log_info "Loaded ${#PROD_CAT[@]} prod entries"

# Build archive category lookup: repo|npub -> category
declare -A ARCHIVE_CAT
load_category_file ARCHIVE_CAT cat1 "$ARCHIVE_DIR/category1-complete-match.txt"
load_category_file ARCHIVE_CAT cat2 "$ARCHIVE_DIR/category2-empty-blank.txt"
load_category_file ARCHIVE_CAT cat3 "$ARCHIVE_DIR/category3-partial-match.txt"
load_category_file ARCHIVE_CAT cat4 "$ARCHIVE_DIR/category4-no-match.txt"
log_info "Loaded ${#ARCHIVE_CAT[@]} archive entries"

# TSV rows below are split on the unit separator after translating tabs.
# Tab is an IFS *whitespace* character, so reading with IFS=$'\t' merges
# consecutive tabs and shifts later columns left when a field is empty.

# Build purgatory lookup: repo|npub -> 1 (if purgatory expired)
# Purgatory file format: repo TAB npub TAB timestamp TAB reason
declare -A PURGATORY
PURGATORY_COUNT=0
if [[ -f "$LOGS_DIR/purgatory-expired.txt" ]]; then
  while IFS= read -r row || [[ -n "$row" ]]; do
    # Skip comments and empty lines
    [[ -z "$row" || "$row" == "#"* ]] && continue
    IFS=$'\x1f' read -r repo npub timestamp reason <<<"${row//$'\t'/$'\x1f'}"
    [[ -z "$repo" || -z "$npub" ]] && continue
    PURGATORY["$repo|$npub"]=1
    PURGATORY_COUNT=$((PURGATORY_COUNT + 1))
  done <"$LOGS_DIR/purgatory-expired.txt"
fi
log_info "Loaded $PURGATORY_COUNT purgatory entries"

# Build parse failure lookup: repo|npub -> 1 (if parse failure logged)
# Parse failures file format: event_id TAB kind TAB reason TAB repo TAB npub
declare -A PARSE_FAIL
PARSE_FAIL_COUNT=0
if [[ -f "$LOGS_DIR/parse-failures.txt" ]]; then
  while IFS= read -r row || [[ -n "$row" ]]; do
    # Skip comments and empty lines
    [[ -z "$row" || "$row" == "#"* ]] && continue
    IFS=$'\x1f' read -r event_id kind reason repo npub <<<"${row//$'\t'/$'\x1f'}"
    [[ -z "$repo" || -z "$npub" ]] && continue
    PARSE_FAIL["$repo|$npub"]=1
    PARSE_FAIL_COUNT=$((PARSE_FAIL_COUNT + 1))
  done <"$LOGS_DIR/parse-failures.txt"
fi
log_info "Loaded $PARSE_FAIL_COUNT parse failure entries"
# Build deletion lookup: repo|npub -> 1 (if kind 5 deletion event)
# Deletions are in NDJSON format with "a" tags like "30617:pubkey_hex:repo"
# We need to convert hex pubkeys to npub format using nak
declare -A DELETED

# Record every repo announcement deleted by the events in an NDJSON file.
#
# Arguments:
#   $1 - NDJSON file of kind 5 events (silently ignored when missing)
# Globals:
#   DELETED (written) - gains one "repo|npub" key per deleted announcement
# Requires: jq, nak
process_deletions() {
  local file="$1"
  [[ -f "$file" ]] || return 0

  # One "a" tag value per line. ".tags[]?" and "// empty" tolerate events
  # without tags or with a value-less "a" tag.
  local a_tags
  a_tags=$(jq -r '.tags[]? | select(.[0] == "a") | .[1] // empty' "$file" 2>/dev/null) \
    || log_warn "jq reported errors reading $file - using the events parsed before the error"

  # hex -> npub cache so nak runs once per distinct pubkey
  local -A hex_to_npub=()
  local tag rest pubkey_hex repo npub
  while IFS= read -r tag; do
    # Only deletions of repo announcements (kind 30617) are relevant
    [[ "$tag" == 30617:* ]] || continue
    rest="${tag#30617:}"
    [[ "$rest" == *:* ]] || continue

    # Split at the FIRST colon only: the identifier itself may contain ':'
    pubkey_hex="${rest%%:*}"
    repo="${rest#*:}"
    [[ -z "$pubkey_hex" || -z "$repo" ]] && continue

    if [[ -z "${hex_to_npub[$pubkey_hex]+cached}" ]]; then
      # A failed conversion is cached as empty so it is not retried
      hex_to_npub["$pubkey_hex"]=$(nak encode npub "$pubkey_hex" 2>/dev/null || true)
    fi
    npub="${hex_to_npub[$pubkey_hex]}"
    [[ -z "$npub" ]] && continue

    DELETED["$repo|$npub"]=1
  done <<<"$a_tags"
}

# Process prod and archive deletions
process_deletions "$PROD_DIR/raw/deletions.json"
process_deletions "$ARCHIVE_DIR/raw/deletions.json"
DELETED_COUNT=0
[[ ${#DELETED[@]} -gt 0 ]] && DELETED_COUNT=${#DELETED[@]}
log_info "Loaded $DELETED_COUNT deletion entries"

# Build git ancestry lookup: repo|npub -> relationship (archive-ahead, prod-ahead, diverged, etc.)
-# This data comes from 22-compare-git-data.sh which compares actual git commits -declare -A GIT_ANCESTRY -GIT_ANCESTRY_COUNT=0 -if [[ -f "$COMPARISON_DIR/git-ancestry.tsv" ]]; then - while IFS=$'\t' read -r repo npub relationship details || [[ -n "$repo" ]]; do - # Skip header and comments - [[ "$repo" == "repo" ]] && continue - [[ "$repo" =~ ^# ]] && continue - [[ -z "$repo" || -z "$npub" ]] && continue - GIT_ANCESTRY["$repo|$npub"]="$relationship" - GIT_ANCESTRY_COUNT=$((GIT_ANCESTRY_COUNT + 1)) - done < "$COMPARISON_DIR/git-ancestry.tsv" - log_info "Loaded $GIT_ANCESTRY_COUNT git ancestry entries" -else - log_warn "No git-ancestry.tsv found - will not check if archive is ahead of prod" - log_warn "Run 22-compare-git-data.sh to enable archive-ahead detection" -fi - -# ============================================================================ -# Phase 2: Build unique repo list from all sources -# ============================================================================ - -log_info "Building unique repo list..." - -declare -A ALL_REPOS -for key in "${!PROD_CAT[@]}"; do - ALL_REPOS["$key"]=1 -done -for key in "${!ARCHIVE_CAT[@]}"; do - ALL_REPOS["$key"]=1 -done -for key in "${!PURGATORY[@]}"; do - ALL_REPOS["$key"]=1 -done - -log_info "Total unique repos: ${#ALL_REPOS[@]}" - -# ============================================================================ -# Phase 3: Classify each repo according to revised decision tree -# ============================================================================ - -log_info "Classifying repos..." 
- -# Counters for summary -declare -A COUNTS -COUNTS[ready_complete_both]=0 -COUNTS[ready_deleted]=0 -COUNTS[ready_empty_prod]=0 -COUNTS[ready_archive_only]=0 -COUNTS[ready_not_in_prod]=0 -COUNTS[ready_archive_ahead]=0 -COUNTS[resync_missing_archive]=0 -COUNTS[resync_incomplete_archive]=0 -COUNTS[review_partial_prod]=0 -COUNTS[review_nomatch_prod]=0 -COUNTS[review_parse_failure]=0 -COUNTS[review_conflicting]=0 -COUNTS[review_diverged]=0 - -# Output arrays -declare -a READY_LINES -declare -a RESYNC_LINES -declare -a REVIEW_LINES - -# Helper function to get context string -get_context() { - local key="$1" - local prod_status="$2" - local archive_status="$3" - local context="" - - # Check purgatory - if [[ -n "${PURGATORY[$key]:-}" ]]; then - context="purgatory-expired" - fi - - # Check parse failure - if [[ -n "${PARSE_FAIL[$key]:-}" ]]; then - if [[ -n "$context" ]]; then - context="$context, parse-failure" - else - context="parse-failure" - fi - fi - - # Add archive context for unexpected states - if [[ "$prod_status" == "empty" && "$archive_status" != "missing" && "$archive_status" != "empty" ]]; then - if [[ -n "$context" ]]; then - context="$context, archive-has-data" - else - context="archive-has-data" - fi - fi - - echo "${context:-none}" -} - -# Helper to convert category to human-readable status -cat_to_status() { - case "$1" in - cat1) echo "complete" ;; - cat2) echo "empty" ;; - cat3) echo "partial" ;; - cat4) echo "no-match" ;; - missing) echo "missing" ;; - *) echo "$1" ;; - esac -} - -LOOP_COUNT=0 -for key in "${!ALL_REPOS[@]}"; do - LOOP_COUNT=$((LOOP_COUNT + 1)) - [[ $((LOOP_COUNT % 100)) -eq 0 ]] && log_info "Processed $LOOP_COUNT repos..." - IFS='|' read -r repo npub <<< "$key" - - prod_cat="${PROD_CAT[$key]:-missing}" - archive_cat="${ARCHIVE_CAT[$key]:-missing}" - prod_status=$(cat_to_status "$prod_cat") - archive_status=$(cat_to_status "$archive_cat") - - # Decision tree implementation - - # 1. Is there a kind 5 deletion event? 
- if [[ -n "${DELETED[$key]:-}" ]]; then - context=$(get_context "$key" "$prod_status" "$archive_status") - READY_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | deleted by user") - COUNTS[ready_deleted]=$((COUNTS[ready_deleted] + 1)) - continue - fi - - # 2. What is the prod status? - case "$prod_cat" in - missing) - # Not in prod - if [[ "$archive_cat" != "missing" ]]; then - # In archive but not in prod -> no action (archive-only) - context=$(get_context "$key" "$prod_status" "$archive_status") - READY_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | archive-only (not in prod)") - COUNTS[ready_archive_only]=$((COUNTS[ready_archive_only] + 1)) - elif [[ -n "${PURGATORY[$key]:-}" ]]; then - # Purgatory only, not in prod -> no action - context="purgatory-expired" - READY_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | purgatory-only (not in prod)") - COUNTS[ready_not_in_prod]=$((COUNTS[ready_not_in_prod] + 1)) - fi - # Otherwise skip (not a real repo - no data anywhere) - ;; - - cat2) - # Empty in prod -> ALWAYS no action required - context=$(get_context "$key" "$prod_status" "$archive_status") - READY_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | empty in prod (user never pushed)") - COUNTS[ready_empty_prod]=$((COUNTS[ready_empty_prod] + 1)) - ;; - - cat1) - # Complete in prod - if [[ "$archive_cat" == "cat1" ]]; then - # Complete in both -> no action - context=$(get_context "$key" "$prod_status" "$archive_status") - READY_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | complete in both") - COUNTS[ready_complete_both]=$((COUNTS[ready_complete_both] + 1)) - else - # Complete in prod, missing/incomplete in archive - # Check for parse failure - if so, needs manual review - if [[ -n "${PARSE_FAIL[$key]:-}" ]]; then - context=$(get_context "$key" "$prod_status" "$archive_status") - REVIEW_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | complete 
in prod with parse failure") - COUNTS[review_parse_failure]=$((COUNTS[review_parse_failure] + 1)) - else - # Check git ancestry to see if archive is actually ahead - git_relationship="${GIT_ANCESTRY[$key]:-unknown}" - - if [[ "$git_relationship" == "archive-ahead" || "$git_relationship" == "in-sync" ]]; then - # Archive has newer/same git data - this is GOOD - # Archive's git data was authorized by a state event (GRASP enforced) - context=$(get_context "$key" "$prod_status" "$archive_status") - if [[ -n "$context" && "$context" != "none" ]]; then - context="$context, git=$git_relationship" - else - context="git=$git_relationship" - fi - READY_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | archive ahead (use archive data)") - COUNTS[ready_archive_ahead]=$((COUNTS[ready_archive_ahead] + 1)) - elif [[ "$git_relationship" == "diverged" ]]; then - # Git histories diverged - needs manual review - context=$(get_context "$key" "$prod_status" "$archive_status") - if [[ -n "$context" && "$context" != "none" ]]; then - context="$context, git=diverged" - else - context="git=diverged" - fi - REVIEW_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | git histories diverged (manual review)") - COUNTS[review_diverged]=$((COUNTS[review_diverged] + 1)) - else - # prod-ahead, archive-only, prod-only, both-empty, or unknown - # These need resync - include purgatory context - context=$(get_context "$key" "$prod_status" "$archive_status") - if [[ "$git_relationship" != "unknown" ]]; then - if [[ -n "$context" && "$context" != "none" ]]; then - context="$context, git=$git_relationship" - else - context="git=$git_relationship" - fi - fi - if [[ "$archive_cat" == "missing" ]]; then - RESYNC_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | trigger re-sync to archive") - COUNTS[resync_missing_archive]=$((COUNTS[resync_missing_archive] + 1)) - else - RESYNC_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | trigger 
re-sync (archive incomplete)") - COUNTS[resync_incomplete_archive]=$((COUNTS[resync_incomplete_archive] + 1)) - fi - fi - fi - fi - ;; - - cat3) - # Partial in prod -> ALWAYS manual investigation - context=$(get_context "$key" "$prod_status" "$archive_status") - REVIEW_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | partial in prod (investigate git data)") - COUNTS[review_partial_prod]=$((COUNTS[review_partial_prod] + 1)) - ;; - - cat4) - # No-match in prod -> ALWAYS manual investigation - context=$(get_context "$key" "$prod_status" "$archive_status") - REVIEW_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | no-match in prod (git corruption)") - COUNTS[review_nomatch_prod]=$((COUNTS[review_nomatch_prod] + 1)) - ;; - esac -done - -# ============================================================================ -# Phase 4: Write output files -# ============================================================================ - -log_info "Writing output files..." 
- -TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%S+00:00") - -# Write ready-for-migration.txt -{ - echo "# Ready for Migration - No action required" - echo "# Generated: $TIMESTAMP" - echo "# Format: repo | npub | prod_status | archive_status | context | reason" - echo "#" - for line in "${READY_LINES[@]}"; do - echo "$line" - done -} > "$READY_FILE" - -# Write needs-resync.txt -{ - echo "# Needs Re-sync - Action required" - echo "# Generated: $TIMESTAMP" - echo "# Format: repo | npub | prod_status | archive_status | context | action" - echo "#" - echo "# Context meanings:" - echo "# purgatory-expired = archive tried to sync but failed (30min timeout)" - echo "# none = archive never tried or announcement missing" - echo "#" - for line in "${RESYNC_LINES[@]}"; do - echo "$line" - done -} > "$RESYNC_FILE" - -# Write manual-review.txt -{ - echo "# Manual Review Required - Investigation needed" - echo "# Generated: $TIMESTAMP" - echo "# Format: repo | npub | prod_status | archive_status | context | reason" - echo "#" - for line in "${REVIEW_LINES[@]}"; do - echo "$line" - done -} > "$REVIEW_FILE" - -# ============================================================================ -# Phase 5: Generate summary -# ============================================================================ - -log_info "Generating summary..." 
- -TOTAL_READY="${#READY_LINES[@]}" -TOTAL_RESYNC="${#RESYNC_LINES[@]}" -TOTAL_REVIEW="${#REVIEW_LINES[@]}" -TOTAL=$((TOTAL_READY + TOTAL_RESYNC + TOTAL_REVIEW)) - -# Calculate percentages -if [[ $TOTAL -gt 0 ]]; then - PCT_READY=$(awk "BEGIN {printf \"%.1f\", ($TOTAL_READY / $TOTAL) * 100}") - PCT_RESYNC=$(awk "BEGIN {printf \"%.1f\", ($TOTAL_RESYNC / $TOTAL) * 100}") - PCT_REVIEW=$(awk "BEGIN {printf \"%.1f\", ($TOTAL_REVIEW / $TOTAL) * 100}") -else - PCT_READY="0.0" - PCT_RESYNC="0.0" - PCT_REVIEW="0.0" -fi - -{ - echo "# Migration Classification Summary" - echo "Generated: $TIMESTAMP" - echo "Analysis Directory: $ANALYSIS_DIR" - echo "" - echo "## Overview" - echo "" - echo "| Category | Count | Percentage |" - echo "|----------|-------|------------|" - echo "| Ready for Migration | $TOTAL_READY | $PCT_READY% |" - echo "| Needs Re-sync | $TOTAL_RESYNC | $PCT_RESYNC% |" - echo "| Manual Review | $TOTAL_REVIEW | $PCT_REVIEW% |" - echo "| **Total** | **$TOTAL** | **100%** |" - echo "" - echo "## Tier 1: Ready for Migration ($TOTAL_READY repos)" - echo "" - echo "These repositories are ready for migration or don't need migration:" - echo "" - echo "| Reason | Count |" - echo "|--------|-------|" - echo "| complete in both prod and archive | ${COUNTS[ready_complete_both]} |" - echo "| archive ahead (has newer git data) | ${COUNTS[ready_archive_ahead]} |" - echo "| deleted by user | ${COUNTS[ready_deleted]} |" - echo "| empty in prod (user never pushed) | ${COUNTS[ready_empty_prod]} |" - echo "| archive-only (not in prod) | ${COUNTS[ready_archive_only]} |" - echo "| purgatory-only (not in prod) | ${COUNTS[ready_not_in_prod]} |" - echo "" - echo "## Tier 2: Needs Re-sync ($TOTAL_RESYNC repos)" - echo "" - echo "These repositories need re-sync to archive before migration:" - echo "" - echo "| Reason | Count | Action |" - echo "|--------|-------|--------|" - echo "| complete in prod, missing from archive | ${COUNTS[resync_missing_archive]} | trigger re-sync |" - echo "| 
complete in prod, incomplete in archive | ${COUNTS[resync_incomplete_archive]} | trigger re-sync |" - echo "" - echo "### Purgatory Context" - echo "" - echo "Repos in needs-resync.txt include purgatory context:" - echo "- **purgatory-expired**: Archive tried to sync but failed (30min timeout)" - echo "- **none**: Archive never tried or announcement missing" - echo "" - echo "## Tier 3: Manual Review ($TOTAL_REVIEW repos)" - echo "" - echo "These repositories require human investigation:" - echo "" - echo "| Reason | Count |" - echo "|--------|-------|" - echo "| partial in prod (cat3) | ${COUNTS[review_partial_prod]} |" - echo "| no-match in prod (cat4) | ${COUNTS[review_nomatch_prod]} |" - echo "| complete in prod with parse failure | ${COUNTS[review_parse_failure]} |" - echo "| git histories diverged | ${COUNTS[review_diverged]} |" - echo "" - echo "## Input Data Summary" - echo "" - echo "### Prod Categories" - echo "- Category 1 (complete): $(wc -l < "$PROD_DIR/category1-complete-match.txt")" - echo "- Category 2 (empty): $(wc -l < "$PROD_DIR/category2-empty-blank.txt")" - echo "- Category 3 (partial): $(wc -l < "$PROD_DIR/category3-partial-match.txt")" - echo "- Category 4 (no match): $(wc -l < "$PROD_DIR/category4-no-match.txt")" - echo "" - echo "### Archive Categories" - echo "- Category 1 (complete): $(wc -l < "$ARCHIVE_DIR/category1-complete-match.txt")" - echo "- Category 2 (empty): $(wc -l < "$ARCHIVE_DIR/category2-empty-blank.txt")" - echo "- Category 3 (partial): $(wc -l < "$ARCHIVE_DIR/category3-partial-match.txt")" - echo "- Category 4 (no match): $(wc -l < "$ARCHIVE_DIR/category4-no-match.txt")" - echo "" - echo "### Logs" - echo "- Parse failures: $(grep -c -v '^#' "$LOGS_DIR/parse-failures.txt" 2>/dev/null || echo 0)" - echo "- Purgatory expired: $(grep -c -v '^#' "$LOGS_DIR/purgatory-expired.txt" 2>/dev/null || echo 0)" - echo "" - echo "## Output Files" - echo "" - echo "- \`results/ready-for-migration.txt\` - $TOTAL_READY repos ready for 
migration" - echo "- \`results/needs-resync.txt\` - $TOTAL_RESYNC repos needing re-sync" - echo "- \`results/manual-review.txt\` - $TOTAL_REVIEW repos needing investigation" - echo "- \`results/summary.txt\` - This summary file" - echo "" - echo "## Recommended Next Steps" - echo "" - echo "1. **Review needs-resync.txt** - Trigger re-sync for these repos" - echo "2. **Review manual-review.txt** - Investigate unusual states" - echo "3. **Verify ready-for-migration.txt** - Spot-check a few repos" - echo "4. **Plan migration window** - Schedule cutover when action items resolved" -} > "$SUMMARY_FILE" - -# ============================================================================ -# Phase 6: Print summary to console -# ============================================================================ - -echo "" -log_success "Classification complete!" -echo "" -echo "=== Summary ===" -echo "Ready for Migration: $TOTAL_READY ($PCT_READY%)" -echo " - Complete in both: ${COUNTS[ready_complete_both]}" -echo " - Archive ahead: ${COUNTS[ready_archive_ahead]}" -echo " - Deleted by user: ${COUNTS[ready_deleted]}" -echo " - Empty in prod: ${COUNTS[ready_empty_prod]}" -echo " - Archive-only: ${COUNTS[ready_archive_only]}" -echo " - Purgatory-only: ${COUNTS[ready_not_in_prod]}" -echo "" -echo "Needs Re-sync: $TOTAL_RESYNC ($PCT_RESYNC%)" -echo " - Missing from archive: ${COUNTS[resync_missing_archive]}" -echo " - Incomplete in archive: ${COUNTS[resync_incomplete_archive]}" -echo "" -echo "Manual Review: $TOTAL_REVIEW ($PCT_REVIEW%)" -echo " - Partial in prod: ${COUNTS[review_partial_prod]}" -echo " - No-match in prod: ${COUNTS[review_nomatch_prod]}" -echo " - Parse failures: ${COUNTS[review_parse_failure]}" -echo " - Git diverged: ${COUNTS[review_diverged]}" -echo "" -echo "Total: $TOTAL repos" -echo "" -echo "Output files:" -echo " $READY_FILE" -echo " $RESYNC_FILE" -echo " $REVIEW_FILE" -echo " $SUMMARY_FILE" diff --git a/docs/how-to/migration-scripts/run-migration-analysis.sh 
b/docs/how-to/migration-scripts/run-migration-analysis.sh deleted file mode 100755 index acc5e44..0000000 --- a/docs/how-to/migration-scripts/run-migration-analysis.sh +++ /dev/null @@ -1,779 +0,0 @@ -#!/usr/bin/env bash -# -# run-migration-analysis.sh - Orchestrate the complete GRASP relay to ngit-grasp migration analysis -# -# This script runs all 5 phases of the migration analysis pipeline in sequence, -# with proper error handling, progress reporting, and timing information. -# -# QUICK START: -# # Basic usage (local analysis only - Phases 1, 3, 5) -# ./run-migration-analysis.sh --prod-relay wss://relay.ngit.dev --archive-relay wss://archive.relay.ngit.dev -# -# # Full analysis including git sync check (requires VPS access) -# ./run-migration-analysis.sh \ -# --prod-relay wss://relay.ngit.dev \ -# --archive-relay wss://archive.relay.ngit.dev \ -# --prod-git /var/lib/grasp-relay/git \ -# --archive-git /var/lib/ngit-grasp/git -# -# USAGE: -# ./run-migration-analysis.sh [options] -# -# REQUIRED OPTIONS: -# --prod-relay Production relay WebSocket URL (e.g., wss://relay.ngit.dev) -# --archive-relay Archive relay WebSocket URL (e.g., wss://archive.relay.ngit.dev) -# -# OPTIONAL OPTIONS: -# --prod-git Git base directory for prod (enables Phase 2) -# --archive-git Git base directory for archive (enables Phase 2) -# --service Systemd service name for log extraction (enables Phase 4) -# --output Output directory (default: work/migration-analysis-YYYYMMDD-HHMM) -# --since Start date for log extraction (default: 30 days ago) -# --until End date for log extraction (default: now) -# -# PHASE CONTROL: -# --skip-phase-1 Skip event fetching (use existing data) -# --skip-phase-2 Skip git sync check (use existing data) -# --skip-phase-3 Skip categorization (use existing data) -# --skip-phase-4 Skip log extraction (use existing data) -# --skip-phase-5 Skip final classification -# --only-phase-N Run only phase N (1-5) -# --from-phase-N Start from phase N (skip earlier phases) -# -# 
OTHER OPTIONS: -# --dry-run Show what would be executed without running -# --continue-on-error Continue to next phase even if current phase fails -# --help Show this help message -# -# PHASES: -# Phase 1: Fetch events from both relays (~30s each, local) -# Phase 2: Check git sync status (~20 min each, requires VPS) -# Phase 3: Categorize and compare results (fast, local) -# Phase 4: Extract logs from systemd (requires VPS) -# Phase 5: Final classification (fast, local) -# -# EXAMPLES: -# # Dry run to see what would happen -# ./run-migration-analysis.sh --prod-relay wss://relay.ngit.dev --archive-relay wss://archive.relay.ngit.dev --dry-run -# -# # Run only Phase 1 (fetch events) -# ./run-migration-analysis.sh --prod-relay wss://relay.ngit.dev --archive-relay wss://archive.relay.ngit.dev --only-phase-1 -# -# # Resume from Phase 3 using existing Phase 1-2 data -# ./run-migration-analysis.sh --prod-relay wss://relay.ngit.dev --archive-relay wss://archive.relay.ngit.dev --from-phase-3 --output work/migration-analysis-20260122-1430 -# -# # Full analysis on VPS with all features -# ./run-migration-analysis.sh \ -# --prod-relay wss://relay.ngit.dev \ -# --archive-relay wss://archive.relay.ngit.dev \ -# --prod-git /var/lib/grasp-relay/git \ -# --archive-git /var/lib/ngit-grasp/git \ -# --service ngit-grasp.service -# -# SEE ALSO: -# docs/how-to/migrate-to-ngit-grasp.md - Full migration guide -# - -set -euo pipefail - -# Get script directory for finding other scripts -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Colors for output (disabled if not a terminal) -if [[ -t 1 ]]; then - RED='\033[0;31m' - GREEN='\033[0;32m' - YELLOW='\033[0;33m' - BLUE='\033[0;34m' - CYAN='\033[0;36m' - BOLD='\033[1m' - NC='\033[0m' -else - RED='' - GREEN='' - YELLOW='' - BLUE='' - CYAN='' - BOLD='' - NC='' -fi - -# Logging functions -log_header() { - echo "" - echo -e "${BOLD}${CYAN}════════════════════════════════════════════════════════════════${NC}" - echo -e 
"${BOLD}${CYAN} $*${NC}" - echo -e "${BOLD}${CYAN}════════════════════════════════════════════════════════════════${NC}" - echo "" -} - -log_phase() { - echo "" - echo -e "${BOLD}${BLUE}┌──────────────────────────────────────────────────────────────┐${NC}" - echo -e "${BOLD}${BLUE}│ $*${NC}" - echo -e "${BOLD}${BLUE}└──────────────────────────────────────────────────────────────┘${NC}" -} - -log_info() { - echo -e "${BLUE}[INFO]${NC} $*" >&2 -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $*" >&2 -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $*" >&2 -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $*" >&2 -} - -log_step() { - echo -e "${CYAN} →${NC} $*" >&2 -} - -# Default values -PROD_RELAY="" -ARCHIVE_RELAY="" -PROD_GIT="" -ARCHIVE_GIT="" -SERVICE_NAME="" -OUTPUT_DIR="" -DRY_RUN=false -CONTINUE_ON_ERROR=false -LOG_SINCE="" -LOG_UNTIL="" - -# Phase control -SKIP_PHASE_1=false -SKIP_PHASE_2=false -SKIP_PHASE_3=false -SKIP_PHASE_4=false -SKIP_PHASE_5=false -ONLY_PHASE="" -FROM_PHASE="" - -# Timing -declare -A PHASE_TIMES - -usage() { - head -73 "$0" | tail -n +3 | sed 's/^# //' | sed 's/^#//' - exit 0 -} - -# Parse command line arguments -parse_args() { - while [[ $# -gt 0 ]]; do - case "$1" in - --prod-relay) - PROD_RELAY="$2" - shift 2 - ;; - --archive-relay) - ARCHIVE_RELAY="$2" - shift 2 - ;; - --prod-git) - PROD_GIT="$2" - shift 2 - ;; - --archive-git) - ARCHIVE_GIT="$2" - shift 2 - ;; - --service) - SERVICE_NAME="$2" - shift 2 - ;; - --output) - OUTPUT_DIR="$2" - shift 2 - ;; - --skip-phase-1) - SKIP_PHASE_1=true - shift - ;; - --skip-phase-2) - SKIP_PHASE_2=true - shift - ;; - --skip-phase-3) - SKIP_PHASE_3=true - shift - ;; - --skip-phase-4) - SKIP_PHASE_4=true - shift - ;; - --skip-phase-5) - SKIP_PHASE_5=true - shift - ;; - --only-phase-1|--only-phase-2|--only-phase-3|--only-phase-4|--only-phase-5) - ONLY_PHASE="${1#--only-phase-}" - shift - ;; - --from-phase-1|--from-phase-2|--from-phase-3|--from-phase-4|--from-phase-5) - 
FROM_PHASE="${1#--from-phase-}" - shift - ;; - --dry-run) - DRY_RUN=true - shift - ;; - --continue-on-error) - CONTINUE_ON_ERROR=true - shift - ;; - --since) - LOG_SINCE="$2" - shift 2 - ;; - --until) - LOG_UNTIL="$2" - shift 2 - ;; - --help|-h) - usage - ;; - *) - log_error "Unknown option: $1" - echo "Use --help for usage information." - exit 1 - ;; - esac - done -} - -# Validate required arguments -validate_args() { - local errors=0 - - if [[ -z "$PROD_RELAY" ]]; then - log_error "Missing required option: --prod-relay" - errors=1 - fi - - if [[ -z "$ARCHIVE_RELAY" ]]; then - log_error "Missing required option: --archive-relay" - errors=1 - fi - - # Validate relay URLs - if [[ -n "$PROD_RELAY" && ! "$PROD_RELAY" =~ ^wss?:// ]]; then - log_error "Invalid prod relay URL: $PROD_RELAY (must start with ws:// or wss://)" - errors=1 - fi - - if [[ -n "$ARCHIVE_RELAY" && ! "$ARCHIVE_RELAY" =~ ^wss?:// ]]; then - log_error "Invalid archive relay URL: $ARCHIVE_RELAY (must start with ws:// or wss://)" - errors=1 - fi - - # Validate git paths if provided - if [[ -n "$PROD_GIT" && ! -d "$PROD_GIT" ]]; then - log_warn "Prod git directory not found: $PROD_GIT" - log_warn "Phase 2 will fail unless running on VPS with access to this path." - fi - - if [[ -n "$ARCHIVE_GIT" && ! -d "$ARCHIVE_GIT" ]]; then - log_warn "Archive git directory not found: $ARCHIVE_GIT" - log_warn "Phase 2 will fail unless running on VPS with access to this path." - fi - - if [[ $errors -eq 1 ]]; then - echo "" - echo "Use --help for usage information." - exit 1 - fi -} - -# Check prerequisites -check_prerequisites() { - local missing=0 - - log_info "Checking prerequisites..." 
- - # Required tools - for tool in git nak jq awk sort; do - if command -v "$tool" &> /dev/null; then - log_step "$tool: found" - else - log_error "$tool: NOT FOUND" - missing=1 - fi - done - - # Optional tools - if command -v journalctl &> /dev/null; then - log_step "journalctl: found (Phase 4 available)" - else - log_step "journalctl: not found (Phase 4 will be skipped)" - SKIP_PHASE_4=true - fi - - if [[ $missing -eq 1 ]]; then - log_error "Missing required tools. Install them and try again." - exit 1 - fi - - # Check scripts exist - for script in 01-fetch-events.sh 10-check-git-sync.sh 20-categorize.sh 21-compare-relays.sh 22-compare-git-data.sh 30-extract-parse-failures.sh 31-extract-purgatory-expiry.sh 40-classify-actions.sh; do - if [[ ! -x "$SCRIPT_DIR/$script" ]]; then - log_error "Script not found or not executable: $SCRIPT_DIR/$script" - missing=1 - fi - done - - if [[ $missing -eq 1 ]]; then - exit 1 - fi - - log_success "All prerequisites satisfied" -} - -# Determine which phases to run -determine_phases() { - # Handle --only-phase-N - if [[ -n "$ONLY_PHASE" ]]; then - for i in 1 2 3 4 5; do - if [[ "$i" != "$ONLY_PHASE" ]]; then - eval "SKIP_PHASE_$i=true" - fi - done - fi - - # Handle --from-phase-N - if [[ -n "$FROM_PHASE" ]]; then - for i in 1 2 3 4 5; do - if [[ "$i" -lt "$FROM_PHASE" ]]; then - eval "SKIP_PHASE_$i=true" - fi - done - fi - - # Auto-skip Phase 2 if git paths not provided - if [[ -z "$PROD_GIT" && -z "$ARCHIVE_GIT" ]]; then - if [[ "$SKIP_PHASE_2" != "true" ]]; then - log_warn "No git paths provided. Phase 2 (git sync check) will be skipped." - log_warn "Use --prod-git and --archive-git to enable Phase 2." - SKIP_PHASE_2=true - fi - fi - - # Auto-skip Phase 4 if service not provided - if [[ -z "$SERVICE_NAME" ]]; then - if [[ "$SKIP_PHASE_4" != "true" ]]; then - log_warn "No service name provided. Phase 4 (log extraction) will be skipped." - log_warn "Use --service to enable Phase 4." 
- SKIP_PHASE_4=true - fi - fi -} - -# Setup output directory -setup_output_dir() { - if [[ -z "$OUTPUT_DIR" ]]; then - OUTPUT_DIR="work/migration-analysis-$(date +%Y%m%d-%H%M)" - fi - - log_info "Output directory: $OUTPUT_DIR" - - if [[ "$DRY_RUN" == "true" ]]; then - log_info "[DRY RUN] Would create directory structure" - return - fi - - mkdir -p "$OUTPUT_DIR"/{prod/raw,archive/raw,comparison,logs,results} - - # Save configuration - cat > "$OUTPUT_DIR/config.txt" << EOF -# Migration Analysis Configuration -# Generated: $(date -Iseconds) - -PROD_RELAY=$PROD_RELAY -ARCHIVE_RELAY=$ARCHIVE_RELAY -PROD_GIT=$PROD_GIT -ARCHIVE_GIT=$ARCHIVE_GIT -SERVICE_NAME=$SERVICE_NAME -OUTPUT_DIR=$OUTPUT_DIR -EOF - - log_success "Created output directory structure" -} - -# Run a phase with timing and error handling -run_phase() { - local phase_num="$1" - local phase_name="$2" - shift 2 - local cmd=("$@") - - local skip_var="SKIP_PHASE_$phase_num" - if [[ "${!skip_var}" == "true" ]]; then - log_phase "Phase $phase_num: $phase_name [SKIPPED]" - return 0 - fi - - log_phase "Phase $phase_num: $phase_name" - - if [[ "$DRY_RUN" == "true" ]]; then - log_info "[DRY RUN] Would execute:" - for c in "${cmd[@]}"; do - echo " $c" - done - return 0 - fi - - local start_time - start_time=$(date +%s) - - local exit_code=0 - - # Execute the command(s) - for c in "${cmd[@]}"; do - log_step "Running: $c" - if ! 
eval "$c"; then - exit_code=1 - if [[ "$CONTINUE_ON_ERROR" == "true" ]]; then - log_warn "Command failed, continuing due to --continue-on-error" - else - log_error "Command failed" - break - fi - fi - done - - local end_time - end_time=$(date +%s) - local duration=$((end_time - start_time)) - PHASE_TIMES[$phase_num]=$duration - - if [[ $exit_code -eq 0 ]]; then - log_success "Phase $phase_num completed in ${duration}s" - else - log_error "Phase $phase_num failed after ${duration}s" - if [[ "$CONTINUE_ON_ERROR" != "true" ]]; then - return 1 - fi - fi - - return $exit_code -} - -# Phase 1: Fetch events -run_phase_1() { - local cmds=() - - # Fetch from prod relay - cmds+=("'$SCRIPT_DIR/01-fetch-events.sh' '$PROD_RELAY' '$OUTPUT_DIR/prod'") - - # Fetch from archive relay - cmds+=("'$SCRIPT_DIR/01-fetch-events.sh' '$ARCHIVE_RELAY' '$OUTPUT_DIR/archive'") - - run_phase 1 "Fetch Events (~30s each)" "${cmds[@]}" -} - -# Phase 2: Git sync check -run_phase_2() { - local cmds=() - - if [[ -n "$PROD_GIT" ]]; then - cmds+=("'$SCRIPT_DIR/10-check-git-sync.sh' '$OUTPUT_DIR/prod/raw/state-events.json' '$PROD_GIT' '$OUTPUT_DIR/prod' --categorize") - else - log_warn "Skipping prod git sync check (no --prod-git provided)" - fi - - if [[ -n "$ARCHIVE_GIT" ]]; then - cmds+=("'$SCRIPT_DIR/10-check-git-sync.sh' '$OUTPUT_DIR/archive/raw/state-events.json' '$ARCHIVE_GIT' '$OUTPUT_DIR/archive' --categorize") - else - log_warn "Skipping archive git sync check (no --archive-git provided)" - fi - - if [[ ${#cmds[@]} -eq 0 ]]; then - log_warn "No git paths provided, skipping Phase 2" - return 0 - fi - - run_phase 2 "Git Sync Check (~20 min each)" "${cmds[@]}" -} - -# Phase 3: Categorize and compare -run_phase_3() { - local cmds=() - - # Check if we have git-sync-status.tsv files (from Phase 2) - # If not, we can't run categorization - local has_prod_sync=false - local has_archive_sync=false - - if [[ -f "$OUTPUT_DIR/prod/git-sync-status.tsv" ]]; then - has_prod_sync=true - fi - - if [[ -f 
"$OUTPUT_DIR/archive/git-sync-status.tsv" ]]; then - has_archive_sync=true - fi - - # Run categorization if we have sync data but no category files - if [[ "$has_prod_sync" == "true" && ! -f "$OUTPUT_DIR/prod/category1-complete-match.txt" ]]; then - cmds+=("'$SCRIPT_DIR/20-categorize.sh' '$OUTPUT_DIR/prod/git-sync-status.tsv' '$OUTPUT_DIR/prod'") - fi - - if [[ "$has_archive_sync" == "true" && ! -f "$OUTPUT_DIR/archive/category1-complete-match.txt" ]]; then - cmds+=("'$SCRIPT_DIR/20-categorize.sh' '$OUTPUT_DIR/archive/git-sync-status.tsv' '$OUTPUT_DIR/archive'") - fi - - # Run comparison if we have category files - if [[ -f "$OUTPUT_DIR/prod/category1-complete-match.txt" && -f "$OUTPUT_DIR/archive/category1-complete-match.txt" ]]; then - cmds+=("'$SCRIPT_DIR/21-compare-relays.sh' '$OUTPUT_DIR/prod' '$OUTPUT_DIR/archive' '$OUTPUT_DIR/comparison'") - else - log_warn "Missing category files for comparison." - log_warn "Phase 2 must complete successfully before Phase 3 can compare relays." - - # Create placeholder comparison files if they don't exist - if [[ "$DRY_RUN" != "true" ]]; then - mkdir -p "$OUTPUT_DIR/comparison" - for f in complete-in-both.txt complete-prod-missing-archive.txt complete-prod-incomplete-archive.txt incomplete-in-both.txt in-archive-not-prod.txt; do - if [[ ! 
-f "$OUTPUT_DIR/comparison/$f" ]]; then
                    echo "# Placeholder - Phase 2 data not available" > "$OUTPUT_DIR/comparison/$f"
                fi
            done
            echo "# Comparison not available - Phase 2 data missing" > "$OUTPUT_DIR/comparison/summary.txt"
        fi
    fi

    if [[ ${#cmds[@]} -eq 0 ]]; then
        log_warn "No categorization or comparison needed (already done or missing input)"
        return 0
    fi

    run_phase 3 "Categorize & Compare (fast)" "${cmds[@]}"

    # Phase 3c: Compare git data between relays (requires git paths)
    # This determines if archive is ahead of prod for repos with mismatched state
    if [[ -n "$PROD_GIT" && -n "$ARCHIVE_GIT" ]]; then
        # Build list of repos to compare: those where prod=complete but archive is not
        local repos_to_compare="$OUTPUT_DIR/comparison/complete-prod-incomplete-archive.txt"
        if [[ -f "$repos_to_compare" ]] && [[ ! -f "$OUTPUT_DIR/comparison/git-ancestry.tsv" ]]; then
            log_info "Running git ancestry comparison (Phase 3c)..."
            run_phase 3 "Git Ancestry Comparison" "'$SCRIPT_DIR/22-compare-git-data.sh' '$PROD_GIT' '$ARCHIVE_GIT' '$repos_to_compare' '$OUTPUT_DIR/comparison'"
        fi
    else
        log_warn "Git paths not provided - skipping git ancestry comparison"
        log_warn "Without git comparison, repos where archive is ahead will be incorrectly flagged as needing re-sync"
    fi
}

# Phase 4: Extract logs
#
# Queues the parse-failure and purgatory-expiry extraction scripts for
# SERVICE_NAME and hands them to run_phase.
#
# Globals read: SERVICE_NAME, LOG_SINCE, LOG_UNTIL, SCRIPT_DIR, OUTPUT_DIR
# Returns:
#   0 - no service name was given (phase skipped)
#   1 - SERVICE_NAME contains "ngit-relay"
#   otherwise the status of run_phase
run_phase_4() {
    if [[ -z "$SERVICE_NAME" ]]; then
        log_warn "No service name provided, skipping Phase 4"
        return 0
    fi

    # Validate service name before running Phase 4
    # Structured logging only exists in ngit-grasp, not ngit-relay
    if [[ "$SERVICE_NAME" == *"ngit-relay"* ]]; then
        log_error "SERVICE_NAME appears to be ngit-relay: $SERVICE_NAME"
        log_error ""
        log_error "Phase 4 requires an ngit-grasp service with structured logging."
        log_error "Structured logging ([PARSE_FAIL], [PURGATORY_EXPIRED]) only exists"
        log_error "in ngit-grasp services, NOT in ngit-relay services."
        log_error ""
        log_error "Please update --service to use the ngit-grasp archive service."
        log_error ""
        log_error "To find the correct service name:"
        log_error " systemctl list-units 'ngit-grasp*' --all"
        log_error ""
        log_error "Common ngit-grasp service names:"
        log_error " - ngit-grasp.service"
        log_error " - ngit-grasp-relay-ngit-dev.service (NixOS multi-instance)"
        log_error " - ngit-grasp-archive.service"
        return 1
    fi

    # Warn if service name doesn't look like ngit-grasp.
    # Any name containing "ngit-grasp" also contains "grasp", so a single
    # pattern test covers both of the original checks.
    if [[ "$SERVICE_NAME" != *"grasp"* ]]; then
        log_warn "SERVICE_NAME doesn't contain 'ngit-grasp': $SERVICE_NAME"
        log_warn "Structured logging only exists in ngit-grasp services."
        log_warn "If this is not an ngit-grasp service, Phase 4 will find no logs."
    fi

    local cmds=()

    # Build log extraction options
    # NOTE(review): the values are embedded between single quotes inside a
    # command string, so a LOG_SINCE/LOG_UNTIL containing a single quote would
    # break the quoting - presumably run_phase re-parses these strings; confirm.
    local log_opts=""
    if [[ -n "$LOG_SINCE" ]]; then
        log_opts="$log_opts --since '$LOG_SINCE'"
    fi
    if [[ -n "$LOG_UNTIL" ]]; then
        log_opts="$log_opts --until '$LOG_UNTIL'"
    fi

    cmds+=("'$SCRIPT_DIR/30-extract-parse-failures.sh' '$SERVICE_NAME' '$OUTPUT_DIR/logs' $log_opts")
    cmds+=("'$SCRIPT_DIR/31-extract-purgatory-expiry.sh' '$SERVICE_NAME' '$OUTPUT_DIR/logs' $log_opts")

    run_phase 4 "Extract Logs (VPS required)" "${cmds[@]}"
}

# Phase 5: Final classification
#
# Requires the prod, archive and comparison directories under OUTPUT_DIR.
# Unless DRY_RUN is "true", missing log files are created as one-line
# placeholders first. That happens even when the directory check fails
# afterwards.
#
# Globals read: OUTPUT_DIR, DRY_RUN, SCRIPT_DIR
# Returns: 1 when a required directory is missing, otherwise the status of
#          run_phase
run_phase_5() {
    # Check if we have the minimum required files
    local can_run=true

    if [[ ! -d "$OUTPUT_DIR/prod" ]]; then
        log_warn "Missing prod directory"
        can_run=false
    fi

    if [[ ! -d "$OUTPUT_DIR/archive" ]]; then
        log_warn "Missing archive directory"
        can_run=false
    fi

    if [[ ! -d "$OUTPUT_DIR/comparison" ]]; then
        log_warn "Missing comparison directory"
        can_run=false
    fi

    # Create logs directory with empty files if missing
    if [[ "$DRY_RUN" != "true" ]]; then
        mkdir -p "$OUTPUT_DIR/logs"
        local f
        for f in parse-failures.txt purgatory-expired.txt; do
            if [[ ! -f "$OUTPUT_DIR/logs/$f" ]]; then
                echo "# No data - Phase 4 not run" > "$OUTPUT_DIR/logs/$f"
            fi
        done
    fi

    if [[ "$can_run" == "false" ]]; then
        log_error "Cannot run Phase 5 - missing required input directories"
        return 1
    fi

    run_phase 5 "Final Classification (fast)" "'$SCRIPT_DIR/40-classify-actions.sh' '$OUTPUT_DIR'"
}

# Print the first line of a file that contains a heading, plus the line
# immediately after it (the same two lines `grep -A1 ... | head -2` yields).
#
# Arguments:
#   $1 - file to read
#   $2 - literal heading text to look for
#   $3 - optional literal text; lines containing it are not treated as a match
#
# The exclusion exists because "Action Required" is a substring of
# "No Action Required": without it, a search for the former returns the
# latter whenever it appears earlier in the file.
_print_summary_section() {
    local file="$1"
    local heading="$2"
    local exclude="${3:-}"

    awk -v want="$heading" -v skip="$exclude" '
        index($0, want) && !(skip != "" && index($0, skip)) {
            print
            if ((getline following) > 0) {
                print following
            }
            exit
        }
    ' "$file"
}

# Display summary
#
# Prints the output directory, per-phase timings (SKIPPED / <n>s / N/A) with a
# total, selected sections of results/summary.txt when that file exists, the
# list of result files, and suggested next steps.
#
# Globals read: OUTPUT_DIR, SKIP_PHASE_1..5, PHASE_TIMES
display_summary() {
    log_header "Migration Analysis Complete"

    echo "Output Directory: $OUTPUT_DIR"
    echo ""

    # Phase timing summary
    echo "Phase Timing:"
    local total_time=0
    local phase
    for phase in 1 2 3 4 5; do
        local skip_var="SKIP_PHASE_$phase"
        # ":-" keeps the indirect lookup safe if a flag was never assigned
        if [[ "${!skip_var:-}" == "true" ]]; then
            echo " Phase $phase: SKIPPED"
        elif [[ -n "${PHASE_TIMES[$phase]:-}" ]]; then
            local t="${PHASE_TIMES[$phase]}"
            echo " Phase $phase: ${t}s"
            total_time=$((total_time + t))
        else
            echo " Phase $phase: N/A"
        fi
    done
    echo " ─────────────"
    echo " Total: ${total_time}s"
    echo ""

    # Results summary
    if [[ -f "$OUTPUT_DIR/results/summary.txt" ]]; then
        echo "Results Summary:"
        echo ""
        # Extract key metrics from summary
        _print_summary_section "$OUTPUT_DIR/results/summary.txt" "No Action Required"
        _print_summary_section "$OUTPUT_DIR/results/summary.txt" "Action Required" "No Action Required"
        _print_summary_section "$OUTPUT_DIR/results/summary.txt" "Manual Investigation"
        echo ""
    fi

    # Output files
    echo "Output Files:"
    echo " $OUTPUT_DIR/results/no-action-required.txt"
    echo " $OUTPUT_DIR/results/action-required.txt"
    echo " $OUTPUT_DIR/results/manual-investigation.txt"
    echo " $OUTPUT_DIR/results/summary.txt"
    echo ""

    # Next steps
    echo "Next Steps:"
    echo " 1. Review results/summary.txt for overview"
    echo " 2. Address items in results/action-required.txt"
    echo " 3. Investigate items in results/manual-investigation.txt"
    echo " 4. Plan migration window when action items are resolved"
    echo ""
}

# Main
#
# Parses arguments, validates them, prints the configuration and phase plan,
# then runs phases 1-5 in order. A failing phase does not stop later phases;
# it only makes the final exit status 1. The summary is skipped in dry-run
# mode.
main() {
    parse_args "$@"

    log_header "GRASP Relay to ngit-grasp Migration Analysis"

    validate_args
    check_prerequisites
    determine_phases
    setup_output_dir

    # Show configuration
    log_info "Configuration:"
    log_step "Prod relay: $PROD_RELAY"
    log_step "Archive relay: $ARCHIVE_RELAY"
    if [[ -n "$PROD_GIT" ]]; then
        log_step "Prod git: $PROD_GIT"
    fi
    if [[ -n "$ARCHIVE_GIT" ]]; then
        log_step "Archive git: $ARCHIVE_GIT"
    fi
    if [[ -n "$SERVICE_NAME" ]]; then
        log_step "Service: $SERVICE_NAME"
    fi
    log_step "Output: $OUTPUT_DIR"
    echo ""

    # Show phase plan
    log_info "Phase Plan:"
    local phase
    for phase in 1 2 3 4 5; do
        local skip_var="SKIP_PHASE_$phase"
        if [[ "${!skip_var:-}" == "true" ]]; then
            log_step "Phase $phase: SKIP"
        else
            log_step "Phase $phase: RUN"
        fi
    done
    echo ""

    if [[ "$DRY_RUN" == "true" ]]; then
        log_warn "DRY RUN MODE - No changes will be made"
        echo ""
    fi

    # Run phases
    local overall_exit=0

    run_phase_1 || overall_exit=1
    run_phase_2 || overall_exit=1
    run_phase_3 || overall_exit=1
    run_phase_4 || overall_exit=1
    run_phase_5 || overall_exit=1

    # Display summary
    if [[ "$DRY_RUN" != "true" ]]; then
        display_summary
    fi

    if [[ $overall_exit -ne 0 ]]; then
        log_warn "Some phases failed. Review output for details."
    fi

    exit "$overall_exit"
}

main "$@"
diff --git a/docs/how-to/migration-scripts/validate-service.sh b/docs/how-to/migration-scripts/validate-service.sh
deleted file mode 100755
index 6988af3..0000000
--- a/docs/how-to/migration-scripts/validate-service.sh
+++ /dev/null
@@ -1,151 +0,0 @@
#!/usr/bin/env bash
#
# validate-service.sh - Validate service name for structured logging
#
# This helper script validates that a service name is appropriate for
# Phase 4 log extraction.
# Structured logging ([PARSE_FAIL], [PURGATORY_EXPIRED])
# only exists in ngit-grasp services, NOT in ngit-relay services.
#
# USAGE:
#   Source this script and call the validation function:
#
#     source validate-service.sh
#     validate_service_for_structured_logging "$SERVICE_NAME" || exit 1
#
# BACKGROUND:
#   Phase 4 of the migration analysis extracts structured log entries from
#   journald. These log entries only exist in ngit-grasp services. If you
#   accidentally specify an ngit-relay service, Phase 4 will find no logs
#   and produce empty results.
#
#   This validation prevents that common mistake by:
#   1. Checking if the service name contains "ngit-relay" (error)
#   2. Warning if the service name doesn't contain "ngit-grasp"
#   3. Optionally checking if structured logs actually exist
#
# SEE ALSO:
#   docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
#   30-extract-parse-failures.sh - Uses this validation
#   31-extract-purgatory-expiry.sh - Uses this validation
#

# Colors for output (disabled if not a terminal)
if [[ -t 1 ]]; then
    _VS_RED='\033[0;31m'
    _VS_YELLOW='\033[0;33m'
    _VS_NC='\033[0m'
else
    _VS_RED=''
    _VS_YELLOW=''
    _VS_NC=''
fi

# Decide whether to carry on after a warning has been printed.
#
# Arguments:
#   $1 - interactive: "true" to ask on stdin, anything else to proceed
#
# Returns:
#   0 - proceed (non-interactive mode, or the user typed y/Y)
#   1 - the user declined, or no answer could be read (e.g. stdin at EOF)
_vs_confirm_continue() {
    local interactive="$1"
    local reply=""

    if [[ "$interactive" != "true" ]]; then
        echo "Non-interactive mode: proceeding despite warning" >&2
        return 0
    fi

    # A failed read (EOF) leaves the reply empty, which counts as "no".
    read -p "Continue anyway? (y/N) " -n 1 -r reply || true
    echo
    [[ "$reply" =~ ^[Yy]$ ]]
}

# Validates that the service name is appropriate for structured logging
#
# Arguments:
#   $1 - service_name: The systemd service name to validate
#   $2 - check_logs: Whether to check if logs actually exist (default: "true")
#   $3 - interactive: Whether to prompt for confirmation (default: "true")
#
# Returns:
#   0 - Service is valid for structured logging
#   1 - Service is invalid (empty, or contains "ngit-relay") or user declined
#       to continue
#
# All diagnostics go to stderr. The colour variables are read with an empty
# default because `export -f` exports only functions, not the _VS_* variables
# this file sets.
#
# Example:
#   validate_service_for_structured_logging "ngit-grasp.service" || exit 1
#   validate_service_for_structured_logging "ngit-grasp.service" "false" # Skip log check
#   validate_service_for_structured_logging "ngit-grasp.service" "true" "false" # Non-interactive
#
validate_service_for_structured_logging() {
    local service_name="${1:-}"
    local check_logs="${2:-true}"
    local interactive="${3:-true}"

    # An empty name cannot identify a unit, so there is nothing to validate.
    if [[ -z "$service_name" ]]; then
        printf '%b%s%b\n' "${_VS_RED:-}" "ERROR: No service name provided" "${_VS_NC:-}" >&2
        return 1
    fi

    # Check if service name looks like ngit-relay (ERROR - wrong service type)
    if [[ "$service_name" == *"ngit-relay"* ]]; then
        printf '%b%s%b\n' "${_VS_RED:-}" "ERROR: Service name appears to be ngit-relay: $service_name" "${_VS_NC:-}" >&2
        echo "" >&2
        echo "Structured logging ([PARSE_FAIL], [PURGATORY_EXPIRED]) only exists in" >&2
        echo "ngit-grasp services, NOT in ngit-relay services." >&2
        echo "" >&2
        echo "Please use the ngit-grasp archive service instead." >&2
        echo "" >&2
        echo "To find the correct service name:" >&2
        echo " systemctl list-units 'ngit-grasp*' --all" >&2
        echo "" >&2
        echo "Common ngit-grasp service names:" >&2
        echo " - ngit-grasp.service" >&2
        echo " - ngit-grasp-relay-ngit-dev.service (NixOS multi-instance)" >&2
        echo " - ngit-grasp-archive.service" >&2
        return 1
    fi

    # Check if service name looks like ngit-grasp (WARNING if not).
    # Any name containing "ngit-grasp" also contains "grasp", so one pattern
    # test is enough.
    if [[ "$service_name" != *"grasp"* ]]; then
        printf '%b%s%b\n' "${_VS_YELLOW:-}" "WARNING: Service name doesn't contain 'ngit-grasp': $service_name" "${_VS_NC:-}" >&2
        echo "" >&2
        echo "Structured logging ([PARSE_FAIL], [PURGATORY_EXPIRED]) only exists in" >&2
        echo "ngit-grasp services." >&2
        echo "" >&2

        _vs_confirm_continue "$interactive" || return 1
    fi

    # Optionally check if structured logs actually exist
    if [[ "$check_logs" == "true" ]]; then
        # Check if journalctl is available
        if ! command -v journalctl &> /dev/null; then
            printf '%b%s%b\n' "${_VS_YELLOW:-}" "WARNING: journalctl not available, cannot verify logs exist" "${_VS_NC:-}" >&2
            return 0
        fi

        # Check for structured log entries in a single pass over the journal.
        # IMPORTANT: Use --no-pager to prevent hanging when run non-interactively (e.g., via SSH)
        # grep -c prints the count itself and exits non-zero when it is 0, so
        # the status is discarded rather than used to append a fallback value.
        local structured_count
        structured_count=$(journalctl --no-pager -u "$service_name" --since "7 days ago" 2>/dev/null \
            | grep -c -E '\[PARSE_FAIL\]|\[PURGATORY_EXPIRED\]') || true
        structured_count="${structured_count:-0}"

        if [[ "$structured_count" -eq 0 ]]; then
            printf '%b%s%b\n' "${_VS_YELLOW:-}" "WARNING: No structured logs found in $service_name (last 7 days)" "${_VS_NC:-}" >&2
            echo "" >&2
            echo "This may indicate:" >&2
            echo " 1. Wrong service (should be ngit-grasp archive service, not ngit-relay)" >&2
            echo " 2. Structured logging not yet deployed to this ngit-grasp instance" >&2
            echo " 3. No parse failures or purgatory expiry events in the time window" >&2
            echo "" >&2
            echo "To verify you have the right service:" >&2
            echo " systemctl list-units 'ngit-grasp*' --all" >&2
            echo " journalctl -u $service_name | grep -E '\\[PARSE_FAIL\\]|\\[PURGATORY_EXPIRED\\]' | head -5" >&2
            echo "" >&2

            _vs_confirm_continue "$interactive" || return 1
        fi
    fi

    return 0
}

# Export the functions so they can be used after sourcing
export -f _vs_confirm_continue
export -f validate_service_for_structured_logging
# -- cgit v1.2.3