From a5504395c946bdf28b5ad0e0148ff371ca33d4d3 Mon Sep 17 00:00:00 2001
From: DanConwayDev <DanConwayDev@protonmail.com>
Date: Fri, 23 Jan 2026 11:06:12 +0000
Subject: Add Phase 3 migration scripts for categorization and comparison

- 20-categorize.sh: Categorizes git sync status into 4 categories
- 21-compare-relays.sh: Compares prod vs archive to find gaps
- Updated how-to doc with detailed Phase 3 outputs and directory structure
- Tested with Jan 22 data: 231 complete in both, 276 complete in prod but missing from archive
---
 docs/how-to/migration-scripts/20-categorize.sh     | 212 +++++++++++++++
 docs/how-to/migration-scripts/21-compare-relays.sh | 294 +++++++++++++++++++++
 2 files changed, 506 insertions(+)
 create mode 100755 docs/how-to/migration-scripts/20-categorize.sh
 create mode 100755 docs/how-to/migration-scripts/21-compare-relays.sh

(limited to 'docs/how-to/migration-scripts')
diff --git a/docs/how-to/migration-scripts/20-categorize.sh b/docs/how-to/migration-scripts/20-categorize.sh
new file mode 100755
index 0000000..f47eb55
--- /dev/null
+++ b/docs/how-to/migration-scripts/20-categorize.sh
@@ -0,0 +1,212 @@
+#!/usr/bin/env bash
+#
+# 20-categorize.sh - Categorize git sync status into 4 categories
+#
+# PHASE 3a of the ngit-relay to ngit-grasp migration analysis pipeline.
+# Takes git-sync-status.tsv from Phase 2 and categorizes into 4 files.
+#
+# USAGE:
+#   ./20-categorize.sh <git-sync-status.tsv> <output-dir>
+#
+# EXAMPLES:
+#   ./20-categorize.sh output/prod/git-sync-status.tsv output/prod
+#   ./20-categorize.sh output/archive/git-sync-status.tsv output/archive
+#
+# INPUT FORMAT (git-sync-status.tsv):
+#   Tab-separated values with columns:
+#   repo<TAB>npub<TAB>state_refs<TAB>git_refs<TAB>matches<TAB>reason
+#
+#   Where reason is optional and can be: no_git_dir, empty_refs, no_state_refs
+#
+# OUTPUT:
+#   <output-dir>/category1-complete-match.txt  - All refs match perfectly
+#   <output-dir>/category2-empty-blank.txt     - No git data available
+#   <output-dir>/category3-partial-match.txt   - Some refs match
+#   <output-dir>/category4-no-match.txt        - Git exists but refs don't match
+#
+# OUTPUT FORMAT:
+#   repo | npub | state_refs=N | git_refs=N | matches=N [| reason=X]
+#
+# CATEGORIES (Empty/Blank is tested first, so it wins over the others):
+#   1. Complete Match: state_refs == git_refs == matches (all > 0)
+#   2. Empty/Blank: git_refs == 0 OR reason in (no_git_dir, empty_refs, no_state_refs)
+#   3. Partial Match: matches > 0 but not a complete match (e.g. matches < state_refs, or git has extra refs)
+#   4. No Match: git_refs > 0 AND matches == 0
+#
+# PREREQUISITES:
+#   - awk (standard Unix tool)
+#
+# RUNTIME: < 1 second (local processing only)
+#
+# SEE ALSO:
+#   docs/how-to/migrate-ngit-relay-to-ngit-grasp.md - Full migration guide
+#   10-check-git-sync.sh - Phase 2 script that produces input for this script
+#
+
+set -euo pipefail
+
+# Colors for log output (every log_* helper writes to stderr, so test fd 2)
+if [[ -t 2 ]]; then
+    RED='\033[0;31m'
+    GREEN='\033[0;32m'
+    YELLOW='\033[0;33m'
+    BLUE='\033[0;34m'
+    NC='\033[0m'
+else
+    RED=''
+    GREEN=''
+    YELLOW=''
+    BLUE=''
+    NC=''
+fi
+
+log_info() {  # informational message, tagged [INFO], written to stderr
+    printf '%b\n' "${BLUE}[INFO]${NC} $*" >&2
+}
+
+log_success() {  # success message, tagged [OK], written to stderr
+    printf '%b\n' "${GREEN}[OK]${NC} $*" >&2
+}
+
+log_warn() {  # warning message, tagged [WARN], written to stderr
+    printf '%b\n' "${YELLOW}[WARN]${NC} $*" >&2
+}
+
+log_error() {  # error message, tagged [ERROR], written to stderr
+    printf '%b\n' "${RED}[ERROR]${NC} $*" >&2
+}
+
+usage() {
+    # Show the command-line help on stdout and terminate with status 1.
+    # Each printf argument becomes one output line; "" yields a blank line.
+    # $0 is expanded so the examples show the name the script was run as.
+    printf '%s\n' \
+        "Usage: $0 <git-sync-status.tsv> <output-dir>" "" \
+        "Arguments:" \
+        "  git-sync-status.tsv  TSV file from Phase 2 (10-check-git-sync.sh)" \
+        "  output-dir           Directory to store categorized output" "" \
+        "Examples:" \
+        "  $0 output/prod/git-sync-status.tsv output/prod" \
+        "  $0 output/archive/git-sync-status.tsv output/archive" "" \
+        "Input format (TSV):" \
+        "  repo<TAB>npub<TAB>state_refs<TAB>git_refs<TAB>matches<TAB>reason" "" \
+        "Output files:" \
+        "  category1-complete-match.txt  - All refs match" \
+        "  category2-empty-blank.txt     - No git data" \
+        "  category3-partial-match.txt   - Some refs match" \
+        "  category4-no-match.txt        - Git exists, refs don't match"
+    exit 1
+}
+
+# main <git-sync-status.tsv> <output-dir>
+# Splits the Phase 2 TSV into the four category files and logs a summary.
+# Exits 1 when the argument count is wrong or the input file does not exist.
+main() {
+    if [[ $# -ne 2 ]]; then
+        usage
+    fi
+
+    local input_file="$1"
+    local output_dir="$2"
+
+    # Validate input file
+    if [[ ! -f "$input_file" ]]; then
+        log_error "Input file not found: $input_file"
+        exit 1
+    fi
+
+    log_info "Categorizing git sync status"
+    log_info "Input: $input_file"
+    log_info "Output: $output_dir"
+
+    # Create output directory
+    mkdir -p "$output_dir"
+
+    # Output files
+    local cat1="$output_dir/category1-complete-match.txt"
+    local cat2="$output_dir/category2-empty-blank.txt"
+    local cat3="$output_dir/category3-partial-match.txt"
+    local cat4="$output_dir/category4-no-match.txt"
+
+    # Clear previous results (awk appends, so stale lines would otherwise remain)
+    : > "$cat1"
+    : > "$cat2"
+    : > "$cat3"
+    : > "$cat4"
+
+    # One awk pass does the classification. The stderr of awk is left alone so
+    # any awk error reaches the user; stdout carries only the final COUNTS line.
+    awk -F'\t' -v cat1="$cat1" -v cat2="$cat2" -v cat3="$cat3" -v cat4="$cat4" '
+    BEGIN {
+        count1 = 0; count2 = 0; count3 = 0; count4 = 0; skipped = 0
+    }
+    # Skip the first row only when it is literally the documented header;
+    # a data row whose repo merely starts with "repo" must not be dropped.
+    NR == 1 && $1 == "repo" && $2 == "npub" { next }
+    NF > 0 && NF < 5 { skipped++; next }  # malformed row: count it, do not hide it
+    NF >= 5 {
+        repo = $1
+        npub = $2
+        state_refs = int($3)
+        git_refs = int($4)
+        matches = int($5)
+        reason = (NF >= 6) ? $6 : ""
+
+        # Format output line (the reason suffix is added only when present)
+        line = repo " | " npub " | state_refs=" state_refs " | git_refs=" git_refs " | matches=" matches
+        if (reason != "") {
+            line = line " | reason=" reason
+        }
+
+        # Categorize (order matters: the empty/blank test wins over the rest)
+        if (reason == "no_git_dir" || reason == "empty_refs" || reason == "no_state_refs" || git_refs == 0) {
+            # Category 2: Empty/Blank
+            print line >> cat2
+            count2++
+        } else if (state_refs > 0 && state_refs == git_refs && matches == state_refs) {
+            # Category 1: Complete Match
+            print line >> cat1
+            count1++
+        } else if (matches > 0) {
+            # Category 3: Partial Match. Covers matches < state_refs and also
+            # the case where every state ref matches but git has extra refs.
+            print line >> cat3
+            count3++
+        } else if (git_refs > 0 && matches == 0) {
+            # Category 4: No Match
+            print line >> cat4
+            count4++
+        } else {
+            # Fallback: treat as category 2 (empty/blank)
+            print line >> cat2
+            count2++
+        }
+    }
+    END {
+        total = count1 + count2 + count3 + count4
+        print "COUNTS:" count1 ":" count2 ":" count3 ":" count4 ":" total ":" skipped
+    }
+    ' "$input_file" | while IFS= read -r line; do
+        if [[ "$line" =~ ^COUNTS: ]]; then
+            # Parse counts from awk output
+            IFS=':' read -r _ c1 c2 c3 c4 total skipped <<< "$line"
+
+            echo ""
+            log_info "=== Categorization Summary ==="
+            log_info "Total entries: $total"
+            log_success "Category 1 (Complete Match): $c1"
+            log_warn "Category 2 (Empty/Blank): $c2"
+            log_warn "Category 3 (Partial Match): $c3"
+            log_error "Category 4 (No Match): $c4"
+            [[ "$skipped" -eq 0 ]] || log_warn "Skipped malformed lines (fewer than 5 fields): $skipped"
+            echo ""
+            log_info "Output files:"
+            echo "  $cat1"
+            echo "  $cat2"
+            echo "  $cat3"
+            echo "  $cat4"
+        fi
+    done
+}
+
+main "$@"
diff --git a/docs/how-to/migration-scripts/21-compare-relays.sh b/docs/how-to/migration-scripts/21-compare-relays.sh
new file mode 100755
index 0000000..6b40dc8
--- /dev/null
+++ b/docs/how-to/migration-scripts/21-compare-relays.sh
@@ -0,0 +1,294 @@
+#!/usr/bin/env bash
+#
+# 21-compare-relays.sh - Compare prod vs archive category files to find gaps
+#
+# PHASE 3b of the ngit-relay to ngit-grasp migration analysis pipeline.
+# Compares categorized output from prod and archive to identify:
+# - Repos complete in prod but missing/incomplete in archive
+# - Repos in archive but not in prod
+# - Status differences between relays
+#
+# USAGE:
+#   ./21-compare-relays.sh <prod-dir> <archive-dir> <output-dir>
+#
+# EXAMPLES:
+#   ./21-compare-relays.sh output/prod output/archive output/comparison
+#
+# INPUT:
+#   Both prod-dir and archive-dir must contain:
+#   - category1-complete-match.txt
+#   - category2-empty-blank.txt
+#   - category3-partial-match.txt
+#   - category4-no-match.txt
+#
+# OUTPUT:
+#   <output-dir>/complete-in-both.txt           - Repos complete in both relays (no action)
+#   <output-dir>/complete-prod-missing-archive.txt - Complete in prod, absent from every archive category
+#   <output-dir>/complete-prod-incomplete-archive.txt - Complete in prod, incomplete in archive
+#   <output-dir>/incomplete-in-both.txt         - Incomplete in both relays
+#   <output-dir>/in-archive-not-prod.txt        - In archive but not in prod
+#   <output-dir>/summary.txt                    - Human-readable summary
+#
+# OUTPUT FORMAT:
+#   Each file contains lines in the format:
+#   repo | npub | prod=<status> | archive=<status>   (status: complete, missing, or cat1..cat4)
+#
+# PREREQUISITES:
+#   - awk, sort, comm (standard Unix tools)
+#
+# RUNTIME: < 1 second (local processing only)
+#
+# SEE ALSO:
+#   docs/how-to/migrate-ngit-relay-to-ngit-grasp.md - Full migration guide
+#   20-categorize.sh - Phase 3a script that produces input for this script
+#
+
+set -euo pipefail
+
+# Colors for log output (every log_* helper writes to stderr, so test fd 2)
+if [[ -t 2 ]]; then
+    RED='\033[0;31m'
+    GREEN='\033[0;32m'
+    YELLOW='\033[0;33m'
+    BLUE='\033[0;34m'
+    NC='\033[0m'
+else
+    RED=''
+    GREEN=''
+    YELLOW=''
+    BLUE=''
+    NC=''
+fi
+
+log_info() {  # informational message, tagged [INFO], written to stderr
+    printf '%b\n' "${BLUE}[INFO]${NC} $*" >&2
+}
+
+log_success() {  # success message, tagged [OK], written to stderr
+    printf '%b\n' "${GREEN}[OK]${NC} $*" >&2
+}
+
+log_warn() {  # warning message, tagged [WARN], written to stderr
+    printf '%b\n' "${YELLOW}[WARN]${NC} $*" >&2
+}
+
+log_error() {  # error message, tagged [ERROR], written to stderr
+    printf '%b\n' "${RED}[ERROR]${NC} $*" >&2
+}
+
+usage() {
+    # Show the command-line help on stdout and terminate with status 1.
+    # Each printf argument becomes one output line; "" yields a blank line.
+    printf '%s\n' \
+        "Usage: $0 <prod-dir> <archive-dir> <output-dir>" "" \
+        "Arguments:" \
+        "  prod-dir     Directory containing prod category files" \
+        "  archive-dir  Directory containing archive category files" \
+        "  output-dir   Directory to store comparison results" "" \
+        "Examples:" \
+        "  $0 output/prod output/archive output/comparison" "" \
+        "Required input files in each directory:" \
+        "  category1-complete-match.txt" \
+        "  category2-empty-blank.txt" \
+        "  category3-partial-match.txt" \
+        "  category4-no-match.txt"
+    exit 1
+}
+
+# Reduce category lines to their lookup key.
+# Reads lines such as "repo | npub | state_refs=N | ..." on stdin and
+# writes "repo|npub" for each one (fields are split on " | ").
+extract_key() {
+    awk 'BEGIN { FS = " \\| " } { print $1 "|" $2 }'
+}
+
+# Build a sorted lookup table from the category files of one relay.
+# Args: $1=directory holding category{1..4}-*.txt, $2=output_file
+# Output: lines of repo|npub|catN|<original category line>, sorted on repo|npub.
+# Missing category files are skipped rather than treated as an error.
+build_lookup() {
+    local dir="$1"
+    local output="$2"
+    local n file
+
+    for n in 1 2 3 4; do
+        # Let the shell expand the glob; "$dir" stays quoted so spaces are safe.
+        for file in "$dir/category${n}-"*.txt; do
+            # An unmatched glob is left as literal text, which is not a file.
+            [[ -f "$file" ]] || continue
+            # One awk per file (not one per line) emits key, tag and the line.
+            awk -F' \\| ' -v tag="cat${n}" '{ print $1 "|" $2 "|" tag "|" $0 }' "$file"
+        done
+    done | sort -t'|' -k1,2 > "$output"
+}
+
+# Print the archive category (cat1..cat4) recorded for an exact repo/npub pair.
+# Args: $1=archive lookup file, $2=repo, $3=npub. Prints nothing when absent.
+# The key is matched as a literal line prefix, never as a regex, so characters
+# such as "." or "*" in a repo name cannot select a different repo.
+archive_category() {
+    LOOKUP_KEY="$2|$3|" awk -F'|' 'index($0, ENVIRON["LOOKUP_KEY"]) == 1 { print $3; exit }' "$1"
+}
+
+# main <prod-dir> <archive-dir> <output-dir>
+# Builds repo|npub lookup tables for both relays, classifies every prod entry
+# against the archive, lists archive-only entries, then writes summary.txt and
+# logs the counts. Exits 1 on bad arguments, a missing directory, or a
+# missing category1-complete-match.txt.
+main() {
+    if [[ $# -ne 3 ]]; then
+        usage
+    fi
+
+    local prod_dir="$1"
+    local archive_dir="$2"
+    local output_dir="$3"
+    local dir repo npub prod_cat archive_cat
+
+    # Validate input directories
+    for dir in "$prod_dir" "$archive_dir"; do
+        if [[ ! -d "$dir" ]]; then
+            log_error "Directory not found: $dir"
+            exit 1
+        fi
+        if [[ ! -f "$dir/category1-complete-match.txt" ]]; then
+            log_error "Missing category1-complete-match.txt in $dir"
+            exit 1
+        fi
+    done
+
+    log_info "Comparing relay categories"
+    log_info "Prod: $prod_dir"
+    log_info "Archive: $archive_dir"
+    log_info "Output: $output_dir"
+
+    # Create output directory
+    mkdir -p "$output_dir"
+
+    # Create temp files for processing
+    local tmp_dir
+    tmp_dir=$(mktemp -d)
+    # Double quotes on purpose: expand the local $tmp_dir now, not at exit time.
+    # shellcheck disable=SC2064
+    trap "rm -rf '$tmp_dir'" EXIT
+
+    log_info "Building lookup tables..."
+
+    # Build lookup tables: key|category|full_line
+    build_lookup "$prod_dir" "$tmp_dir/prod_lookup.txt"
+    build_lookup "$archive_dir" "$tmp_dir/archive_lookup.txt"
+
+    # Extract just keys for comparison
+    cut -d'|' -f1,2 "$tmp_dir/prod_lookup.txt" | sort -u > "$tmp_dir/prod_keys.txt"
+    cut -d'|' -f1,2 "$tmp_dir/archive_lookup.txt" | sort -u > "$tmp_dir/archive_keys.txt"
+
+    log_info "Comparing categories..."
+
+    # Initialize output files
+    : > "$output_dir/complete-in-both.txt"
+    : > "$output_dir/complete-prod-missing-archive.txt"
+    : > "$output_dir/complete-prod-incomplete-archive.txt"
+    : > "$output_dir/incomplete-in-both.txt"
+    : > "$output_dir/in-archive-not-prod.txt"
+
+    # Process prod category 1 (complete) entries
+    # Rows are selected on the category field itself (field 3 of the lookup).
+    while IFS='|' read -r repo npub _; do
+        archive_cat=$(archive_category "$tmp_dir/archive_lookup.txt" "$repo" "$npub")
+        if [[ -z "$archive_cat" ]]; then
+            # Not in archive at all
+            echo "$repo | $npub | prod=complete | archive=missing" >> "$output_dir/complete-prod-missing-archive.txt"
+        elif [[ "$archive_cat" == "cat1" ]]; then
+            # Complete in both
+            echo "$repo | $npub | prod=complete | archive=complete" >> "$output_dir/complete-in-both.txt"
+        else
+            # Complete in prod, incomplete in archive
+            echo "$repo | $npub | prod=complete | archive=$archive_cat" >> "$output_dir/complete-prod-incomplete-archive.txt"
+        fi
+    done < <(awk -F'|' '$3 == "cat1"' "$tmp_dir/prod_lookup.txt")
+
+    # Process prod categories 2-4 (incomplete) entries
+    for prod_cat in cat2 cat3 cat4; do
+        while IFS='|' read -r repo npub _; do
+            archive_cat=$(archive_category "$tmp_dir/archive_lookup.txt" "$repo" "$npub")
+            if [[ -z "$archive_cat" ]]; then
+                # Incomplete in prod, missing in archive
+                echo "$repo | $npub | prod=$prod_cat | archive=missing" >> "$output_dir/incomplete-in-both.txt"
+            elif [[ "$archive_cat" != "cat1" ]]; then
+                # Incomplete in both
+                echo "$repo | $npub | prod=$prod_cat | archive=$archive_cat" >> "$output_dir/incomplete-in-both.txt"
+            fi
+            # If archive is complete but prod is not, that's unusual but not an error
+        done < <(awk -F'|' -v c="$prod_cat" '$3 == c' "$tmp_dir/prod_lookup.txt")
+    done
+
+    # Find entries in archive but not in prod
+    # comm needs both key files in the same sort order; sort -u above provides it.
+    comm -23 "$tmp_dir/archive_keys.txt" "$tmp_dir/prod_keys.txt" | while IFS='|' read -r repo npub; do
+        archive_cat=$(archive_category "$tmp_dir/archive_lookup.txt" "$repo" "$npub")
+        echo "$repo | $npub | prod=missing | archive=$archive_cat" >> "$output_dir/in-archive-not-prod.txt"
+    done
+
+    # Count results
+    local count_both count_missing count_incomplete count_both_incomplete count_archive_only
+    count_both=$(wc -l < "$output_dir/complete-in-both.txt" | tr -d ' ')
+    count_missing=$(wc -l < "$output_dir/complete-prod-missing-archive.txt" | tr -d ' ')
+    count_incomplete=$(wc -l < "$output_dir/complete-prod-incomplete-archive.txt" | tr -d ' ')
+    count_both_incomplete=$(wc -l < "$output_dir/incomplete-in-both.txt" | tr -d ' ')
+    count_archive_only=$(wc -l < "$output_dir/in-archive-not-prod.txt" | tr -d ' ')
+
+    # Generate summary
+    cat > "$output_dir/summary.txt" << EOF
+# Relay Comparison Summary
+Generated: $(date -Iseconds)
+
+## Input
+- Prod: $prod_dir
+- Archive: $archive_dir
+
+## Results
+
+### No Action Required
+- Complete in both relays: $count_both
+
+### Action/Decision Required
+- Complete in prod, MISSING from archive: $count_missing
+- Complete in prod, INCOMPLETE in archive: $count_incomplete
+- Incomplete in BOTH relays: $count_both_incomplete
+
+### For Reference
+- In archive but not in prod: $count_archive_only
+
+## Files
+- complete-in-both.txt: Repos successfully migrated (no action)
+- complete-prod-missing-archive.txt: Need investigation - why not in archive?
+- complete-prod-incomplete-archive.txt: Archive sync may still be in progress
+- incomplete-in-both.txt: Git data incomplete on both relays
+- in-archive-not-prod.txt: May be deleted from prod or new to archive
+
+## Next Steps
+1. Review complete-prod-missing-archive.txt - these repos need attention
+2. Check if archive sync is still running for incomplete entries
+3. Cross-reference with deletion events (kind 5) from Phase 1
+4. Use Phase 4 logs to understand parse failures and purgatory expiry
+EOF
+
+    # Display summary
+    echo ""
+    log_info "=== Comparison Summary ==="
+    log_success "Complete in both: $count_both (no action needed)"
+    log_error "Complete in prod, MISSING from archive: $count_missing"
+    log_warn "Complete in prod, incomplete in archive: $count_incomplete"
+    log_warn "Incomplete in both: $count_both_incomplete"
+    log_info "In archive only: $count_archive_only"
+    echo ""
+    log_info "Output files:"
+    echo "  $output_dir/complete-in-both.txt"
+    echo "  $output_dir/complete-prod-missing-archive.txt"
+    echo "  $output_dir/complete-prod-incomplete-archive.txt"
+    echo "  $output_dir/incomplete-in-both.txt"
+    echo "  $output_dir/in-archive-not-prod.txt"
+    echo "  $output_dir/summary.txt"
+}
+
+main "$@"
-- 
cgit v1.2.3