#!/usr/bin/env bash
#
# 20-categorize.sh - Categorize git sync status into 4 categories
#
# PHASE 3a of the ngit-relay to ngit-grasp migration analysis pipeline.
# Takes git-sync-status.tsv from Phase 2 and categorizes into 4 files.
#
# PROVENANCE:
#   Recovered from commit a5504395c946bdf28b5ad0e0148ff371ca33d4d3
#   (DanConwayDev, Fri, 23 Jan 2026 11:06:12 +0000),
#   "Add Phase 3 migration scripts for categorization and comparison":
#     - 20-categorize.sh: Categorizes git sync status into 4 categories
#     - 21-compare-relays.sh: Compares prod vs archive to find gaps
#     - Updated how-to doc with detailed Phase 3 outputs and directory structure
#     - Tested with Jan 22 data: 231 complete in both, 276 complete in prod
#       but missing from archive
#   File: docs/how-to/migration-scripts/20-categorize.sh (new file, mode 100755)
#
# USAGE:
#   ./20-categorize.sh <git-sync-status.tsv> <output-dir>
#
# EXAMPLES:
#   ./20-categorize.sh output/prod/git-sync-status.tsv output/prod
#   ./20-categorize.sh output/archive/git-sync-status.tsv output/archive
#
# INPUT FORMAT (git-sync-status.tsv):
#   Tab-separated values with columns:
#     repo<TAB>npub<TAB>state_refs<TAB>git_refs<TAB>matches<TAB>reason
#
#   Where reason is optional and can be: no_git_dir, empty_refs, no_state_refs
#   A first line starting with "repo" whose state_refs column is not a number
#   is treated as a header and skipped.
#
# OUTPUT:
#   <output-dir>/category1-complete-match.txt - All refs match perfectly
#   <output-dir>/category2-empty-blank.txt    - No git data available
#   <output-dir>/category3-partial-match.txt  - Some refs match
#   <output-dir>/category4-no-match.txt       - Git exists but refs don't match
#
# OUTPUT FORMAT:
#   repo | npub | state_refs=N | git_refs=N | matches=N [| reason=X]
#
# CATEGORIES (evaluated in this order; first match wins):
#   2. Empty/Blank:    git_refs == 0 OR reason in (no_git_dir, empty_refs, no_state_refs)
#   1. Complete Match: state_refs == git_refs == matches (all > 0)
#   3. Partial Match:  matches > 0 AND matches < state_refs
#   4. No Match:       git_refs > 0 AND matches == 0
#   Any remaining row with matches > 0 is filed as Partial Match; anything
#   else falls back to Empty/Blank.
#
# PREREQUISITES:
#   - awk (standard Unix tool)
#
# RUNTIME: < 1 second (local processing only)
#
# SEE ALSO:
#   docs/how-to/migrate-ngit-relay-to-ngit-grasp.md - Full migration guide
#   10-check-git-sync.sh - Phase 2 script that produces input for this script
#

set -euo pipefail

# Colors for log output. All log_* helpers write to stderr, so the terminal
# check is made on fd 2 (not fd 1) to keep escape codes out of redirected logs.
if [[ -t 2 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BLUE=''
  NC=''
fi

# Logging helpers: print a tagged message to stderr.
# %b expands the color escapes; the message itself goes through %s so that
# backslashes in paths or repo names are printed verbatim.
log_info() {
  printf '%b %s\n' "${BLUE}[INFO]${NC}" "$*" >&2
}

log_success() {
  printf '%b %s\n' "${GREEN}[OK]${NC}" "$*" >&2
}

log_warn() {
  printf '%b %s\n' "${YELLOW}[WARN]${NC}" "$*" >&2
}

log_error() {
  printf '%b %s\n' "${RED}[ERROR]${NC}" "$*" >&2
}

# Print the command-line help to stdout and exit with status 1.
usage() {
  echo "Usage: $0 <git-sync-status.tsv> <output-dir>"
  echo ""
  echo "Arguments:"
  echo "  git-sync-status.tsv  TSV file from Phase 2 (10-check-git-sync.sh)"
  echo "  output-dir           Directory to store categorized output"
  echo ""
  echo "Examples:"
  echo "  $0 output/prod/git-sync-status.tsv output/prod"
  echo "  $0 output/archive/git-sync-status.tsv output/archive"
  echo ""
  echo "Input format (TSV):"
  echo "  repo<TAB>npub<TAB>state_refs<TAB>git_refs<TAB>matches<TAB>reason"
  echo ""
  echo "Output files:"
  echo "  category1-complete-match.txt  - All refs match"
  echo "  category2-empty-blank.txt     - No git data"
  echo "  category3-partial-match.txt   - Some refs match"
  echo "  category4-no-match.txt        - Git exists, refs don't match"
  exit 1
}

# Entry point.
# Arguments: $1 - path to git-sync-status.tsv
#            $2 - output directory (created if missing)
# Outputs:   four category files in $2; summary on stderr, file list on stdout
# Exits:     1 on bad arguments or a missing input file; non-zero if awk fails
main() {
  if [[ $# -ne 2 ]]; then
    usage
  fi

  local input_file="$1"
  local output_dir="$2"

  # Validate input file
  if [[ ! -f "$input_file" ]]; then
    log_error "Input file not found: $input_file"
    exit 1
  fi

  log_info "Categorizing git sync status"
  log_info "Input: $input_file"
  log_info "Output: $output_dir"

  # Create output directory
  mkdir -p "$output_dir"

  # Output files
  local cat1="$output_dir/category1-complete-match.txt"
  local cat2="$output_dir/category2-empty-blank.txt"
  local cat3="$output_dir/category3-partial-match.txt"
  local cat4="$output_dir/category4-no-match.txt"

  # Clear previous results so every category file exists even when empty
  : > "$cat1"
  : > "$cat2"
  : > "$cat3"
  : > "$cat4"

  # Single awk pass: writes each row to its category file and prints one
  # "c1:c2:c3:c4:total:skipped" line on stdout. stderr is left untouched so
  # awk diagnostics reach the user, and a failing awk aborts under set -e.
  local counts
  counts=$(
    awk -F'\t' -v cat1="$cat1" -v cat2="$cat2" -v cat3="$cat3" -v cat4="$cat4" '
      BEGIN {
        count1 = 0; count2 = 0; count3 = 0; count4 = 0; skipped = 0
      }

      # Skip a header row if present. Requiring a non-numeric state_refs
      # column keeps a real first data row whose repo name starts with "repo".
      NR == 1 && /^repo/ && $3 !~ /^[0-9]+$/ { next }

      # Rows with too few columns cannot be categorized; count them so the
      # caller can report them instead of dropping them silently.
      NF < 5 {
        if ($0 != "") skipped++
        next
      }

      {
        repo = $1
        npub = $2
        state_refs = int($3)
        git_refs = int($4)
        matches = int($5)
        reason = (NF >= 6) ? $6 : ""

        # Format output line
        line = repo " | " npub " | state_refs=" state_refs " | git_refs=" git_refs " | matches=" matches
        if (reason != "") {
          line = line " | reason=" reason
        }

        # Categorize
        if (reason == "no_git_dir" || reason == "empty_refs" || reason == "no_state_refs" || git_refs == 0) {
          # Category 2: Empty/Blank
          print line >> cat2
          count2++
        } else if (state_refs > 0 && state_refs == git_refs && matches == state_refs) {
          # Category 1: Complete Match
          print line >> cat1
          count1++
        } else if (matches > 0 && matches < state_refs) {
          # Category 3: Partial Match
          print line >> cat3
          count3++
        } else if (git_refs > 0 && matches == 0) {
          # Category 4: No Match
          print line >> cat4
          count4++
        } else if (matches > 0) {
          # Edge case: matches > 0 but does not fit other categories.
          # This can happen when git_refs > state_refs but all state refs match.
          # Treat as partial match.
          print line >> cat3
          count3++
        } else {
          # Fallback: treat as category 2
          print line >> cat2
          count2++
        }
      }

      END {
        total = count1 + count2 + count3 + count4
        print count1 ":" count2 ":" count3 ":" count4 ":" total ":" skipped
      }
    ' "$input_file"
  )

  local c1 c2 c3 c4 total skipped
  IFS=':' read -r c1 c2 c3 c4 total skipped <<< "$counts"

  echo ""
  log_info "=== Categorization Summary ==="
  log_info "Total entries: $total"
  log_success "Category 1 (Complete Match): $c1"
  log_warn "Category 2 (Empty/Blank): $c2"
  log_warn "Category 3 (Partial Match): $c3"
  log_error "Category 4 (No Match): $c4"
  if (( skipped > 0 )); then
    log_warn "Skipped malformed lines (fewer than 5 tab-separated fields): $skipped"
  fi
  echo ""
  log_info "Output files:"
  echo "  $cat1"
  echo "  $cat2"
  echo "  $cat3"
  echo "  $cat4"
}

main "$@"
#!/usr/bin/env bash
#
# 21-compare-relays.sh - Compare prod vs archive category files to find gaps
#
# PHASE 3b of the ngit-relay to ngit-grasp migration analysis pipeline.
# Compares categorized output from prod and archive to identify:
#   - Repos complete in prod but missing/incomplete in archive
#   - Repos in archive but not in prod
#   - Status differences between relays
#
# PROVENANCE:
#   File: docs/how-to/migration-scripts/21-compare-relays.sh
#   (new file, mode 100755, blob 6b40dc8; patch rendered by cgit v1.2.3)
#
# USAGE:
#   ./21-compare-relays.sh <prod-dir> <archive-dir> <output-dir>
#
# EXAMPLES:
#   ./21-compare-relays.sh output/prod output/archive output/comparison
#
# INPUT:
#   Both prod-dir and archive-dir must contain:
#     - category1-complete-match.txt
#     - category2-empty-blank.txt
#     - category3-partial-match.txt
#     - category4-no-match.txt
#   A missing category1 file is fatal. A missing category2-4 file produces a
#   warning, and its repos are treated as absent from that relay.
#
# OUTPUT:
#   <output-dir>/complete-in-both.txt                 - Repos complete in both relays (no action)
#   <output-dir>/complete-prod-missing-archive.txt    - Complete in prod, not in archive at all
#   <output-dir>/complete-prod-incomplete-archive.txt - Complete in prod, incomplete in archive
#   <output-dir>/incomplete-in-both.txt               - Incomplete in prod and incomplete or
#                                                       missing in archive
#   <output-dir>/in-archive-not-prod.txt              - In archive but not in prod
#   <output-dir>/summary.txt                          - Human-readable summary
#
#   Repos that are incomplete in prod but complete in archive are not written
#   to any file.
#
# OUTPUT FORMAT:
#   Each file contains lines in the format:
#     repo | npub | prod=<status> | archive=<status>
#
# PREREQUISITES:
#   - awk, sort, wc, tr, mktemp (standard Unix tools)
#
# RUNTIME: < 1 second (local processing only)
#
# SEE ALSO:
#   docs/how-to/migrate-ngit-relay-to-ngit-grasp.md - Full migration guide
#   20-categorize.sh - Phase 3a script that produces input for this script
#

set -euo pipefail

# Colors for log output. All log_* helpers write to stderr, so the terminal
# check is made on fd 2 (not fd 1) to keep escape codes out of redirected logs.
if [[ -t 2 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BLUE=''
  NC=''
fi

# Logging helpers: print a tagged message to stderr.
# %b expands the color escapes; the message itself goes through %s so that
# backslashes in paths or repo names are printed verbatim.
log_info() {
  printf '%b %s\n' "${BLUE}[INFO]${NC}" "$*" >&2
}

log_success() {
  printf '%b %s\n' "${GREEN}[OK]${NC}" "$*" >&2
}

log_warn() {
  printf '%b %s\n' "${YELLOW}[WARN]${NC}" "$*" >&2
}

log_error() {
  printf '%b %s\n' "${RED}[ERROR]${NC}" "$*" >&2
}

# Print the command-line help to stdout and exit with status 1.
usage() {
  echo "Usage: $0 <prod-dir> <archive-dir> <output-dir>"
  echo ""
  echo "Arguments:"
  echo "  prod-dir     Directory containing prod category files"
  echo "  archive-dir  Directory containing archive category files"
  echo "  output-dir   Directory to store comparison results"
  echo ""
  echo "Examples:"
  echo "  $0 output/prod output/archive output/comparison"
  echo ""
  echo "Required input files in each directory:"
  echo "  category1-complete-match.txt"
  echo "  category2-empty-blank.txt"
  echo "  category3-partial-match.txt"
  echo "  category4-no-match.txt"
  exit 1
}

# Build a lookup table from the category files of one relay.
# Arguments: $1 - directory holding category<N>-*.txt files
#            $2 - lookup file to write
# Output:    one "repo|npub|cat<N>|<original line>" row per non-empty input
#            line, sorted on the repo|npub key.
build_lookup() {
  local dir="$1"
  local output="$2"
  local n file

  for n in 1 2 3 4; do
    for file in "$dir"/category"${n}"-*.txt; do
      # An unmatched glob stays literal; skip it rather than fail.
      [[ -f "$file" ]] || continue
      # Category lines look like "repo | npub | state_refs=N | ...";
      # one awk per file replaces a per-line echo|awk pipeline.
      awk -F' \\| ' -v tag="cat${n}" \
        '$0 != "" { print $1 "|" $2 "|" tag "|" $0 }' "$file"
    done
  done | sort -t'|' -k1,2 > "$output"
}

# Entry point.
# Arguments: $1 - prod category directory
#            $2 - archive category directory
#            $3 - output directory (created if missing)
# Outputs:   five comparison files plus summary.txt in $3; summary on stderr,
#            file list on stdout
# Exits:     1 on bad arguments, a missing directory or a missing category1 file
main() {
  if [[ $# -ne 3 ]]; then
    usage
  fi

  local prod_dir="$1"
  local archive_dir="$2"
  local output_dir="$3"

  # Validate input directories
  local dir name
  for dir in "$prod_dir" "$archive_dir"; do
    if [[ ! -d "$dir" ]]; then
      log_error "Directory not found: $dir"
      exit 1
    fi
    if [[ ! -f "$dir/category1-complete-match.txt" ]]; then
      log_error "Missing category1-complete-match.txt in $dir"
      exit 1
    fi
    # The remaining files are documented as required; without them their
    # repos silently look absent from this relay, so at least say so.
    for name in category2-empty-blank.txt category3-partial-match.txt category4-no-match.txt; do
      if [[ ! -f "$dir/$name" ]]; then
        log_warn "Missing $name in $dir - its repos will be treated as absent from this relay"
      fi
    done
  done

  log_info "Comparing relay categories"
  log_info "Prod: $prod_dir"
  log_info "Archive: $archive_dir"
  log_info "Output: $output_dir"

  # Create output directory
  mkdir -p "$output_dir"

  # Create temp dir for the lookup tables.
  # tmp_dir is local and is out of scope when the EXIT trap fires, so the
  # path is expanded into the trap string now (hence the SC2064 waiver).
  local tmp_dir
  tmp_dir=$(mktemp -d)
  # shellcheck disable=SC2064
  trap "rm -rf -- '$tmp_dir'" EXIT

  log_info "Building lookup tables..."

  # Lookup tables: repo|npub|category|full_line
  build_lookup "$prod_dir" "$tmp_dir/prod_lookup.txt"
  build_lookup "$archive_dir" "$tmp_dir/archive_lookup.txt"

  log_info "Comparing categories..."

  # Initialize output files so each exists even when it receives no rows
  : > "$output_dir/complete-in-both.txt"
  : > "$output_dir/complete-prod-missing-archive.txt"
  : > "$output_dir/complete-prod-incomplete-archive.txt"
  : > "$output_dir/incomplete-in-both.txt"
  : > "$output_dir/in-archive-not-prod.txt"

  # One awk pass joins the two lookup tables on the exact repo|npub string.
  # This replaces a grep per entry, which treated the key as a regular
  # expression and spawned a process for every repo.
  # The output directory is passed through the environment because awk -v
  # would interpret backslash escapes in the path.
  # "side=..." operands switch the role before each file is read, which stays
  # correct even when the archive lookup is empty.
  COMPARE_OUT_DIR="$output_dir" awk -F'|' '
    BEGIN {
      out = ENVIRON["COMPARE_OUT_DIR"]
      f_both       = out "/complete-in-both.txt"
      f_missing    = out "/complete-prod-missing-archive.txt"
      f_incomplete = out "/complete-prod-incomplete-archive.txt"
      f_inc_both   = out "/incomplete-in-both.txt"
      f_arch_only  = out "/in-archive-not-prod.txt"
      n_archive = 0
    }

    # Archive rows: remember the first category seen for each key. The
    # lookup is sorted, so this is the same row that head -1 used to pick.
    side == "archive" {
      key = $1 "|" $2
      if (!(key in archive_cat)) {
        archive_cat[key] = $3
        n_archive++
        archive_key[n_archive] = key
        archive_repo[n_archive] = $1
        archive_npub[n_archive] = $2
      }
      next
    }

    side == "prod" {
      key = $1 "|" $2
      in_prod[key] = 1
      found = (key in archive_cat)
      status = found ? archive_cat[key] : "missing"

      if ($3 == "cat1") {
        if (!found) {
          # Not in archive at all
          print $1 " | " $2 " | prod=complete | archive=missing" > f_missing
        } else if (status == "cat1") {
          # Complete in both
          print $1 " | " $2 " | prod=complete | archive=complete" > f_both
        } else {
          # Complete in prod, incomplete in archive
          print $1 " | " $2 " | prod=complete | archive=" status > f_incomplete
        }
      } else if ($3 == "cat2" || $3 == "cat3" || $3 == "cat4") {
        # Incomplete in prod. Archive complete is unusual but not an error
        # and is not reported. Rows are buffered per category so the file
        # keeps its cat2, cat3, cat4 grouping.
        if (status != "cat1") {
          n_pending[$3]++
          pending[$3, n_pending[$3]] = $1 " | " $2 " | prod=" $3 " | archive=" status
        }
      }
    }

    END {
      split("cat2 cat3 cat4", order, " ")
      for (i = 1; i <= 3; i++) {
        c = order[i]
        for (j = 1; j <= n_pending[c]; j++) {
          print pending[c, j] > f_inc_both
        }
      }

      # Entries in archive but not in prod
      for (i = 1; i <= n_archive; i++) {
        key = archive_key[i]
        if (!(key in in_prod)) {
          print archive_repo[i] " | " archive_npub[i] " | prod=missing | archive=" archive_cat[key] > f_arch_only
        }
      }
    }
  ' side=archive "$tmp_dir/archive_lookup.txt" side=prod "$tmp_dir/prod_lookup.txt"

  # Count results
  local count_both count_missing count_incomplete count_both_incomplete count_archive_only
  count_both=$(wc -l < "$output_dir/complete-in-both.txt" | tr -d ' ')
  count_missing=$(wc -l < "$output_dir/complete-prod-missing-archive.txt" | tr -d ' ')
  count_incomplete=$(wc -l < "$output_dir/complete-prod-incomplete-archive.txt" | tr -d ' ')
  count_both_incomplete=$(wc -l < "$output_dir/incomplete-in-both.txt" | tr -d ' ')
  count_archive_only=$(wc -l < "$output_dir/in-archive-not-prod.txt" | tr -d ' ')

  # Generate summary
  cat > "$output_dir/summary.txt" << EOF
# Relay Comparison Summary
Generated: $(date -Iseconds)

## Input
- Prod: $prod_dir
- Archive: $archive_dir

## Results

### No Action Required
- Complete in both relays: $count_both

### Action/Decision Required
- Complete in prod, MISSING from archive: $count_missing
- Complete in prod, INCOMPLETE in archive: $count_incomplete
- Incomplete in BOTH relays: $count_both_incomplete

### For Reference
- In archive but not in prod: $count_archive_only

## Files
- complete-in-both.txt: Repos successfully migrated (no action)
- complete-prod-missing-archive.txt: Need investigation - why not in archive?
- complete-prod-incomplete-archive.txt: Archive sync may still be in progress
- incomplete-in-both.txt: Git data incomplete on both relays
- in-archive-not-prod.txt: May be deleted from prod or new to archive

## Next Steps
1. Review complete-prod-missing-archive.txt - these repos need attention
2. Check if archive sync is still running for incomplete entries
3. Cross-reference with deletion events (kind 5) from Phase 1
4. Use Phase 4 logs to understand parse failures and purgatory expiry
EOF

  # Display summary
  echo ""
  log_info "=== Comparison Summary ==="
  log_success "Complete in both: $count_both (no action needed)"
  log_error "Complete in prod, MISSING from archive: $count_missing"
  log_warn "Complete in prod, incomplete in archive: $count_incomplete"
  log_warn "Incomplete in both: $count_both_incomplete"
  log_info "In archive only: $count_archive_only"
  echo ""
  log_info "Output files:"
  echo "  $output_dir/complete-in-both.txt"
  echo "  $output_dir/complete-prod-missing-archive.txt"
  echo "  $output_dir/complete-prod-incomplete-archive.txt"
  echo "  $output_dir/incomplete-in-both.txt"
  echo "  $output_dir/in-archive-not-prod.txt"
  echo "  $output_dir/summary.txt"
}

main "$@"