diff options
Diffstat (limited to 'docs/archive/2026-01-relay-ngit-dev-migration/scripts/22-compare-git-data.sh')
| -rwxr-xr-x | docs/archive/2026-01-relay-ngit-dev-migration/scripts/22-compare-git-data.sh | 390 |
1 files changed, 390 insertions, 0 deletions
diff --git a/docs/archive/2026-01-relay-ngit-dev-migration/scripts/22-compare-git-data.sh b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/22-compare-git-data.sh new file mode 100755 index 0000000..76521d4 --- /dev/null +++ b/docs/archive/2026-01-relay-ngit-dev-migration/scripts/22-compare-git-data.sh | |||
| @@ -0,0 +1,390 @@ | |||
| 1 | #!/usr/bin/env bash | ||
| 2 | # | ||
| 3 | # 22-compare-git-data.sh - Compare actual git data between prod and archive relays | ||
| 4 | # | ||
| 5 | # PHASE 3c of the GRASP relay to ngit-grasp migration analysis pipeline. | ||
| 6 | # Compares actual git commits between prod and archive to determine which is ahead. | ||
| 7 | # | ||
| 8 | # KEY INSIGHT: | ||
| 9 | # Archive (ngit-grasp) enforces GRASP - git data ALWAYS matches a state event. | ||
| 10 | # If archive has different/newer data than prod, it means: | ||
| 11 | # - A state event authorized those commits at some point | ||
| 12 | # - Archive is actually MORE up-to-date than prod | ||
| 13 | # - Migration should use archive data (it's already correct) | ||
| 14 | # | ||
| 15 | # USAGE: | ||
| 16 | # ./22-compare-git-data.sh <prod-git-base> <archive-git-base> <repo-list> <output-dir> | ||
| 17 | # | ||
| 18 | # EXAMPLES: | ||
| 19 | # ./22-compare-git-data.sh /var/lib/grasp-relay/git /var/lib/ngit-grasp/git \ | ||
| 20 | # output/comparison/complete-prod-incomplete-archive.txt output/comparison | ||
| 21 | # | ||
| 22 | # INPUT: | ||
| 23 | # prod-git-base Base directory for prod git repos (e.g., /var/lib/grasp-relay/git) | ||
| 24 | # archive-git-base Base directory for archive git repos (e.g., /var/lib/ngit-grasp/git) | ||
| 25 | # repo-list File with repos to compare (format: "repo | npub | ...") | ||
| 26 | # | ||
| 27 | # OUTPUT: | ||
| 28 | # <output-dir>/git-ancestry.tsv - Tab-separated values: | ||
| 29 | # repo<TAB>npub<TAB>relationship<TAB>details | ||
| 30 | # | ||
| 31 | # Relationship values: | ||
| 32 | # archive-ahead - Archive has all prod commits plus more (GOOD - use archive) | ||
| 33 | # in-sync - Both have identical commits | ||
| 34 | # prod-ahead - Prod has commits archive is missing (needs re-sync) | ||
| 35 | # diverged - Both have unique commits (manual review) | ||
| 36 | # archive-only - Only archive has git data | ||
| 37 | # prod-only - Only prod has git data | ||
| 38 | # both-empty - Neither has git data | ||
| 39 | # | ||
| 40 | # PREREQUISITES: | ||
| 41 | # - git (for ref comparison) | ||
| 42 | # - Read access to both git directories (may need sudo) | ||
| 43 | # | ||
| 44 | # RUNTIME: Depends on number of repos to compare | ||
| 45 | # | ||
| 46 | # SEE ALSO: | ||
| 47 | # docs/how-to/migrate-to-ngit-grasp.md - Full migration guide | ||
| 48 | # 21-compare-relays.sh - Phase 3b script that identifies repos to compare | ||
| 49 | # | ||
| 50 | |||
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# ANSI color escapes used by the log_* helpers.
# They default to empty strings and are only filled in when stdout is a
# terminal, so redirected output stays free of escape sequences.
RED=''
GREEN=''
YELLOW=''
BLUE=''
NC=''
if [[ -t 1 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
fi
| 67 | |||
# Write an informational message to stderr, prefixed with a [INFO] tag.
# Args: $* - message text (backslash escapes in it are expanded)
log_info() {
  local message="$*"
  printf '%b\n' "${BLUE}[INFO]${NC} ${message}" >&2
}
| 71 | |||
# Write a success message to stderr, prefixed with a [OK] tag.
# Args: $* - message text (backslash escapes in it are expanded)
log_success() {
  local message="$*"
  printf '%b\n' "${GREEN}[OK]${NC} ${message}" >&2
}
| 75 | |||
# Write a warning message to stderr, prefixed with a [WARN] tag.
# Args: $* - message text (backslash escapes in it are expanded)
log_warn() {
  local message="$*"
  printf '%b\n' "${YELLOW}[WARN]${NC} ${message}" >&2
}
| 79 | |||
# Write an error message to stderr, prefixed with a [ERROR] tag.
# Args: $* - message text (backslash escapes in it are expanded)
log_error() {
  local message="$*"
  printf '%b\n' "${RED}[ERROR]${NC} ${message}" >&2
}
| 83 | |||
# Write a progress message to stderr that overwrites the current line:
# it starts with a carriage return and has no trailing newline.
# Args: $* - message text (backslash escapes in it are expanded)
log_progress() {
  local message="$*"
  printf '%b' "\r${BLUE}[PROGRESS]${NC} ${message}" >&2
}
| 87 | |||
# Print the command-line help text to stdout and terminate with status 1.
# Takes no arguments; $0 is interpolated as the program name.
usage() {
  printf '%s\n' \
    "Usage: $0 <prod-git-base> <archive-git-base> <repo-list> <output-dir>" \
    "" \
    "Arguments:" \
    " prod-git-base Base directory for prod git repos" \
    " archive-git-base Base directory for archive git repos" \
    " repo-list File with repos to compare (format: 'repo | npub | ...')" \
    " output-dir Directory to store output files" \
    "" \
    "Examples:" \
    " $0 /var/lib/grasp-relay/git /var/lib/ngit-grasp/git \\" \
    " output/comparison/complete-prod-incomplete-archive.txt output/comparison" \
    "" \
    "Output:" \
    " git-ancestry.tsv - TSV with: repo, npub, relationship, details"
  exit 1
}
| 105 | |||
# List the branch heads of a bare git directory.
# Args: $1=git_dir
# Outputs: `git show-ref --heads` lines ("<commit_hash> <ref_name>") passed
#          through sort, so they are ordered by the leading hash.
#          Prints nothing when the directory is missing or git reports no heads.
# Returns: always 0 - git errors are deliberately swallowed so an unreadable
#          or empty repository looks the same as one with no branches.
get_git_refs() {
  local repo_dir="$1"

  # Nothing to list for a directory that does not exist.
  [[ -d "$repo_dir" ]] || return 0

  git --git-dir="$repo_dir" show-ref --heads 2>/dev/null | sort || true
}
| 118 | |||
# Test whether commit A is an ancestor of commit B inside one repository.
# Args: $1=git_dir, $2=commit_a, $3=commit_b
# Returns: 0 when A is an ancestor of B; non-zero otherwise. The status is
#          passed through from `git merge-base --is-ancestor`, so a commit
#          that is missing from git_dir also yields non-zero.
#          git's own error output is discarded.
is_ancestor() {
  local repo_dir="$1" older="$2" newer="$3"

  git --git-dir="$repo_dir" merge-base --is-ancestor "$older" "$newer" 2>/dev/null
}
| 129 | |||
# Look up the commit hash recorded for one ref in show-ref style output.
# Args: $1=refs ("<hash> <ref>" lines), $2=ref_name (compared literally, not
#       as a pattern)
# Outputs: the hash of the first matching line, or nothing when absent
# Returns: always 0
_find_ref_hash() {
  local refs="$1" wanted="$2"
  local hash ref

  while read -r hash ref; do
    if [[ "$ref" == "$wanted" ]]; then
      printf '%s\n' "$hash"
      return 0
    fi
  done <<< "$refs"
  return 0
}

# Check ancestry in whichever of two repositories is able to answer.
# A repository can only answer when it contains both commits, so the first
# repo is tried and the second is used as a fallback.
# Args: $1=first_git_dir, $2=second_git_dir, $3=commit_a, $4=commit_b
# Returns: 0 if either repo confirms A is an ancestor of B, non-zero otherwise
_is_ancestor_in_either() {
  is_ancestor "$1" "$3" "$4" || is_ancestor "$2" "$3" "$4"
}

# Compare git data between prod and archive for a single repo
# Args: $1=prod_git_dir, $2=archive_git_dir
# Outputs: one relationship string on stdout:
#   both-empty     neither directory exists / neither has branch heads
#   archive-only   only archive exists or has branch heads
#   prod-only      only prod exists or has branch heads
#   in-sync        identical heads, or all shared branches identical and the
#                  archive has no extra branches
#   archive-ahead  archive is ahead on at least one branch (or has extra
#                  branches) and prod is ahead on none
#   prod-ahead     prod is ahead on at least one shared branch and the archive
#                  is ahead on none
#   diverged       no shared branch, or the branches disagree in both directions
# Notes:
#   - Branches that exist only in prod are ignored (they may be deletions).
#   - Branches that exist only in archive count as "archive has more".
compare_repo_git() {
  local prod_git="$1"
  local archive_git="$2"

  local prod_exists=false
  local archive_exists=false

  if [[ -d "$prod_git" ]]; then
    prod_exists=true
  fi
  if [[ -d "$archive_git" ]]; then
    archive_exists=true
  fi

  # Handle cases where one or both don't exist
  if [[ "$prod_exists" == "false" && "$archive_exists" == "false" ]]; then
    echo "both-empty"
    return
  fi

  if [[ "$prod_exists" == "false" ]]; then
    echo "archive-only"
    return
  fi

  if [[ "$archive_exists" == "false" ]]; then
    echo "prod-only"
    return
  fi

  # Both exist - get refs
  local prod_refs archive_refs
  prod_refs=$(get_git_refs "$prod_git")
  archive_refs=$(get_git_refs "$archive_git")

  # Handle empty refs
  if [[ -z "$prod_refs" && -z "$archive_refs" ]]; then
    echo "both-empty"
    return
  fi

  if [[ -z "$prod_refs" ]]; then
    echo "archive-only"
    return
  fi

  if [[ -z "$archive_refs" ]]; then
    echo "prod-only"
    return
  fi

  # Compare refs - check if they're identical
  if [[ "$prod_refs" == "$archive_refs" ]]; then
    echo "in-sync"
    return
  fi

  # Refs differ - need to check ancestry branch by branch.
  # Both flags start true and are cleared as soon as a branch contradicts them.
  local archive_ahead=true
  local prod_ahead=true
  local has_common_branch=false
  local prod_hash prod_ref archive_hash archive_ref

  # Check each prod branch against archive
  while read -r prod_hash prod_ref; do
    [[ -z "$prod_hash" ]] && continue

    # Get the same branch from archive (exact name match; ref names may
    # contain regex metacharacters such as '.')
    archive_hash=$(_find_ref_hash "$archive_refs" "$prod_ref")

    if [[ -z "$archive_hash" ]]; then
      # Branch exists in prod but not archive. This could be a deleted
      # branch, so it is not treated as evidence that prod is ahead.
      continue
    fi

    has_common_branch=true

    if [[ "$prod_hash" == "$archive_hash" ]]; then
      # Same commit - neither ahead for this branch
      continue
    fi

    # Different commits - check ancestry.
    # Each question is asked first in the repo that owns the newer tip,
    # because only that repo is guaranteed to contain both commits. Asking
    # the archive about a prod-only commit just errors out, which would
    # misreport a prod-ahead repo as diverged.
    if _is_ancestor_in_either "$archive_git" "$prod_git" "$prod_hash" "$archive_hash"; then
      # Prod commit is ancestor of archive commit - archive is ahead for this branch
      prod_ahead=false
    elif _is_ancestor_in_either "$prod_git" "$archive_git" "$archive_hash" "$prod_hash"; then
      # Archive commit is ancestor of prod commit - prod is ahead for this branch
      archive_ahead=false
    else
      # Neither is ancestor - diverged
      archive_ahead=false
      prod_ahead=false
    fi
  done <<< "$prod_refs"

  # Also check for branches only in archive (archive has extra branches)
  while read -r archive_hash archive_ref; do
    [[ -z "$archive_hash" ]] && continue

    prod_hash=$(_find_ref_hash "$prod_refs" "$archive_ref")

    if [[ -z "$prod_hash" ]]; then
      # Branch exists in archive but not prod - archive has something prod
      # doesn't, so prod cannot be the side that is ahead.
      prod_ahead=false
    fi
  done <<< "$archive_refs"

  # Determine final relationship
  if [[ "$has_common_branch" == "false" ]]; then
    # No common branches - completely different
    echo "diverged"
    return
  fi

  if [[ "$archive_ahead" == "true" && "$prod_ahead" == "false" ]]; then
    echo "archive-ahead"
  elif [[ "$prod_ahead" == "true" && "$archive_ahead" == "false" ]]; then
    echo "prod-ahead"
  elif [[ "$archive_ahead" == "true" && "$prod_ahead" == "true" ]]; then
    # Both true means all common branches are identical; prod may still
    # have extra branches, which are ignored above.
    echo "in-sync"
  else
    echo "diverged"
  fi
}
| 269 | |||
# Main entry point: validate the arguments, compare every repo named in the
# repo list, write the TSV report and log a per-relationship summary.
# Args: $1=prod_git_base, $2=archive_git_base, $3=repo_list, $4=output_dir
# Outputs: <output_dir>/git-ancestry.tsv (header line plus one line per repo);
#          status messages through the log_* helpers
# Exits: 1 when the argument count is wrong (via usage) or an input path is
#        missing
main() {
  if [[ $# -ne 4 ]]; then
    usage
  fi

  local prod_git_base="$1"
  local archive_git_base="$2"
  local repo_list="$3"
  local output_dir="$4"

  # Validate inputs
  if [[ ! -d "$prod_git_base" ]]; then
    log_error "Prod git base directory not found: $prod_git_base"
    exit 1
  fi

  if [[ ! -d "$archive_git_base" ]]; then
    log_error "Archive git base directory not found: $archive_git_base"
    exit 1
  fi

  if [[ ! -f "$repo_list" ]]; then
    log_error "Repo list file not found: $repo_list"
    exit 1
  fi

  log_info "=== Git Data Comparison ==="
  log_info "Prod git base: $prod_git_base"
  log_info "Archive git base: $archive_git_base"
  log_info "Repo list: $repo_list"
  log_info "Output: $output_dir"
  log_info "Started: $(date)"
  echo ""

  # Create output directory
  mkdir -p "$output_dir"

  # Output file
  local tsv_file="$output_dir/git-ancestry.tsv"

  # Initialize TSV with header
  printf 'repo\tnpub\trelationship\tdetails\n' > "$tsv_file"

  # Count repos: lines that are neither comments nor blank, mirroring what the
  # loop below skips. grep -c prints "0" AND exits 1 when nothing matches, so
  # the failure is absorbed with `|| true` instead of echoing a second "0".
  local total_repos
  total_repos=$(grep -c -v -e '^#' -e '^[[:space:]]*$' "$repo_list" 2>/dev/null || true)
  total_repos="${total_repos:-0}"
  log_info "Processing $total_repos repos..."
  echo ""

  # Counters
  local count=0
  local count_archive_ahead=0
  local count_in_sync=0
  local count_prod_ahead=0
  local count_diverged=0
  local count_archive_only=0
  local count_prod_only=0
  local count_both_empty=0

  local repo npub _rest
  local prod_git archive_git relationship
  # The details column is part of the TSV format but is never populated here.
  local details=""

  # Process each repo; the `|| [[ -n ... ]]` keeps a final line that lacks a
  # trailing newline.
  while IFS='|' read -r repo npub _rest || [[ -n "$repo" ]]; do
    # Skip comments and empty lines
    [[ "$repo" =~ ^# ]] && continue
    [[ -z "$repo" ]] && continue

    # Clean up whitespace (fields are written as "repo | npub | ...")
    repo="${repo// /}"
    npub="${npub// /}"

    [[ -z "$repo" || -z "$npub" ]] && continue

    count=$((count + 1))

    # Build git paths
    prod_git="$prod_git_base/${npub}/${repo}.git"
    archive_git="$archive_git_base/${npub}/${repo}.git"

    # Compare
    relationship=$(compare_repo_git "$prod_git" "$archive_git")

    # Count by relationship
    case "$relationship" in
      archive-ahead) count_archive_ahead=$((count_archive_ahead + 1)) ;;
      in-sync) count_in_sync=$((count_in_sync + 1)) ;;
      prod-ahead) count_prod_ahead=$((count_prod_ahead + 1)) ;;
      diverged) count_diverged=$((count_diverged + 1)) ;;
      archive-only) count_archive_only=$((count_archive_only + 1)) ;;
      prod-only) count_prod_only=$((count_prod_only + 1)) ;;
      both-empty) count_both_empty=$((count_both_empty + 1)) ;;
    esac

    # Output TSV line
    printf '%s\t%s\t%s\t%s\n' "$repo" "$npub" "$relationship" "$details" >> "$tsv_file"

    # Progress indicator every 10 repos
    if [[ $((count % 10)) -eq 0 ]]; then
      log_progress "Processed $count/$total_repos repos..."
    fi
  done < "$repo_list"

  # Clear progress line
  echo "" >&2

  # Summary
  echo ""
  log_info "=== Comparison Summary ==="
  log_success "Archive ahead (use archive data): $count_archive_ahead"
  log_success "In sync: $count_in_sync"
  log_warn "Prod ahead (needs re-sync): $count_prod_ahead"
  log_error "Diverged (manual review): $count_diverged"
  log_info "Archive only: $count_archive_only"
  log_info "Prod only: $count_prod_only"
  log_info "Both empty: $count_both_empty"
  echo ""
  log_info "Total: $count repos"
  log_info "Output: $tsv_file"
}
| 389 | |||
# Script entry point: forward all command-line arguments to main unchanged.
main "$@"