upleb.uk

Public git repos — served from a NIP-34 GRASP relay at git.upleb.uk

summary | refs | log | tree | commit | diff
path: root/docs/how-to/migration-scripts
diff options
context:
space:
mode:
author: DanConwayDev <DanConwayDev@protonmail.com> 2026-01-28 14:17:30 +0000
committer: DanConwayDev <DanConwayDev@protonmail.com> 2026-01-28 14:17:30 +0000
commit: 3c1eda5fc9e660d40cadcdef8903aea986fe3242 (patch)
tree: c11f81ca30069f4deca24de7c9c47368733ab7b8 /docs/how-to/migration-scripts
parent: efc3da477d4edb9d1334718e3e20d197ba711468 (diff)
feat(migration): detect when archive git data is ahead of prod
Add git ancestry comparison (22-compare-git-data.sh) to determine commit relationships between prod and archive repos. Repos where archive is ahead are now correctly classified as ready-for-migration since ngit-grasp only accepts git data authorized by state events. Previously, repos with different git data were flagged as needs-resync even when archive had newer/better data than prod.
Diffstat (limited to 'docs/how-to/migration-scripts')
-rwxr-xr-x docs/how-to/migration-scripts/22-compare-git-data.sh 390
-rwxr-xr-x docs/how-to/migration-scripts/40-classify-actions.sh 84
-rwxr-xr-x docs/how-to/migration-scripts/run-migration-analysis.sh 16
3 files changed, 481 insertions, 9 deletions
diff --git a/docs/how-to/migration-scripts/22-compare-git-data.sh b/docs/how-to/migration-scripts/22-compare-git-data.sh
new file mode 100755
index 0000000..76521d4
--- /dev/null
+++ b/docs/how-to/migration-scripts/22-compare-git-data.sh
@@ -0,0 +1,390 @@
1#!/usr/bin/env bash
2#
3# 22-compare-git-data.sh - Compare actual git data between prod and archive relays
4#
5# PHASE 3c of the GRASP relay to ngit-grasp migration analysis pipeline.
6# Compares actual git commits between prod and archive to determine which is ahead.
7#
8# KEY INSIGHT:
9# Archive (ngit-grasp) enforces GRASP - git data ALWAYS matches a state event.
10# If archive has different/newer data than prod, it means:
11# - A state event authorized those commits at some point
12# - Archive is actually MORE up-to-date than prod
13# - Migration should use archive data (it's already correct)
14#
15# USAGE:
16# ./22-compare-git-data.sh <prod-git-base> <archive-git-base> <repo-list> <output-dir>
17#
18# EXAMPLES:
19# ./22-compare-git-data.sh /var/lib/grasp-relay/git /var/lib/ngit-grasp/git \
20# output/comparison/complete-prod-incomplete-archive.txt output/comparison
21#
22# INPUT:
23# prod-git-base Base directory for prod git repos (e.g., /var/lib/grasp-relay/git)
24# archive-git-base Base directory for archive git repos (e.g., /var/lib/ngit-grasp/git)
25# repo-list File with repos to compare (format: "repo | npub | ...")
26#
27# OUTPUT:
28# <output-dir>/git-ancestry.tsv - Tab-separated values:
29# repo<TAB>npub<TAB>relationship<TAB>details
30#
31# Relationship values:
32# archive-ahead - Archive has all prod commits plus more (GOOD - use archive)
33# in-sync - Both have identical commits
34# prod-ahead - Prod has commits archive is missing (needs re-sync)
35# diverged - Both have unique commits (manual review)
36# archive-only - Only archive has git data
37# prod-only - Only prod has git data
38# both-empty - Neither has git data
39#
40# PREREQUISITES:
41# - git (for ref comparison)
42# - Read access to both git directories (may need sudo)
43#
44# RUNTIME: Depends on number of repos to compare
45#
46# SEE ALSO:
47# docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
48# 21-compare-relays.sh - Phase 3b script that identifies repos to compare
49#
50
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Colors for log output.
# Every log_* helper in this script writes to stderr, so the terminal check is
# made against stderr (fd 2). Testing stdout instead would leak escape codes
# into a redirected stderr log and drop colors when only stdout is redirected.
RED=''
GREEN=''
YELLOW=''
BLUE=''
NC=''
if [[ -t 2 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
fi
67
# Write an informational message to stderr, prefixed with the [INFO] tag.
# Args: $* = message text (backslash escapes in it are interpreted)
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $*" >&2
}
71
# Write a success message to stderr, prefixed with the [OK] tag.
# Args: $* = message text (backslash escapes in it are interpreted)
log_success() {
  printf '%b\n' "${GREEN}[OK]${NC} $*" >&2
}
75
# Write a warning message to stderr, prefixed with the [WARN] tag.
# Args: $* = message text (backslash escapes in it are interpreted)
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $*" >&2
}
79
# Write an error message to stderr, prefixed with the [ERROR] tag.
# Does not exit; callers decide whether the error is fatal.
# Args: $* = message text (backslash escapes in it are interpreted)
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $*" >&2
}
83
# Write a progress message to stderr, prefixed with the [PROGRESS] tag.
# Starts with a carriage return and emits no trailing newline, so successive
# calls overwrite the same terminal line.
# Args: $* = message text (backslash escapes in it are interpreted)
log_progress() {
  printf '%b' "\r${BLUE}[PROGRESS]${NC} $*" >&2
}
87
# Print the command-line help text to stdout and terminate the script with
# exit status 1. Never returns to the caller.
usage() {
  printf '%s\n' \
    "Usage: $0 <prod-git-base> <archive-git-base> <repo-list> <output-dir>" \
    "" \
    "Arguments:" \
    " prod-git-base Base directory for prod git repos" \
    " archive-git-base Base directory for archive git repos" \
    " repo-list File with repos to compare (format: 'repo | npub | ...')" \
    " output-dir Directory to store output files" \
    "" \
    "Examples:" \
    " $0 /var/lib/grasp-relay/git /var/lib/ngit-grasp/git \\" \
    " output/comparison/complete-prod-incomplete-archive.txt output/comparison" \
    "" \
    "Output:" \
    " git-ancestry.tsv - TSV with: repo, npub, relationship, details"
  exit 1
}
105
# List the branch heads of a git directory.
# Args: $1=git_dir
# Outputs: sorted `git show-ref --heads` lines, i.e. "<commit_hash> <ref_name>"
#          (hash first). Prints nothing when the directory does not exist or
#          git reports no heads.
# Returns: always 0; git errors are deliberately silenced so that an unreadable
#          or empty repo looks the same as one without branches.
get_git_refs() {
  local repo_path="$1"

  # Nothing to list for a missing directory.
  [[ -d "${repo_path}" ]] || return 0

  git --git-dir="${repo_path}" show-ref --heads 2>/dev/null \
    | sort \
    || true
}
118
# Test whether commit A is an ancestor of commit B inside one repository.
# Args: $1=git_dir, $2=commit_a, $3=commit_b
# Returns: 0 if A is an ancestor of B; non-zero otherwise. The status is
#          whatever `git merge-base --is-ancestor` exits with, so a commit that
#          the repository cannot resolve also yields non-zero.
#          git's stderr is discarded.
is_ancestor() {
  local repo_path="$1" older="$2" newer="$3"

  git --git-dir="${repo_path}" merge-base --is-ancestor "${older}" "${newer}" 2>/dev/null
}
129
# Print the commit hash recorded for one ref in a "<hash> <ref>" listing.
# Args: $1=listing (newline-separated "<hash> <ref>" lines), $2=ref name
# The ref name is compared literally (no regex), so "v1.0" never matches "v1x0".
# Prints nothing when the ref is absent; always returns 0.
_find_ref_hash() {
  local listing="$1" wanted="$2"
  local hash name

  while read -r hash name; do
    if [[ "$name" == "$wanted" ]]; then
      printf '%s\n' "$hash"
      return 0
    fi
  done <<< "$listing"
  return 0
}

# Compare git data between prod and archive for a single repo
# Args: $1=prod_git_dir, $2=archive_git_dir
# Outputs (stdout): exactly one relationship string:
#   both-empty | archive-only | prod-only | in-sync |
#   archive-ahead | prod-ahead | diverged
#
# A missing directory and a repo without branch heads are treated alike
# (get_git_refs prints nothing for both).
#
# For every branch present on both sides with different tips:
#   - prod tip is an ancestor of archive tip  -> archive is ahead on that branch
#   - archive tip is an ancestor of prod tip  -> prod is ahead on that branch
#   - neither                                  -> diverged
# Each ancestry question is asked in the repository that owns the *newer*
# candidate tip: `merge-base --is-ancestor` can only answer for commits that
# exist in the repository it runs in, and a side that is behind does not have
# the other side's newer commit. Asking both questions in the archive repo
# would misreport every "prod ahead" repo as diverged.
#
# Branches that exist only in prod are ignored on purpose (they may be
# branches deleted on the archive side); branches that exist only in archive
# count as archive having extra data.
compare_repo_git() {
  local prod_git="$1"
  local archive_git="$2"

  local prod_refs archive_refs
  prod_refs=$(get_git_refs "$prod_git")
  archive_refs=$(get_git_refs "$archive_git")

  # Missing directories and empty repos both end up here with empty listings.
  if [[ -z "$prod_refs" && -z "$archive_refs" ]]; then
    echo "both-empty"
    return
  fi
  if [[ -z "$prod_refs" ]]; then
    echo "archive-only"
    return
  fi
  if [[ -z "$archive_refs" ]]; then
    echo "prod-only"
    return
  fi

  # Identical sorted listings: same branches at the same commits.
  if [[ "$prod_refs" == "$archive_refs" ]]; then
    echo "in-sync"
    return
  fi

  # Each flag starts true and is cleared by any evidence against it.
  local archive_ahead=true
  local prod_ahead=true
  local has_common_branch=false

  local prod_hash prod_ref archive_hash archive_ref

  # Pass 1: walk prod branches and compare against the same branch in archive.
  while read -r prod_hash prod_ref; do
    [[ -z "$prod_hash" ]] && continue

    archive_hash=$(_find_ref_hash "$archive_refs" "$prod_ref")
    if [[ -z "$archive_hash" ]]; then
      # Prod-only branch: possibly deleted on the archive side, so it is not
      # counted as prod being ahead.
      continue
    fi

    has_common_branch=true

    # Same tip: this branch says nothing about who is ahead.
    [[ "$prod_hash" == "$archive_hash" ]] && continue

    if is_ancestor "$archive_git" "$prod_hash" "$archive_hash"; then
      # Archive contains prod's tip in its history: archive is ahead here.
      prod_ahead=false
    elif is_ancestor "$prod_git" "$archive_hash" "$prod_hash"; then
      # Prod contains archive's tip in its history: prod is ahead here.
      archive_ahead=false
    else
      # Neither history contains the other's tip.
      archive_ahead=false
      prod_ahead=false
    fi
  done <<< "$prod_refs"

  # Pass 2: a branch that only archive has means prod cannot be the one ahead.
  while read -r archive_hash archive_ref; do
    [[ -z "$archive_hash" ]] && continue

    if [[ -z "$(_find_ref_hash "$prod_refs" "$archive_ref")" ]]; then
      prod_ahead=false
    fi
  done <<< "$archive_refs"

  # With no shared branch there is nothing to relate the two histories by.
  if [[ "$has_common_branch" == "false" ]]; then
    echo "diverged"
    return
  fi

  if [[ "$archive_ahead" == "true" && "$prod_ahead" == "false" ]]; then
    echo "archive-ahead"
  elif [[ "$prod_ahead" == "true" && "$archive_ahead" == "false" ]]; then
    echo "prod-ahead"
  elif [[ "$archive_ahead" == "true" && "$prod_ahead" == "true" ]]; then
    # All shared branches are identical and archive has no extra branch;
    # prod may still have extra (ignored) branches.
    echo "in-sync"
  else
    echo "diverged"
  fi
}
269
# Main
# Args: $1=prod_git_base, $2=archive_git_base, $3=repo_list, $4=output_dir
#
# Reads repo_list ("repo | npub | ..." per line; lines starting with '#' and
# empty lines are skipped), classifies every repo with compare_repo_git and
# writes <output_dir>/git-ancestry.tsv (header + one row per repo).
# Log messages go to stderr via the log_* helpers; only blank spacer lines are
# written to stdout.
# Exits 1 (via usage or directly) on a wrong argument count or missing input.
main() {
  if [[ $# -ne 4 ]]; then
    usage
  fi

  local prod_git_base="$1"
  local archive_git_base="$2"
  local repo_list="$3"
  local output_dir="$4"

  # Validate inputs
  if [[ ! -d "$prod_git_base" ]]; then
    log_error "Prod git base directory not found: $prod_git_base"
    exit 1
  fi

  if [[ ! -d "$archive_git_base" ]]; then
    log_error "Archive git base directory not found: $archive_git_base"
    exit 1
  fi

  if [[ ! -f "$repo_list" ]]; then
    log_error "Repo list file not found: $repo_list"
    exit 1
  fi

  log_info "=== Git Data Comparison ==="
  log_info "Prod git base: $prod_git_base"
  log_info "Archive git base: $archive_git_base"
  log_info "Repo list: $repo_list"
  log_info "Output: $output_dir"
  log_info "Started: $(date)"
  echo ""

  # Create output directory
  mkdir -p "$output_dir"

  # Output file
  local tsv_file="$output_dir/git-ancestry.tsv"

  # Initialize TSV with header
  printf 'repo\tnpub\trelationship\tdetails\n' > "$tsv_file"

  # Count candidate repos for the progress display (comments and blank lines
  # excluded, matching what the loop below skips).
  # grep -c prints "0" AND exits 1 when nothing matches, so the fallback must
  # not print a second "0"; `|| true` only neutralises the exit status.
  local total_repos
  total_repos=$(grep -c -v -e '^#' -e '^[[:space:]]*$' -- "$repo_list" 2>/dev/null || true)
  total_repos=${total_repos:-0}
  log_info "Processing $total_repos repos..."
  echo ""

  # Counters
  local count=0
  local count_archive_ahead=0
  local count_in_sync=0
  local count_prod_ahead=0
  local count_diverged=0
  local count_archive_only=0
  local count_prod_only=0
  local count_both_empty=0

  local repo npub rest
  local prod_git archive_git relationship
  # Reserved 4th TSV column; nothing fills it yet.
  local details=""

  # Process each repo. The `|| [[ -n "$repo" ]]` keeps a final line that has
  # no trailing newline.
  while IFS='|' read -r repo npub rest || [[ -n "$repo" ]]; do
    # Skip comments and empty lines
    [[ "$repo" =~ ^# ]] && continue
    [[ -z "$repo" ]] && continue

    # Clean up whitespace (fields are written as "repo | npub | ...")
    repo="${repo// /}"
    npub="${npub// /}"

    [[ -z "$repo" || -z "$npub" ]] && continue

    count=$((count + 1))

    # Build git paths: <base>/<npub>/<repo>.git
    prod_git="$prod_git_base/${npub}/${repo}.git"
    archive_git="$archive_git_base/${npub}/${repo}.git"

    # Compare
    relationship=$(compare_repo_git "$prod_git" "$archive_git")

    # Count by relationship
    case "$relationship" in
      archive-ahead) count_archive_ahead=$((count_archive_ahead + 1)) ;;
      in-sync) count_in_sync=$((count_in_sync + 1)) ;;
      prod-ahead) count_prod_ahead=$((count_prod_ahead + 1)) ;;
      diverged) count_diverged=$((count_diverged + 1)) ;;
      archive-only) count_archive_only=$((count_archive_only + 1)) ;;
      prod-only) count_prod_only=$((count_prod_only + 1)) ;;
      both-empty) count_both_empty=$((count_both_empty + 1)) ;;
    esac

    # Output TSV line
    printf '%s\t%s\t%s\t%s\n' "$repo" "$npub" "$relationship" "$details" >> "$tsv_file"

    # Progress indicator every 10 repos
    if [[ $((count % 10)) -eq 0 ]]; then
      log_progress "Processed $count/$total_repos repos..."
    fi
  done < "$repo_list"

  # Clear progress line
  echo "" >&2

  # Summary
  echo ""
  log_info "=== Comparison Summary ==="
  log_success "Archive ahead (use archive data): $count_archive_ahead"
  log_success "In sync: $count_in_sync"
  log_warn "Prod ahead (needs re-sync): $count_prod_ahead"
  log_error "Diverged (manual review): $count_diverged"
  log_info "Archive only: $count_archive_only"
  log_info "Prod only: $count_prod_only"
  log_info "Both empty: $count_both_empty"
  echo ""
  log_info "Total: $count repos"
  log_info "Output: $tsv_file"
}
389
# Script entry point: pass every command-line argument through to main unchanged.
main "${@}"
diff --git a/docs/how-to/migration-scripts/40-classify-actions.sh b/docs/how-to/migration-scripts/40-classify-actions.sh
index b1348f8..07ae7c9 100755
--- a/docs/how-to/migration-scripts/40-classify-actions.sh
+++ b/docs/how-to/migration-scripts/40-classify-actions.sh
@@ -10,16 +10,25 @@
10# - Empty in prod (prod=cat2, any archive status) 10# - Empty in prod (prod=cat2, any archive status)
11# - Archive-only (archive=any, prod=missing) 11# - Archive-only (archive=any, prod=missing)
12# - Not in prod (purgatory-only, prod=missing) 12# - Not in prod (purgatory-only, prod=missing)
13# - Archive ahead (archive has newer git data than prod - GRASP enforced)
13# 14#
14# Tier 2: Action Required (needs-resync.txt) 15# Tier 2: Action Required (needs-resync.txt)
15# - Complete in prod, missing from archive (with purgatory context) 16# - Complete in prod, missing from archive (with purgatory context)
16# - Complete in prod, incomplete in archive (with purgatory context) 17# - Complete in prod, incomplete in archive AND prod is ahead (with purgatory context)
17# 18#
18# Tier 3: Manual Investigation (manual-review.txt) 19# Tier 3: Manual Investigation (manual-review.txt)
19# - Partial in prod (prod=cat3) 20# - Partial in prod (prod=cat3)
20# - No-match in prod (prod=cat4) 21# - No-match in prod (prod=cat4)
21# - Parse failures 22# - Parse failures
22# - Conflicting states 23# - Conflicting states
24# - Diverged git history (both have unique commits)
25#
26# KEY INSIGHT:
27# Archive (ngit-grasp) enforces GRASP - git data ALWAYS matches a state event.
28# If archive has different/newer data than prod, it means:
29# - A state event authorized those commits at some point
30# - Archive is actually MORE up-to-date than prod
31# - Migration should use archive data (it's already correct)
23# 32#
24# Usage: ./40-classify-actions.sh <analysis-dir> 33# Usage: ./40-classify-actions.sh <analysis-dir>
25# 34#
@@ -231,6 +240,25 @@ DELETED_COUNT=0
231[[ ${#DELETED[@]} -gt 0 ]] && DELETED_COUNT=${#DELETED[@]} 240[[ ${#DELETED[@]} -gt 0 ]] && DELETED_COUNT=${#DELETED[@]}
232log_info "Loaded $DELETED_COUNT deletion entries" 241log_info "Loaded $DELETED_COUNT deletion entries"
233 242
243# Build git ancestry lookup: repo|npub -> relationship (archive-ahead, prod-ahead, diverged, etc.)
244# This data comes from 22-compare-git-data.sh which compares actual git commits
245declare -A GIT_ANCESTRY
246GIT_ANCESTRY_COUNT=0
247if [[ -f "$COMPARISON_DIR/git-ancestry.tsv" ]]; then
248 while IFS=$'\t' read -r repo npub relationship details || [[ -n "$repo" ]]; do
249 # Skip header and comments
250 [[ "$repo" == "repo" ]] && continue
251 [[ "$repo" =~ ^# ]] && continue
252 [[ -z "$repo" || -z "$npub" ]] && continue
253 GIT_ANCESTRY["$repo|$npub"]="$relationship"
254 GIT_ANCESTRY_COUNT=$((GIT_ANCESTRY_COUNT + 1))
255 done < "$COMPARISON_DIR/git-ancestry.tsv"
256 log_info "Loaded $GIT_ANCESTRY_COUNT git ancestry entries"
257else
258 log_warn "No git-ancestry.tsv found - will not check if archive is ahead of prod"
259 log_warn "Run 22-compare-git-data.sh to enable archive-ahead detection"
260fi
261
234# ============================================================================ 262# ============================================================================
235# Phase 2: Build unique repo list from all sources 263# Phase 2: Build unique repo list from all sources
236# ============================================================================ 264# ============================================================================
@@ -263,12 +291,14 @@ COUNTS[ready_deleted]=0
263COUNTS[ready_empty_prod]=0 291COUNTS[ready_empty_prod]=0
264COUNTS[ready_archive_only]=0 292COUNTS[ready_archive_only]=0
265COUNTS[ready_not_in_prod]=0 293COUNTS[ready_not_in_prod]=0
294COUNTS[ready_archive_ahead]=0
266COUNTS[resync_missing_archive]=0 295COUNTS[resync_missing_archive]=0
267COUNTS[resync_incomplete_archive]=0 296COUNTS[resync_incomplete_archive]=0
268COUNTS[review_partial_prod]=0 297COUNTS[review_partial_prod]=0
269COUNTS[review_nomatch_prod]=0 298COUNTS[review_nomatch_prod]=0
270COUNTS[review_parse_failure]=0 299COUNTS[review_parse_failure]=0
271COUNTS[review_conflicting]=0 300COUNTS[review_conflicting]=0
301COUNTS[review_diverged]=0
272 302
273# Output arrays 303# Output arrays
274declare -a READY_LINES 304declare -a READY_LINES
@@ -381,14 +411,48 @@ for key in "${!ALL_REPOS[@]}"; do
381 REVIEW_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | complete in prod with parse failure") 411 REVIEW_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | complete in prod with parse failure")
382 COUNTS[review_parse_failure]=$((COUNTS[review_parse_failure] + 1)) 412 COUNTS[review_parse_failure]=$((COUNTS[review_parse_failure] + 1))
383 else 413 else
384 # Needs resync - include purgatory context 414 # Check git ancestry to see if archive is actually ahead
385 context=$(get_context "$key" "$prod_status" "$archive_status") 415 local git_relationship="${GIT_ANCESTRY[$key]:-unknown}"
386 if [[ "$archive_cat" == "missing" ]]; then 416
387 RESYNC_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | trigger re-sync to archive") 417 if [[ "$git_relationship" == "archive-ahead" || "$git_relationship" == "in-sync" ]]; then
388 COUNTS[resync_missing_archive]=$((COUNTS[resync_missing_archive] + 1)) 418 # Archive has newer/same git data - this is GOOD
419 # Archive's git data was authorized by a state event (GRASP enforced)
420 context=$(get_context "$key" "$prod_status" "$archive_status")
421 if [[ -n "$context" && "$context" != "none" ]]; then
422 context="$context, git=$git_relationship"
423 else
424 context="git=$git_relationship"
425 fi
426 READY_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | archive ahead (use archive data)")
427 COUNTS[ready_archive_ahead]=$((COUNTS[ready_archive_ahead] + 1))
428 elif [[ "$git_relationship" == "diverged" ]]; then
429 # Git histories diverged - needs manual review
430 context=$(get_context "$key" "$prod_status" "$archive_status")
431 if [[ -n "$context" && "$context" != "none" ]]; then
432 context="$context, git=diverged"
433 else
434 context="git=diverged"
435 fi
436 REVIEW_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | git histories diverged (manual review)")
437 COUNTS[review_diverged]=$((COUNTS[review_diverged] + 1))
389 else 438 else
390 RESYNC_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | trigger re-sync (archive incomplete)") 439 # prod-ahead, archive-only, prod-only, both-empty, or unknown
391 COUNTS[resync_incomplete_archive]=$((COUNTS[resync_incomplete_archive] + 1)) 440 # These need resync - include purgatory context
441 context=$(get_context "$key" "$prod_status" "$archive_status")
442 if [[ "$git_relationship" != "unknown" ]]; then
443 if [[ -n "$context" && "$context" != "none" ]]; then
444 context="$context, git=$git_relationship"
445 else
446 context="git=$git_relationship"
447 fi
448 fi
449 if [[ "$archive_cat" == "missing" ]]; then
450 RESYNC_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | trigger re-sync to archive")
451 COUNTS[resync_missing_archive]=$((COUNTS[resync_missing_archive] + 1))
452 else
453 RESYNC_LINES+=("$repo | $npub | $prod_status | $archive_status | $context | trigger re-sync (archive incomplete)")
454 COUNTS[resync_incomplete_archive]=$((COUNTS[resync_incomplete_archive] + 1))
455 fi
392 fi 456 fi
393 fi 457 fi
394 fi 458 fi
@@ -498,6 +562,7 @@ fi
498 echo "| Reason | Count |" 562 echo "| Reason | Count |"
499 echo "|--------|-------|" 563 echo "|--------|-------|"
500 echo "| complete in both prod and archive | ${COUNTS[ready_complete_both]} |" 564 echo "| complete in both prod and archive | ${COUNTS[ready_complete_both]} |"
565 echo "| archive ahead (has newer git data) | ${COUNTS[ready_archive_ahead]} |"
501 echo "| deleted by user | ${COUNTS[ready_deleted]} |" 566 echo "| deleted by user | ${COUNTS[ready_deleted]} |"
502 echo "| empty in prod (user never pushed) | ${COUNTS[ready_empty_prod]} |" 567 echo "| empty in prod (user never pushed) | ${COUNTS[ready_empty_prod]} |"
503 echo "| archive-only (not in prod) | ${COUNTS[ready_archive_only]} |" 568 echo "| archive-only (not in prod) | ${COUNTS[ready_archive_only]} |"
@@ -527,6 +592,7 @@ fi
527 echo "| partial in prod (cat3) | ${COUNTS[review_partial_prod]} |" 592 echo "| partial in prod (cat3) | ${COUNTS[review_partial_prod]} |"
528 echo "| no-match in prod (cat4) | ${COUNTS[review_nomatch_prod]} |" 593 echo "| no-match in prod (cat4) | ${COUNTS[review_nomatch_prod]} |"
529 echo "| complete in prod with parse failure | ${COUNTS[review_parse_failure]} |" 594 echo "| complete in prod with parse failure | ${COUNTS[review_parse_failure]} |"
595 echo "| git histories diverged | ${COUNTS[review_diverged]} |"
530 echo "" 596 echo ""
531 echo "## Input Data Summary" 597 echo "## Input Data Summary"
532 echo "" 598 echo ""
@@ -571,6 +637,7 @@ echo ""
571echo "=== Summary ===" 637echo "=== Summary ==="
572echo "Ready for Migration: $TOTAL_READY ($PCT_READY%)" 638echo "Ready for Migration: $TOTAL_READY ($PCT_READY%)"
573echo " - Complete in both: ${COUNTS[ready_complete_both]}" 639echo " - Complete in both: ${COUNTS[ready_complete_both]}"
640echo " - Archive ahead: ${COUNTS[ready_archive_ahead]}"
574echo " - Deleted by user: ${COUNTS[ready_deleted]}" 641echo " - Deleted by user: ${COUNTS[ready_deleted]}"
575echo " - Empty in prod: ${COUNTS[ready_empty_prod]}" 642echo " - Empty in prod: ${COUNTS[ready_empty_prod]}"
576echo " - Archive-only: ${COUNTS[ready_archive_only]}" 643echo " - Archive-only: ${COUNTS[ready_archive_only]}"
@@ -584,6 +651,7 @@ echo "Manual Review: $TOTAL_REVIEW ($PCT_REVIEW%)"
584echo " - Partial in prod: ${COUNTS[review_partial_prod]}" 651echo " - Partial in prod: ${COUNTS[review_partial_prod]}"
585echo " - No-match in prod: ${COUNTS[review_nomatch_prod]}" 652echo " - No-match in prod: ${COUNTS[review_nomatch_prod]}"
586echo " - Parse failures: ${COUNTS[review_parse_failure]}" 653echo " - Parse failures: ${COUNTS[review_parse_failure]}"
654echo " - Git diverged: ${COUNTS[review_diverged]}"
587echo "" 655echo ""
588echo "Total: $TOTAL repos" 656echo "Total: $TOTAL repos"
589echo "" 657echo ""
diff --git a/docs/how-to/migration-scripts/run-migration-analysis.sh b/docs/how-to/migration-scripts/run-migration-analysis.sh
index 089b553..acc5e44 100755
--- a/docs/how-to/migration-scripts/run-migration-analysis.sh
+++ b/docs/how-to/migration-scripts/run-migration-analysis.sh
@@ -320,7 +320,7 @@ check_prerequisites() {
320 fi 320 fi
321 321
322 # Check scripts exist 322 # Check scripts exist
323 for script in 01-fetch-events.sh 10-check-git-sync.sh 20-categorize.sh 21-compare-relays.sh 30-extract-parse-failures.sh 31-extract-purgatory-expiry.sh 40-classify-actions.sh; do 323 for script in 01-fetch-events.sh 10-check-git-sync.sh 20-categorize.sh 21-compare-relays.sh 22-compare-git-data.sh 30-extract-parse-failures.sh 31-extract-purgatory-expiry.sh 40-classify-actions.sh; do
324 if [[ ! -x "$SCRIPT_DIR/$script" ]]; then 324 if [[ ! -x "$SCRIPT_DIR/$script" ]]; then
325 log_error "Script not found or not executable: $SCRIPT_DIR/$script" 325 log_error "Script not found or not executable: $SCRIPT_DIR/$script"
326 missing=1 326 missing=1
@@ -551,6 +551,20 @@ run_phase_3() {
551 fi 551 fi
552 552
553 run_phase 3 "Categorize & Compare (fast)" "${cmds[@]}" 553 run_phase 3 "Categorize & Compare (fast)" "${cmds[@]}"
554
555 # Phase 3c: Compare git data between relays (requires git paths)
556 # This determines if archive is ahead of prod for repos with mismatched state
557 if [[ -n "$PROD_GIT" && -n "$ARCHIVE_GIT" ]]; then
558 # Build list of repos to compare: those where prod=complete but archive is not
559 local repos_to_compare="$OUTPUT_DIR/comparison/complete-prod-incomplete-archive.txt"
560 if [[ -f "$repos_to_compare" ]] && [[ ! -f "$OUTPUT_DIR/comparison/git-ancestry.tsv" ]]; then
561 log_info "Running git ancestry comparison (Phase 3c)..."
562 run_phase 3 "Git Ancestry Comparison" "'$SCRIPT_DIR/22-compare-git-data.sh' '$PROD_GIT' '$ARCHIVE_GIT' '$repos_to_compare' '$OUTPUT_DIR/comparison'"
563 fi
564 else
565 log_warn "Git paths not provided - skipping git ancestry comparison"
566 log_warn "Without git comparison, repos where archive is ahead will be incorrectly flagged as needing re-sync"
567 fi
554} 568}
555 569
556# Phase 4: Extract logs 570# Phase 4: Extract logs