upleb.uk

Public git repos — served from a NIP-34 GRASP relay at git.upleb.uk

summary | refs | log | tree | commit | diff
path: root/docs/how-to/migration-scripts/22-compare-git-data.sh
diff options
context:
space:
mode:
Diffstat (limited to 'docs/how-to/migration-scripts/22-compare-git-data.sh')
-rwxr-xr-x docs/how-to/migration-scripts/22-compare-git-data.sh 390
1 files changed, 390 insertions, 0 deletions
diff --git a/docs/how-to/migration-scripts/22-compare-git-data.sh b/docs/how-to/migration-scripts/22-compare-git-data.sh
new file mode 100755
index 0000000..76521d4
--- /dev/null
+++ b/docs/how-to/migration-scripts/22-compare-git-data.sh
@@ -0,0 +1,390 @@
1#!/usr/bin/env bash
2#
3# 22-compare-git-data.sh - Compare actual git data between prod and archive relays
4#
5# PHASE 3c of the GRASP relay to ngit-grasp migration analysis pipeline.
6# Compares actual git commits between prod and archive to determine which is ahead.
7#
8# KEY INSIGHT:
9# Archive (ngit-grasp) enforces GRASP - git data ALWAYS matches a state event.
10# If archive has different/newer data than prod, it means:
11# - A state event authorized those commits at some point
12# - Archive is actually MORE up-to-date than prod
13# - Migration should use archive data (it's already correct)
14#
15# USAGE:
16# ./22-compare-git-data.sh <prod-git-base> <archive-git-base> <repo-list> <output-dir>
17#
18# EXAMPLES:
19# ./22-compare-git-data.sh /var/lib/grasp-relay/git /var/lib/ngit-grasp/git \
20# output/comparison/complete-prod-incomplete-archive.txt output/comparison
21#
22# INPUT:
23# prod-git-base Base directory for prod git repos (e.g., /var/lib/grasp-relay/git)
24# archive-git-base Base directory for archive git repos (e.g., /var/lib/ngit-grasp/git)
# repo-list File with repos to compare (format: "repo | npub | ...")
# output-dir Directory that receives git-ancestry.tsv (created if it does not exist)
26#
27# OUTPUT:
28# <output-dir>/git-ancestry.tsv - Tab-separated values:
29# repo<TAB>npub<TAB>relationship<TAB>details
30#
31# Relationship values:
32# archive-ahead - Archive has all prod commits plus more (GOOD - use archive)
33# in-sync - Both have identical commits
34# prod-ahead - Prod has commits archive is missing (needs re-sync)
35# diverged - Both have unique commits (manual review)
36# archive-only - Only archive has git data
37# prod-only - Only prod has git data
38# both-empty - Neither has git data
39#
40# PREREQUISITES:
41# - git (for ref comparison)
42# - Read access to both git directories (may need sudo)
43#
44# RUNTIME: Depends on number of repos to compare
45#
46# SEE ALSO:
47# docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
48# 21-compare-relays.sh - Phase 3b script that identifies repos to compare
49#
50
set -euo pipefail

# Colors for log output. Every log_* helper in this script writes to stderr,
# so color is enabled only when stderr (fd 2) is a terminal. When stderr is
# redirected to a file or pipe the variables are empty, which keeps escape
# sequences out of captured logs even if stdout is still a terminal.
if [[ -t 2 ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  BLUE='\033[0;34m'
  NC='\033[0m'
else
  RED=''
  GREEN=''
  YELLOW=''
  BLUE=''
  NC=''
fi
67
# Write an informational message to stderr with an [INFO] tag.
# Globals: BLUE, NC (read) - color escape strings, possibly empty
# Args: $* = message text, printed verbatim
log_info() {
  # %b expands the escapes stored in the color variables only; the message
  # goes through %s so backslashes in paths or names are not interpreted.
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$*" >&2
}
71
# Write a success message to stderr with an [OK] tag.
# Globals: GREEN, NC (read) - color escape strings, possibly empty
# Args: $* = message text, printed verbatim
log_success() {
  # Only the color variables get escape expansion (%b); the message is %s so
  # any backslashes it contains are printed literally.
  printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$*" >&2
}
75
# Write a warning message to stderr with a [WARN] tag.
# Globals: YELLOW, NC (read) - color escape strings, possibly empty
# Args: $* = message text, printed verbatim
log_warn() {
  # Only the color variables get escape expansion (%b); the message is %s so
  # any backslashes it contains are printed literally.
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*" >&2
}
79
# Write an error message to stderr with an [ERROR] tag.
# Globals: RED, NC (read) - color escape strings, possibly empty
# Args: $* = message text, printed verbatim
log_error() {
  # Error messages in this script embed user-supplied paths; printing them via
  # %s keeps backslash sequences in those paths from being interpreted.
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$*" >&2
}
83
# Rewrite the current stderr line with a [PROGRESS] tag and message.
# Emits a leading carriage return and no trailing newline, so successive
# calls overwrite each other on a terminal.
# Globals: BLUE, NC (read) - color escape strings, possibly empty
# Args: $* = message text, printed verbatim
log_progress() {
  # The message goes through %s so backslashes in it are not interpreted.
  printf '\r%b[PROGRESS]%b %s' "$BLUE" "$NC" "$*" >&2
}
87
# Print the command-line synopsis to stdout and terminate with exit status 1.
usage() {
  local self="$0"

  # One printf call emits every line; each argument becomes one output line.
  printf '%s\n' \
    "Usage: ${self} <prod-git-base> <archive-git-base> <repo-list> <output-dir>" \
    "" \
    "Arguments:" \
    " prod-git-base Base directory for prod git repos" \
    " archive-git-base Base directory for archive git repos" \
    " repo-list File with repos to compare (format: 'repo | npub | ...')" \
    " output-dir Directory to store output files" \
    "" \
    "Examples:" \
    " ${self} /var/lib/grasp-relay/git /var/lib/ngit-grasp/git \\" \
    " output/comparison/complete-prod-incomplete-archive.txt output/comparison" \
    "" \
    "Output:" \
    " git-ancestry.tsv - TSV with: repo, npub, relationship, details"

  exit 1
}
105
# List the branch heads stored in a git directory.
# Args: $1=git_dir
# Outputs: sorted `git show-ref --heads` lines, i.e. "commit_hash ref_name";
#          nothing when the directory is missing or show-ref reports no heads
# Returns: always 0
get_git_refs() {
  local repo_path="$1"

  # Nothing to list when the directory is absent.
  [[ -d "$repo_path" ]] || return 0

  # show-ref failures are deliberately swallowed so that, under
  # `set -e -o pipefail`, callers just see empty output.
  git --git-dir="$repo_path" show-ref --heads 2>/dev/null | sort || true
}
118
# Test whether one commit is an ancestor of another within a given repository.
# Args: $1=git_dir, $2=commit_a (candidate ancestor), $3=commit_b (descendant)
# Returns: exit status of `git merge-base --is-ancestor`: 0 when commit_a is an
#          ancestor of commit_b, non-zero otherwise. git's stderr is discarded,
#          so callers cannot tell "not an ancestor" from a git error by output.
is_ancestor() {
  local -a git_args=(
    --git-dir="$1"
    merge-base --is-ancestor
    "$2" "$3"
  )

  git "${git_args[@]}" 2>/dev/null
}
129
# Look up the commit hash recorded for one ref in show-ref style output.
# Args: $1=refs text ("commit_hash ref_name" per line), $2=ref name to find
# Outputs: hash of the first line whose ref name equals $2 exactly (literal
#          comparison, no regex or glob); nothing when the ref is absent
# Returns: always 0, so it is safe in `var=$(...)` under `set -e`
_ref_hash_for() {
  local refs="$1"
  local wanted="$2"
  local hash ref

  while read -r hash ref; do
    if [[ "$ref" == "$wanted" ]]; then
      printf '%s\n' "$hash"
      return 0
    fi
  done <<< "$refs"

  return 0
}

# Compare git data between prod and archive for a single repo
# Args: $1=prod_git_dir, $2=archive_git_dir
# Outputs: one relationship string on stdout:
#   both-empty | archive-only | prod-only | in-sync |
#   archive-ahead | prod-ahead | diverged
# Notes:
#   - Only branch heads (as returned by get_git_refs) are compared.
#   - A branch that exists only in prod is ignored (it may simply have been
#     deleted); a branch that exists only in archive counts as archive being
#     ahead.
compare_repo_git() {
  local prod_git="$1"
  local archive_git="$2"

  local prod_exists=false
  local archive_exists=false

  if [[ -d "$prod_git" ]]; then
    prod_exists=true
  fi
  if [[ -d "$archive_git" ]]; then
    archive_exists=true
  fi

  # Handle cases where one or both directories don't exist
  if [[ "$prod_exists" == "false" && "$archive_exists" == "false" ]]; then
    echo "both-empty"
    return
  fi

  if [[ "$prod_exists" == "false" ]]; then
    echo "archive-only"
    return
  fi

  if [[ "$archive_exists" == "false" ]]; then
    echo "prod-only"
    return
  fi

  # Both exist - get refs
  local prod_refs archive_refs
  prod_refs=$(get_git_refs "$prod_git")
  archive_refs=$(get_git_refs "$archive_git")

  # Handle directories that exist but have no branch heads
  if [[ -z "$prod_refs" && -z "$archive_refs" ]]; then
    echo "both-empty"
    return
  fi

  if [[ -z "$prod_refs" ]]; then
    echo "archive-only"
    return
  fi

  if [[ -z "$archive_refs" ]]; then
    echo "prod-only"
    return
  fi

  # Identical sorted ref listings mean identical branch tips
  if [[ "$prod_refs" == "$archive_refs" ]]; then
    echo "in-sync"
    return
  fi

  # Refs differ - check ancestry branch by branch.
  #   archive_ahead stays true while no common branch has prod ahead/diverged
  #   prod_ahead    stays true while no common branch has archive ahead/diverged
  local archive_ahead=true
  local prod_ahead=true
  local has_common_branch=false
  local prod_hash prod_ref archive_hash archive_ref

  # Check each prod branch against archive
  while read -r prod_hash prod_ref; do
    if [[ -z "$prod_hash" ]]; then
      continue
    fi

    # Exact-name lookup: ref names contain regex metacharacters such as '.',
    # so a pattern match could pick up a different branch.
    archive_hash=$(_ref_hash_for "$archive_refs" "$prod_ref")

    if [[ -z "$archive_hash" ]]; then
      # Branch exists in prod but not archive. It could be a deleted branch,
      # so this alone does not make prod "ahead".
      continue
    fi

    has_common_branch=true

    if [[ "$prod_hash" == "$archive_hash" ]]; then
      # Same commit - neither side is ahead for this branch
      continue
    fi

    # Different commits. Each question is asked of the repo that must hold
    # both commits if the answer is yes: an ahead archive contains prod's tip
    # in its history, and an ahead prod contains archive's tip in its history.
    # Asking the archive repo about a prod-only commit just errors out, which
    # would misreport a prod-ahead branch as diverged.
    if is_ancestor "$archive_git" "$prod_hash" "$archive_hash"; then
      # Prod tip is an ancestor of archive tip - archive is ahead
      prod_ahead=false
    elif is_ancestor "$prod_git" "$archive_hash" "$prod_hash"; then
      # Archive tip is an ancestor of prod tip - prod is ahead
      archive_ahead=false
    else
      # Neither contains the other - diverged
      archive_ahead=false
      prod_ahead=false
    fi
  done <<< "$prod_refs"

  # Branches that exist only in archive mean archive has something prod lacks
  while read -r archive_hash archive_ref; do
    if [[ -z "$archive_hash" ]]; then
      continue
    fi

    prod_hash=$(_ref_hash_for "$prod_refs" "$archive_ref")

    if [[ -z "$prod_hash" ]]; then
      prod_ahead=false
    fi
  done <<< "$archive_refs"

  # Determine final relationship
  if [[ "$has_common_branch" == "false" ]]; then
    # No common branches - completely different
    echo "diverged"
    return
  fi

  if [[ "$archive_ahead" == "true" && "$prod_ahead" == "false" ]]; then
    echo "archive-ahead"
  elif [[ "$prod_ahead" == "true" && "$archive_ahead" == "false" ]]; then
    echo "prod-ahead"
  elif [[ "$archive_ahead" == "true" && "$prod_ahead" == "true" ]]; then
    # All common branches are identical and archive has no extra branches;
    # prod may still have extra (ignored) branches.
    echo "in-sync"
  else
    echo "diverged"
  fi
}
269
# Main
# Validate the arguments, compare every repo named in the repo list and write
# one row per repo to <output-dir>/git-ancestry.tsv, then log a summary.
# Args: $1=prod git base dir, $2=archive git base dir,
#       $3=repo list file ("repo | npub | ..." per line, '#' starts a comment),
#       $4=output dir (created if missing)
# Exits: 1 when the argument count is wrong (via usage) or an input is missing
main() {
  if [[ $# -ne 4 ]]; then
    usage
  fi

  local prod_git_base="$1"
  local archive_git_base="$2"
  local repo_list="$3"
  local output_dir="$4"

  # Validate inputs
  if [[ ! -d "$prod_git_base" ]]; then
    log_error "Prod git base directory not found: $prod_git_base"
    exit 1
  fi

  if [[ ! -d "$archive_git_base" ]]; then
    log_error "Archive git base directory not found: $archive_git_base"
    exit 1
  fi

  if [[ ! -f "$repo_list" ]]; then
    log_error "Repo list file not found: $repo_list"
    exit 1
  fi

  log_info "=== Git Data Comparison ==="
  log_info "Prod git base: $prod_git_base"
  log_info "Archive git base: $archive_git_base"
  log_info "Repo list: $repo_list"
  log_info "Output: $output_dir"
  log_info "Started: $(date)"
  echo ""

  # Create output directory
  mkdir -p "$output_dir"

  # Output file
  local tsv_file="$output_dir/git-ancestry.tsv"

  # Initialize TSV with header
  printf 'repo\tnpub\trelationship\tdetails\n' > "$tsv_file"

  # Count candidate repos: lines that are neither comments nor blank.
  # grep -c prints "0" AND exits 1 when nothing matches, so the fallback must
  # replace the value rather than append a second "0" to the captured output.
  local total_repos
  total_repos=$(grep -c -v -e '^[[:space:]]*#' -e '^[[:space:]]*$' -- "$repo_list" 2>/dev/null) \
    || total_repos=0
  log_info "Processing $total_repos repos..."
  echo ""

  # Counters
  local count=0
  local count_archive_ahead=0
  local count_in_sync=0
  local count_prod_ahead=0
  local count_diverged=0
  local count_archive_only=0
  local count_prod_only=0
  local count_both_empty=0

  # Process each repo. The `|| [[ -n "$repo" ]]` keeps a final line that has
  # no trailing newline from being dropped.
  local repo npub rest
  while IFS='|' read -r repo npub rest || [[ -n "$repo" ]]; do
    # Strip all whitespace (spaces, tabs, stray CR) around the two fields
    repo="${repo//[[:space:]]/}"
    npub="${npub//[[:space:]]/}"

    # Skip blank lines, comments (including indented ones) and rows without
    # an npub field
    if [[ -z "$repo" || "$repo" == \#* || -z "$npub" ]]; then
      continue
    fi

    count=$((count + 1))

    # Build git paths
    local prod_git="$prod_git_base/${npub}/${repo}.git"
    local archive_git="$archive_git_base/${npub}/${repo}.git"

    # Compare. `details` is the fourth TSV column; nothing fills it in yet,
    # so it is always written empty.
    local relationship details=""
    relationship=$(compare_repo_git "$prod_git" "$archive_git")

    # Count by relationship
    case "$relationship" in
      archive-ahead) count_archive_ahead=$((count_archive_ahead + 1)) ;;
      in-sync) count_in_sync=$((count_in_sync + 1)) ;;
      prod-ahead) count_prod_ahead=$((count_prod_ahead + 1)) ;;
      diverged) count_diverged=$((count_diverged + 1)) ;;
      archive-only) count_archive_only=$((count_archive_only + 1)) ;;
      prod-only) count_prod_only=$((count_prod_only + 1)) ;;
      both-empty) count_both_empty=$((count_both_empty + 1)) ;;
      *)
        # Keep the row, but make an uncounted value visible in the log
        log_warn "Unexpected relationship '$relationship' for $npub/$repo"
        ;;
    esac

    # Output TSV line
    printf '%s\t%s\t%s\t%s\n' "$repo" "$npub" "$relationship" "$details" >> "$tsv_file"

    # Progress indicator every 10 repos
    if [[ $((count % 10)) -eq 0 ]]; then
      log_progress "Processed $count/$total_repos repos..."
    fi
  done < "$repo_list"

  # Clear progress line
  echo "" >&2

  # Summary
  echo ""
  log_info "=== Comparison Summary ==="
  log_success "Archive ahead (use archive data): $count_archive_ahead"
  log_success "In sync: $count_in_sync"
  log_warn "Prod ahead (needs re-sync): $count_prod_ahead"
  log_error "Diverged (manual review): $count_diverged"
  log_info "Archive only: $count_archive_only"
  log_info "Prod only: $count_prod_only"
  log_info "Both empty: $count_both_empty"
  echo ""
  log_info "Total: $count repos"
  log_info "Output: $tsv_file"
}
389
390main "$@"