1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
|
#!/usr/bin/env bash
#
# 22-compare-git-data.sh - Compare actual git data between prod and archive relays
#
# PHASE 3c of the GRASP relay to ngit-grasp migration analysis pipeline.
# Compares actual git commits between prod and archive to determine which is ahead.
#
# KEY INSIGHT:
# Archive (ngit-grasp) enforces GRASP - git data ALWAYS matches a state event.
# If archive has different/newer data than prod, it means:
# - A state event authorized those commits at some point
# - Archive is actually MORE up-to-date than prod
# - Migration should use archive data (it's already correct)
#
# USAGE:
# ./22-compare-git-data.sh <prod-git-base> <archive-git-base> <repo-list> <output-dir>
#
# EXAMPLES:
# ./22-compare-git-data.sh /var/lib/grasp-relay/git /var/lib/ngit-grasp/git \
# output/comparison/complete-prod-incomplete-archive.txt output/comparison
#
# INPUT:
# prod-git-base Base directory for prod git repos (e.g., /var/lib/grasp-relay/git)
# archive-git-base Base directory for archive git repos (e.g., /var/lib/ngit-grasp/git)
# repo-list File with repos to compare (format: "repo | npub | ...")
#
# OUTPUT:
# <output-dir>/git-ancestry.tsv - Tab-separated values:
# repo<TAB>npub<TAB>relationship<TAB>details
#
# Relationship values:
# archive-ahead - Archive has all prod commits plus more (GOOD - use archive)
# in-sync - Both have identical commits
# prod-ahead - Prod has commits archive is missing (needs re-sync)
# diverged - Both have unique commits (manual review)
# archive-only - Only archive has git data
# prod-only - Only prod has git data
# both-empty - Neither has git data
#
# PREREQUISITES:
# - git (for ref comparison)
# - Read access to both git directories (may need sudo)
#
# RUNTIME: Depends on number of repos to compare
#
# SEE ALSO:
# docs/how-to/migrate-to-ngit-grasp.md - Full migration guide
# 21-compare-relays.sh - Phase 3b script that identifies repos to compare
#
set -euo pipefail
# Colors for output (disabled if not a terminal)
if [[ -t 1 ]]; then
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m'
else
RED=''
GREEN=''
YELLOW=''
BLUE=''
NC=''
fi
log_info() {
echo -e "${BLUE}[INFO]${NC} $*" >&2
}
log_success() {
echo -e "${GREEN}[OK]${NC} $*" >&2
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $*" >&2
}
log_error() {
echo -e "${RED}[ERROR]${NC} $*" >&2
}
log_progress() {
echo -ne "\r${BLUE}[PROGRESS]${NC} $*" >&2
}
usage() {
echo "Usage: $0 <prod-git-base> <archive-git-base> <repo-list> <output-dir>"
echo ""
echo "Arguments:"
echo " prod-git-base Base directory for prod git repos"
echo " archive-git-base Base directory for archive git repos"
echo " repo-list File with repos to compare (format: 'repo | npub | ...')"
echo " output-dir Directory to store output files"
echo ""
echo "Examples:"
echo " $0 /var/lib/grasp-relay/git /var/lib/ngit-grasp/git \\"
echo " output/comparison/complete-prod-incomplete-archive.txt output/comparison"
echo ""
echo "Output:"
echo " git-ancestry.tsv - TSV with: repo, npub, relationship, details"
exit 1
}
# Get all branch refs from a git directory
# Args: $1=git_dir
# Returns: sorted list of "ref_name commit_hash" lines
get_git_refs() {
local git_dir="$1"
if [[ ! -d "$git_dir" ]]; then
return
fi
git --git-dir="$git_dir" show-ref --heads 2>/dev/null | sort || true
}
# Check if commit A is ancestor of commit B
# Args: $1=git_dir, $2=commit_a, $3=commit_b
# Returns: 0 if A is ancestor of B, 1 otherwise
is_ancestor() {
local git_dir="$1"
local commit_a="$2"
local commit_b="$3"
git --git-dir="$git_dir" merge-base --is-ancestor "$commit_a" "$commit_b" 2>/dev/null
}
# Compare git data between prod and archive for a single repo
# Args: $1=prod_git_dir, $2=archive_git_dir
# Returns: relationship string
compare_repo_git() {
local prod_git="$1"
local archive_git="$2"
local prod_exists=false
local archive_exists=false
[[ -d "$prod_git" ]] && prod_exists=true
[[ -d "$archive_git" ]] && archive_exists=true
# Handle cases where one or both don't exist
if [[ "$prod_exists" == "false" && "$archive_exists" == "false" ]]; then
echo "both-empty"
return
fi
if [[ "$prod_exists" == "false" ]]; then
echo "archive-only"
return
fi
if [[ "$archive_exists" == "false" ]]; then
echo "prod-only"
return
fi
# Both exist - get refs
local prod_refs archive_refs
prod_refs=$(get_git_refs "$prod_git")
archive_refs=$(get_git_refs "$archive_git")
# Handle empty refs
if [[ -z "$prod_refs" && -z "$archive_refs" ]]; then
echo "both-empty"
return
fi
if [[ -z "$prod_refs" ]]; then
echo "archive-only"
return
fi
if [[ -z "$archive_refs" ]]; then
echo "prod-only"
return
fi
# Compare refs - check if they're identical
if [[ "$prod_refs" == "$archive_refs" ]]; then
echo "in-sync"
return
fi
# Refs differ - need to check ancestry
# Strategy: For each branch, check if one is ancestor of the other
# If all archive branches are ahead of or equal to prod branches, archive is ahead
# If all prod branches are ahead of or equal to archive branches, prod is ahead
# Otherwise, they've diverged
local archive_ahead=true
local prod_ahead=true
local has_common_branch=false
# Create temporary file to use archive as reference repo for ancestry checks
# We need a repo that has both sets of commits to check ancestry
# Use archive since it's the target and should have the superset
# Check each prod branch against archive
while read -r prod_hash prod_ref; do
[[ -z "$prod_hash" ]] && continue
# Get the same branch from archive
local archive_hash
archive_hash=$(echo "$archive_refs" | grep " $prod_ref$" | awk '{print $1}' || echo "")
if [[ -z "$archive_hash" ]]; then
# Branch exists in prod but not archive - prod has something archive doesn't
# But this could be a deleted branch, so don't immediately say prod is ahead
continue
fi
has_common_branch=true
if [[ "$prod_hash" == "$archive_hash" ]]; then
# Same commit - neither ahead for this branch
continue
fi
# Different commits - check ancestry
# First, try to check if prod is ancestor of archive (archive ahead)
if is_ancestor "$archive_git" "$prod_hash" "$archive_hash" 2>/dev/null; then
# Prod commit is ancestor of archive commit - archive is ahead for this branch
prod_ahead=false
elif is_ancestor "$archive_git" "$archive_hash" "$prod_hash" 2>/dev/null; then
# Archive commit is ancestor of prod commit - prod is ahead for this branch
archive_ahead=false
else
# Neither is ancestor - diverged
archive_ahead=false
prod_ahead=false
fi
done <<< "$prod_refs"
# Also check for branches only in archive (archive has extra branches)
while read -r archive_hash archive_ref; do
[[ -z "$archive_hash" ]] && continue
local prod_hash
prod_hash=$(echo "$prod_refs" | grep " $archive_ref$" | awk '{print $1}' || echo "")
if [[ -z "$prod_hash" ]]; then
# Branch exists in archive but not prod - archive has something prod doesn't
# This means archive is ahead (has extra branches)
prod_ahead=false
fi
done <<< "$archive_refs"
# Determine final relationship
if [[ "$has_common_branch" == "false" ]]; then
# No common branches - completely different
echo "diverged"
return
fi
if [[ "$archive_ahead" == "true" && "$prod_ahead" == "false" ]]; then
echo "archive-ahead"
elif [[ "$prod_ahead" == "true" && "$archive_ahead" == "false" ]]; then
echo "prod-ahead"
elif [[ "$archive_ahead" == "true" && "$prod_ahead" == "true" ]]; then
# Both true means all common branches are identical
# But one might have extra branches
echo "in-sync"
else
echo "diverged"
fi
}
# Main
main() {
if [[ $# -ne 4 ]]; then
usage
fi
local prod_git_base="$1"
local archive_git_base="$2"
local repo_list="$3"
local output_dir="$4"
# Validate inputs
if [[ ! -d "$prod_git_base" ]]; then
log_error "Prod git base directory not found: $prod_git_base"
exit 1
fi
if [[ ! -d "$archive_git_base" ]]; then
log_error "Archive git base directory not found: $archive_git_base"
exit 1
fi
if [[ ! -f "$repo_list" ]]; then
log_error "Repo list file not found: $repo_list"
exit 1
fi
log_info "=== Git Data Comparison ==="
log_info "Prod git base: $prod_git_base"
log_info "Archive git base: $archive_git_base"
log_info "Repo list: $repo_list"
log_info "Output: $output_dir"
log_info "Started: $(date)"
echo ""
# Create output directory
mkdir -p "$output_dir"
# Output file
local tsv_file="$output_dir/git-ancestry.tsv"
# Initialize TSV with header
echo -e "repo\tnpub\trelationship\tdetails" > "$tsv_file"
# Count repos
local total_repos
total_repos=$(grep -c -v '^#' "$repo_list" 2>/dev/null || echo "0")
log_info "Processing $total_repos repos..."
echo ""
# Counters
local count=0
local count_archive_ahead=0
local count_in_sync=0
local count_prod_ahead=0
local count_diverged=0
local count_archive_only=0
local count_prod_only=0
local count_both_empty=0
# Process each repo
while IFS='|' read -r repo npub rest || [[ -n "$repo" ]]; do
# Skip comments and empty lines
[[ "$repo" =~ ^# ]] && continue
[[ -z "$repo" ]] && continue
# Clean up whitespace
repo="${repo// /}"
npub="${npub// /}"
[[ -z "$repo" || -z "$npub" ]] && continue
count=$((count + 1))
# Build git paths
local prod_git="$prod_git_base/${npub}/${repo}.git"
local archive_git="$archive_git_base/${npub}/${repo}.git"
# Compare
local relationship details=""
relationship=$(compare_repo_git "$prod_git" "$archive_git")
# Count by relationship
case "$relationship" in
archive-ahead) count_archive_ahead=$((count_archive_ahead + 1)) ;;
in-sync) count_in_sync=$((count_in_sync + 1)) ;;
prod-ahead) count_prod_ahead=$((count_prod_ahead + 1)) ;;
diverged) count_diverged=$((count_diverged + 1)) ;;
archive-only) count_archive_only=$((count_archive_only + 1)) ;;
prod-only) count_prod_only=$((count_prod_only + 1)) ;;
both-empty) count_both_empty=$((count_both_empty + 1)) ;;
esac
# Output TSV line
printf '%s\t%s\t%s\t%s\n' "$repo" "$npub" "$relationship" "$details" >> "$tsv_file"
# Progress indicator every 10 repos
if [[ $((count % 10)) -eq 0 ]]; then
log_progress "Processed $count/$total_repos repos..."
fi
done < "$repo_list"
# Clear progress line
echo "" >&2
# Summary
echo ""
log_info "=== Comparison Summary ==="
log_success "Archive ahead (use archive data): $count_archive_ahead"
log_success "In sync: $count_in_sync"
log_warn "Prod ahead (needs re-sync): $count_prod_ahead"
log_error "Diverged (manual review): $count_diverged"
log_info "Archive only: $count_archive_only"
log_info "Prod only: $count_prod_only"
log_info "Both empty: $count_both_empty"
echo ""
log_info "Total: $count repos"
log_info "Output: $tsv_file"
}
main "$@"
|