upleb.uk

Public git repos — served from a NIP-34 GRASP relay at git.upleb.uk

summary | refs | log | tree | commit | diff
path: root/docs/how-to/migration-scripts/30-extract-parse-failures.sh
diff options
context:
space:
mode:
Diffstat (limited to 'docs/how-to/migration-scripts/30-extract-parse-failures.sh')
-rwxr-xr-x docs/how-to/migration-scripts/30-extract-parse-failures.sh 178
1 files changed, 175 insertions, 3 deletions
diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
index 7870c61..f821834 100755
--- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh
+++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
@@ -125,18 +125,25 @@ usage() {
125 echo " output-dir Directory to store extracted log data" 125 echo " output-dir Directory to store extracted log data"
126 echo "" 126 echo ""
127 echo "Options:" 127 echo "Options:"
128 echo " --since <date> Start date (default: 30 days ago)" 128 echo " --since <date> Start date (default: 30 days ago)"
129 echo " --until <date> End date (default: now)" 129 echo " --until <date> End date (default: now)"
130 echo " --dry-run Show what would be extracted without writing" 130 echo " --analysis-root <dir> Filter to only missing announcements from analysis"
131 echo " --dry-run Show what would be extracted without writing"
131 echo "" 132 echo ""
132 echo "Examples:" 133 echo "Examples:"
133 echo " $0 ngit-grasp.service output/logs" 134 echo " $0 ngit-grasp.service output/logs"
134 echo " $0 ngit-grasp.service output/logs --since '2026-01-01'" 135 echo " $0 ngit-grasp.service output/logs --since '2026-01-01'"
135 echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'" 136 echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'"
137 echo " $0 ngit-grasp.service output/logs --analysis-root /tmp/migration-analysis-20260123"
136 echo "" 138 echo ""
137 echo "Expected log formats:" 139 echo "Expected log formats:"
138 echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..." 140 echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..."
139 echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..." 141 echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..."
142 echo ""
143 echo "Filtering with --analysis-root:"
144 echo " When provided, only parse failures for announcements that are in production"
145 echo " but missing from the archive will be included. This filters out rejections"
146 echo " for events from other relays that don't affect the migration."
140 exit 1 147 exit 1
141} 148}
142 149
@@ -204,6 +211,155 @@ parse_write_policy_rejection_line() {
204# the same event to be counted twice. Write policy logs contain the same 211# the same event to be counted twice. Write policy logs contain the same
205# events, so we don't lose any data by only extracting from that source. 212# events, so we don't lose any data by only extracting from that source.
206 213
# Filter parse failures to only those for missing announcements.
# Used when --analysis-root is provided, to scope results to the migration.
#
# Arguments:
#   $1 - parse failures file to filter (tab-separated, event ID in column 1;
#        rewritten in place)
#   $2 - analysis root directory containing comparison/ and prod/ subdirs
#
# Returns:
#   Always 0. If a required input file or jq is missing, or the production
#   announcements cannot be parsed, a warning is logged and the parse failures
#   file is left untouched (best effort: no filtering rather than no output).
#
# The function:
#   1. Reads the identifiers (first '|'-separated column) of the missing
#      announcements from comparison/complete-prod-missing-archive.txt
#   2. Reads production announcements (NDJSON) from prod/raw/announcements.json
#   3. Collects the event IDs of kind-30617 announcements whose 'd' tag matches
#      one of those identifiers
#   4. Keeps only the parse failure rows whose event ID is in that set
#
# Matching is on identifier only: the comparison file carries npubs (bech32)
# while the announcements carry hex pubkeys, and converting between the two
# needs an extra tool. The same identifier published by a different pubkey can
# therefore slip through as a false positive.
filter_to_missing_announcements() {
    local parse_failures_file="$1"
    local analysis_root="$2"

    local missing_file="$analysis_root/comparison/complete-prod-missing-archive.txt"
    local prod_announcements="$analysis_root/prod/raw/announcements.json"

    # Validate required files exist
    if [[ ! -f "$missing_file" ]]; then
        log_warn "Missing announcements file not found: $missing_file"
        log_warn "Skipping filter - all parse failures will be included"
        return 0
    fi

    if [[ ! -f "$prod_announcements" ]]; then
        log_warn "Production announcements file not found: $prod_announcements"
        log_warn "Skipping filter - all parse failures will be included"
        return 0
    fi

    # Check if jq is available
    if ! command -v jq &> /dev/null; then
        log_warn "jq not found - cannot filter parse failures"
        log_warn "Install jq or run without --analysis-root"
        return 0
    fi

    log_info "Filtering parse failures to missing announcements only..."

    # All intermediate files live in one private directory so that every exit
    # path can clean up with a single rm.
    local work_dir
    if ! work_dir=$(mktemp -d); then
        log_warn "Could not create temporary directory - skipping filter"
        return 0
    fi

    # Step 1: Extract identifiers of the missing announcements
    # Line format: identifier | npub | prod=complete | archive=missing
    # Split on '|' only and trim, so identifiers containing spaces survive.
    awk -F'|' '{
        id = $1
        gsub(/^[ \t]+|[ \t]+$/, "", id)
        if (id != "") print id
    }' "$missing_file" > "$work_dir/identifiers.all"

    local missing_count
    missing_count=$(wc -l < "$work_dir/identifiers.all")
    missing_count="${missing_count//[^0-9]/}"
    log_info "  Found $missing_count missing announcements to filter for"

    sort -u "$work_dir/identifiers.all" > "$work_dir/identifiers"

    # Step 2: Get event IDs from production announcements for these identifiers
    # The JSON is NDJSON (newline-delimited), not an array; each event has
    # id, pubkey (hex) and tags (array containing ["d", identifier]).
    # Output format: event_id<TAB>identifier
    log_info "  Extracting event IDs from production announcements..."

    # jq's own diagnostics are suppressed; a failure is reported via log_warn.
    if ! jq -r 'select(.kind == 30617) |
            .id as $id |
            (.tags[] | select(.[0] == "d") | .[1]) as $dtag |
            "\($id)\t\($dtag)"' "$prod_announcements" \
            > "$work_dir/announcements.tsv" 2>/dev/null; then
        log_warn "Failed to parse production announcements JSON"
        rm -rf -- "${work_dir:?}"
        return 0
    fi

    # Single pass: load the wanted identifiers, then emit matching event IDs.
    # FILENAME (rather than NR == FNR) keeps this correct for an empty list.
    awk -F'\t' '
        FILENAME == ARGV[1] { wanted[$0] = 1; next }
        ($2 in wanted) { print $1 }
    ' "$work_dir/identifiers" "$work_dir/announcements.tsv" \
        | sort -u > "$work_dir/event_ids"

    local event_id_count
    event_id_count=$(wc -l < "$work_dir/event_ids")
    event_id_count="${event_id_count//[^0-9]/}"
    log_info "  Found $event_id_count event IDs for missing announcements"

    # Step 3: Filter parse failures to only those event IDs
    # Count the original rows BEFORE the file is replaced, so the summary
    # below reports the real "from" figure.
    local original_count
    original_count=$(grep -vc '^#' "$parse_failures_file" || true)
    original_count="${original_count:-0}"

    local filtered_file="$work_dir/filtered"
    {
        # Copy header lines (grep exits 1 when there are none; not an error)
        grep '^#' "$parse_failures_file" || true

        # Add a note about filtering
        echo "# Filtered to missing announcements only (--analysis-root)"
        echo "# Analysis root: $analysis_root"
        echo "# Missing announcements: $missing_count"
        echo "# Matching event IDs: $event_id_count"

        # Data rows are passed through verbatim so empty columns keep their
        # position (splitting on tab with `read` would collapse them).
        awk -F'\t' '
            FILENAME == ARGV[1] { wanted[$0] = 1; next }
            /^#/ { next }
            ($1 != "") && ($1 in wanted)
        ' "$work_dir/event_ids" "$parse_failures_file"
    } > "$filtered_file"

    local filtered_count
    filtered_count=$(grep -vc '^#' "$filtered_file" || true)
    filtered_count="${filtered_count:-0}"

    # Replace original with filtered version
    mv "$filtered_file" "$parse_failures_file"

    # Cleanup temp files
    rm -rf -- "${work_dir:?}"

    log_info "  Filtered from $original_count to $filtered_count parse failures"
    log_success "Filtered to parse failures for missing announcements only"
}
362
207# Main 363# Main
208main() { 364main() {
209 if [[ $# -lt 2 ]]; then 365 if [[ $# -lt 2 ]]; then
@@ -219,6 +375,7 @@ main() {
219 since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "") 375 since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "")
220 local until_date="" 376 local until_date=""
221 local dry_run=false 377 local dry_run=false
378 local analysis_root=""
222 379
223 # Parse options 380 # Parse options
224 while [[ $# -gt 0 ]]; do 381 while [[ $# -gt 0 ]]; do
@@ -231,6 +388,10 @@ main() {
231 until_date="$2" 388 until_date="$2"
232 shift 2 389 shift 2
233 ;; 390 ;;
391 --analysis-root)
392 analysis_root="$2"
393 shift 2
394 ;;
234 --dry-run) 395 --dry-run)
235 dry_run=true 396 dry_run=true
236 shift 397 shift
@@ -469,6 +630,11 @@ main() {
469 grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file" 630 grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file"
470 mv "$deduped_file" "$output_file" 631 mv "$deduped_file" "$output_file"
471 632
633 # Filter to missing announcements only if analysis root provided
634 if [[ -n "$analysis_root" ]]; then
635 filter_to_missing_announcements "$output_file" "$analysis_root"
636 fi
637
472 # Count final entries (excluding header lines) 638 # Count final entries (excluding header lines)
473 local count 639 local count
474 count=$(grep -v '^#' "$output_file" | wc -l) 640 count=$(grep -v '^#' "$output_file" | wc -l)
@@ -482,9 +648,15 @@ main() {
482 log_info "=== Extraction Summary ===" 648 log_info "=== Extraction Summary ==="
483 log_info "Service: $service" 649 log_info "Service: $service"
484 log_info "Time range: ${since_date:-beginning} to ${until_date:-now}" 650 log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"
651 if [[ -n "$analysis_root" ]]; then
652 log_info "Filtered to: missing announcements only"
653 fi
485 log_success "Extracted $count total entries" 654 log_success "Extracted $count total entries"
486 log_info " - [PARSE_FAIL] entries: $parse_fail_count" 655 log_info " - [PARSE_FAIL] entries: $parse_fail_count"
487 log_info " - Invalid announcement rejections: $invalid_announcement_count" 656 log_info " - Invalid announcement rejections: $invalid_announcement_count"
657 if [[ -n "$analysis_root" ]]; then
658 log_info " (filtered from original extraction)"
659 fi
488 echo "" 660 echo ""
489 log_info "Output file: $output_file" 661 log_info "Output file: $output_file"
490 662