upleb.uk

Public git repos — served from a NIP-34 GRASP relay at git.upleb.uk

summary | refs | log | tree | commit | diff
path: root/docs/how-to/migration-scripts/30-extract-parse-failures.sh
diff options
context:
space:
mode:
author DanConwayDev <DanConwayDev@protonmail.com> 2026-01-27 11:17:59 +0000
committer DanConwayDev <DanConwayDev@protonmail.com> 2026-01-27 20:38:21 +0000
commit 49b0df788255848173c01db394a2df29b7c08576 (patch)
tree c67c1f6b1e6b11e8d3b58393f3d429948b8849d4 /docs/how-to/migration-scripts/30-extract-parse-failures.sh
parent ddcba2b350615e6d6ad7028b570206efb42f0338 (diff)
refactor: simplify parse failure enrichment using log entries and nak
Remove --analysis-root flag and external data file dependencies. The script now extracts repo/npub information directly from 'Added rejected announcement' log entries (which include pubkey and identifier fields) and uses `nak encode npub <hex-pubkey>` to convert hex pubkeys to npub format. This simplification was enabled by the recent logging improvement that added pubkey to the 'Added rejected announcement' log entries.
Diffstat (limited to 'docs/how-to/migration-scripts/30-extract-parse-failures.sh')
-rwxr-xr-x docs/how-to/migration-scripts/30-extract-parse-failures.sh 332
1 files changed, 94 insertions, 238 deletions
diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
index f86e9f8..d7f9706 100755
--- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh
+++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
@@ -24,6 +24,12 @@
24# --until <date> End date for log extraction (default: now) 24# --until <date> End date for log extraction (default: now)
25# --dry-run Show what would be extracted without writing files 25# --dry-run Show what would be extracted without writing files
26# 26#
27# ENRICHMENT:
28# The script automatically enriches parse failures with repo/npub information
29# by extracting from "Added rejected announcement" log entries which include
30# pubkey and identifier fields. Hex pubkeys are converted to npub format using
31# `nak encode npub <hex-pubkey>` if the nak tool is available.
32#
27# OUTPUT: 33# OUTPUT:
28# <output-dir>/parse-failures.txt 34# <output-dir>/parse-failures.txt
29# 35#
@@ -31,7 +37,7 @@
31# event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub 37# event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
32# 38#
33# EXPECTED LOG FORMATS: 39# EXPECTED LOG FORMATS:
34# The script looks for two types of log entries: 40# The script looks for three types of log entries:
35# 41#
36# 1. Structured [PARSE_FAIL] entries: 42# 1. Structured [PARSE_FAIL] entries:
37# 2026-01-22T10:30:45Z ngit-grasp[1234]: [PARSE_FAIL] kind=30618 event_id=abc123... reason="invalid refs format" repo=myrepo npub=npub1... 43# 2026-01-22T10:30:45Z ngit-grasp[1234]: [PARSE_FAIL] kind=30618 event_id=abc123... reason="invalid refs format" repo=myrepo npub=npub1...
@@ -39,13 +45,17 @@
39# 2. "Invalid announcement" rejections (write policy): 45# 2. "Invalid announcement" rejections (write policy):
40# Event rejected by write policy event_id=abc123... relay=wss://... kind=30617 reason=Invalid announcement: multiple clone tags found... 46# Event rejected by write policy event_id=abc123... relay=wss://... kind=30617 reason=Invalid announcement: multiple clone tags found...
41# 47#
48# 3. "Added rejected announcement" entries (for enrichment):
49# Added rejected announcement to two-tier index event_id=abc123... kind=30617 identifier=myrepo pubkey=hex...
50# These entries provide pubkey and identifier for enriching write policy rejections.
51#
42# NOTE: Builder logs ("Rejected repository announcement note1xxx:") are NOT extracted 52# NOTE: Builder logs ("Rejected repository announcement note1xxx:") are NOT extracted
43# because they use bech32 (note1) IDs while write policy logs use hex IDs. Extracting 53# because they use bech32 (note1) IDs while write policy logs use hex IDs. Extracting
44# both would cause double-counting since deduplication only works within each format. 54# both would cause double-counting since deduplication only works within each format.
45# Write policy logs contain the same events, so we don't lose any data. 55# Write policy logs contain the same events, so we don't lose any data.
46# 56#
47# Required fields: kind, event_id, reason 57# Required fields: kind, event_id, reason
48# Optional fields: repo, npub (may not be available for all entry types) 58# Enrichment fields: repo (identifier), npub (converted from hex pubkey)
49# 59#
50# DEPENDENCY: 60# DEPENDENCY:
51# This script requires logging improvements in ngit-grasp to emit structured 61# This script requires logging improvements in ngit-grasp to emit structured
@@ -127,23 +137,21 @@ usage() {
127 echo "Options:" 137 echo "Options:"
128 echo " --since <date> Start date (default: 30 days ago)" 138 echo " --since <date> Start date (default: 30 days ago)"
129 echo " --until <date> End date (default: now)" 139 echo " --until <date> End date (default: now)"
130 echo " --analysis-root <dir> Filter to only missing announcements from analysis"
131 echo " --dry-run Show what would be extracted without writing" 140 echo " --dry-run Show what would be extracted without writing"
132 echo "" 141 echo ""
133 echo "Examples:" 142 echo "Examples:"
134 echo " $0 ngit-grasp.service output/logs" 143 echo " $0 ngit-grasp.service output/logs"
135 echo " $0 ngit-grasp.service output/logs --since '2026-01-01'" 144 echo " $0 ngit-grasp.service output/logs --since '2026-01-01'"
136 echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'" 145 echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'"
137 echo " $0 ngit-grasp.service output/logs --analysis-root /tmp/migration-analysis-20260123"
138 echo "" 146 echo ""
139 echo "Expected log formats:" 147 echo "Expected log formats:"
140 echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..." 148 echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..."
141 echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..." 149 echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..."
142 echo "" 150 echo ""
143 echo "Filtering with --analysis-root:" 151 echo "Enrichment:"
144 echo " When provided, only parse failures for announcements that are in production" 152 echo " Parse failures are automatically enriched with repo/npub from"
145 echo " but missing from the archive will be included. This filters out rejections" 153 echo " 'Added rejected announcement' log entries. Hex pubkeys are converted"
146 echo " for events from other relays that don't affect the migration." 154 echo " to npub format using 'nak encode npub' if available."
147 exit 1 155 exit 1
148} 156}
149 157
@@ -211,96 +219,52 @@ parse_write_policy_rejection_line() {
211# the same event to be counted twice. Write policy logs contain the same 219# the same event to be counted twice. Write policy logs contain the same
212# events, so we don't lose any data by only extracting from that source. 220# events, so we don't lose any data by only extracting from that source.
213 221
214# Enrich parse failures with repo/npub by looking up event_id in announcements.json 222# Enrich parse failures with repo/npub by looking up event_id in "Added rejected announcement" log entries
215# This is critical because "Invalid announcement" rejections only log event_id and kind, 223# This is critical because "Invalid announcement" rejections only log event_id and kind,
216# not the repo name or npub. Without enrichment, Phase 5 shows event_id|kind instead 224# not the repo name or npub. Without enrichment, Phase 5 shows event_id|kind instead
217# of repo|npub in action-required.txt, making the output unusable. 225# of repo|npub in action-required.txt, making the output unusable.
218# 226#
219# Arguments: 227# Arguments:
220# $1 - parse failures file to enrich (modified in place) 228# $1 - parse failures file to enrich (modified in place)
221# $2 - analysis root directory containing prod/raw/announcements.json 229# $2 - lookup file containing event_id -> identifier|pubkey mappings from logs
222# 230#
223# The function: 231# The function:
224# 1. Builds a lookup table from announcements.json: event_id -> repo|npub 232# 1. Uses the lookup table built from "Added rejected announcement" log entries
225# 2. For each parse failure with empty repo/npub, looks up the event_id 233# 2. For each parse failure with empty repo/npub, looks up the event_id
226# 3. Populates repo and npub columns from the lookup 234# 3. Populates repo and npub columns from the lookup
235# 4. Converts hex pubkeys to npub format using `nak encode npub` if available
227enrich_with_repo_npub() { 236enrich_with_repo_npub() {
228 local parse_failures_file="$1" 237 local parse_failures_file="$1"
229 local analysis_root="$2" 238 local lookup_file="$2"
230
231 local prod_announcements="$analysis_root/prod/raw/announcements.json"
232
233 # Validate required file exists
234 if [[ ! -f "$prod_announcements" ]]; then
235 log_warn "Production announcements file not found: $prod_announcements"
236 log_warn "Skipping enrichment - repo/npub columns will remain empty"
237 return 0
238 fi
239 239
240 # Check if jq is available 240 # Validate lookup file exists and has content
241 if ! command -v jq &> /dev/null; then 241 if [[ ! -f "$lookup_file" ]] || [[ ! -s "$lookup_file" ]]; then
242 log_warn "jq not found - cannot enrich parse failures with repo/npub" 242 log_warn "No enrichment data available - repo/npub columns will remain empty"
243 log_warn "Install jq or run without --analysis-root"
244 return 0 243 return 0
245 fi 244 fi
246 245
247 log_info "Enriching parse failures with repo/npub from announcements..." 246 log_info "Enriching parse failures with repo/npub from log entries..."
248
249 # Step 1: Build lookup table from announcements.json
250 # Output format: event_id<TAB>repo<TAB>npub
251 local lookup_file
252 lookup_file=$(mktemp)
253
254 # Extract id, d-tag (repo identifier), and pubkey from announcements
255 # Convert pubkey to npub using bech32 encoding
256 # Note: We use a simple hex-to-npub conversion via external tool if available,
257 # otherwise we'll use the hex pubkey (Phase 5 can still match on it)
258 log_info " Building event_id -> repo/npub lookup table..."
259
260 # First, extract the raw data: id, d-tag, pubkey (hex)
261 jq -r 'select(.kind == 30617) |
262 .id as $id |
263 .pubkey as $pubkey |
264 ((.tags[] | select(.[0] == "d") | .[1]) // "") as $dtag |
265 "\($id)\t\($dtag)\t\($pubkey)"' "$prod_announcements" > "$lookup_file.raw" 2>/dev/null || {
266 log_warn "Failed to parse production announcements JSON"
267 rm -f "$lookup_file" "$lookup_file.raw"
268 return 0
269 }
270 247
271 # Convert hex pubkeys to npub format 248 # Check if we have nak for pubkey->npub conversion
272 # Check if we have a tool to do bech32 encoding (nak, nostr-tool, etc.)
273 local can_convert_npub=false 249 local can_convert_npub=false
274 if command -v nak &> /dev/null; then 250 if command -v nak &> /dev/null; then
275 can_convert_npub=true 251 can_convert_npub=true
276 log_info " Using 'nak' for pubkey->npub conversion" 252 log_info " Using 'nak' for pubkey->npub conversion"
253 else
254 log_warn " 'nak' not found - will use hex pubkeys instead of npub"
277 fi 255 fi
278 256
279 # Process the lookup file, converting pubkeys to npubs if possible
280 while IFS=$'\t' read -r event_id repo pubkey_hex; do
281 local npub
282 if [[ "$can_convert_npub" == true && -n "$pubkey_hex" ]]; then
283 # Use nak to encode pubkey as npub
284 npub=$(nak encode npub "$pubkey_hex" 2>/dev/null || echo "")
285 fi
286 # Fall back to hex pubkey if conversion failed
287 [[ -z "$npub" ]] && npub="$pubkey_hex"
288 printf '%s\t%s\t%s\n' "$event_id" "$repo" "$npub"
289 done < "$lookup_file.raw" > "$lookup_file"
290
291 rm -f "$lookup_file.raw"
292
293 local lookup_count 257 local lookup_count
294 lookup_count=$(wc -l < "$lookup_file") 258 lookup_count=$(wc -l < "$lookup_file")
295 lookup_count="${lookup_count//[^0-9]/}" 259 lookup_count="${lookup_count//[^0-9]/}"
296 log_info " Built lookup table with $lookup_count announcements" 260 log_info " Lookup table has $lookup_count entries"
297 261
298 # Step 2: Enrich parse failures 262 # Enrich parse failures
299 local enriched_file 263 local enriched_file
300 enriched_file=$(mktemp) 264 enriched_file=$(mktemp)
301 265
302 # Copy header lines 266 # Copy header lines
303 grep '^#' "$parse_failures_file" > "$enriched_file" 267 grep '^#' "$parse_failures_file" > "$enriched_file" 2>/dev/null || true
304 268
305 # Process data lines 269 # Process data lines
306 local enriched_count=0 270 local enriched_count=0
@@ -317,14 +281,21 @@ enrich_with_repo_npub() {
317 continue 281 continue
318 fi 282 fi
319 283
320 # Look up event_id in our table 284 # Look up event_id in our table (format: event_id<TAB>identifier<TAB>pubkey_hex)
321 local lookup_result 285 local lookup_result
322 lookup_result=$(grep "^${event_id}"$'\t' "$lookup_file" 2>/dev/null | head -1 || echo "") 286 lookup_result=$(grep "^${event_id}"$'\t' "$lookup_file" 2>/dev/null | head -1 || echo "")
323 287
324 if [[ -n "$lookup_result" ]]; then 288 if [[ -n "$lookup_result" ]]; then
325 local looked_up_repo looked_up_npub 289 local looked_up_repo looked_up_pubkey_hex looked_up_npub
326 looked_up_repo=$(echo "$lookup_result" | cut -f2) 290 looked_up_repo=$(echo "$lookup_result" | cut -f2)
327 looked_up_npub=$(echo "$lookup_result" | cut -f3) 291 looked_up_pubkey_hex=$(echo "$lookup_result" | cut -f3)
292
293 # Convert hex pubkey to npub if nak is available
294 if [[ "$can_convert_npub" == true && -n "$looked_up_pubkey_hex" ]]; then
295 looked_up_npub=$(nak encode npub "$looked_up_pubkey_hex" 2>/dev/null || echo "$looked_up_pubkey_hex")
296 else
297 looked_up_npub="$looked_up_pubkey_hex"
298 fi
328 299
329 # Use looked-up values if original was empty 300 # Use looked-up values if original was empty
330 [[ -z "$repo" ]] && repo="$looked_up_repo" 301 [[ -z "$repo" ]] && repo="$looked_up_repo"
@@ -338,160 +309,31 @@ enrich_with_repo_npub() {
338 # Replace original with enriched version 309 # Replace original with enriched version
339 mv "$enriched_file" "$parse_failures_file" 310 mv "$enriched_file" "$parse_failures_file"
340 311
341 # Cleanup
342 rm -f "$lookup_file"
343
344 log_info " Enriched $enriched_count of $total_count parse failures with repo/npub" 312 log_info " Enriched $enriched_count of $total_count parse failures with repo/npub"
345 log_success "Enrichment complete" 313 log_success "Enrichment complete"
346} 314}
347 315
348# Filter parse failures to only those for missing announcements 316# Parse "Added rejected announcement" log entries to build enrichment lookup table
349# This is used when --analysis-root is provided to scope results to the migration 317# Input: log line containing "Added rejected announcement to two-tier index"
350# 318# Output: TSV line: event_id<TAB>identifier<TAB>pubkey_hex
351# Arguments: 319parse_rejected_announcement_line() {
352# $1 - parse failures file to filter (modified in place) 320 local line="$1"
353# $2 - analysis root directory containing comparison/ and prod/ subdirs
354#
355# The function:
356# 1. Reads missing announcements from comparison/complete-prod-missing-archive.txt
357# 2. Extracts pubkey/identifier pairs for those announcements
358# 3. Reads production announcements from prod/raw/announcements.json
359# 4. Gets event IDs for announcements matching the missing pubkey/identifier pairs
360# 5. Filters parse failures to only those event IDs
361filter_to_missing_announcements() {
362 local parse_failures_file="$1"
363 local analysis_root="$2"
364
365 local missing_file="$analysis_root/comparison/complete-prod-missing-archive.txt"
366 local prod_announcements="$analysis_root/prod/raw/announcements.json"
367
368 # Validate required files exist
369 if [[ ! -f "$missing_file" ]]; then
370 log_warn "Missing announcements file not found: $missing_file"
371 log_warn "Skipping filter - all parse failures will be included"
372 return 0
373 fi
374
375 if [[ ! -f "$prod_announcements" ]]; then
376 log_warn "Production announcements file not found: $prod_announcements"
377 log_warn "Skipping filter - all parse failures will be included"
378 return 0
379 fi
380
381 # Check if jq is available
382 if ! command -v jq &> /dev/null; then
383 log_warn "jq not found - cannot filter parse failures"
384 log_warn "Install jq or run without --analysis-root"
385 return 0
386 fi
387
388 log_info "Filtering parse failures to missing announcements only..."
389
390 # Step 1: Extract pubkey/identifier pairs from missing announcements
391 # Format: identifier | npub | prod=complete | archive=missing
392 local missing_pairs_file
393 missing_pairs_file=$(mktemp)
394
395 # Extract identifier and npub, convert npub to hex pubkey for matching
396 while IFS=' | ' read -r identifier npub rest; do
397 # Skip empty lines
398 [[ -z "$identifier" ]] && continue
399 # Trim whitespace
400 identifier=$(echo "$identifier" | xargs)
401 npub=$(echo "$npub" | xargs)
402 echo "${identifier}|${npub}"
403 done < "$missing_file" > "$missing_pairs_file"
404
405 local missing_count
406 missing_count=$(wc -l < "$missing_pairs_file")
407 missing_count="${missing_count//[^0-9]/}"
408 log_info " Found $missing_count missing announcements to filter for"
409
410 # Step 2: Get event IDs from production announcements for these pairs
411 # We need to match on 'd' tag (identifier) and pubkey
412 local missing_event_ids_file
413 missing_event_ids_file=$(mktemp)
414
415 # Create a lookup of identifier|npub -> event_id from production announcements
416 # The JSON has: id, pubkey (hex), tags (array with ["d", identifier])
417 log_info " Extracting event IDs from production announcements..."
418
419 # Use jq to extract id, pubkey, and d-tag value, then filter
420 # Output format: event_id|identifier|pubkey_hex
421 # Note: The JSON file is NDJSON (newline-delimited), not an array
422 jq -r 'select(.kind == 30617) |
423 .id as $id |
424 .pubkey as $pubkey |
425 (.tags[] | select(.[0] == "d") | .[1]) as $dtag |
426 "\($id)|\($dtag)|\($pubkey)"' "$prod_announcements" > "$missing_event_ids_file.all" 2>/dev/null || {
427 log_warn "Failed to parse production announcements JSON"
428 rm -f "$missing_pairs_file" "$missing_event_ids_file" "$missing_event_ids_file.all"
429 return 0
430 }
431
432 # Now filter to only event IDs for missing announcements
433 # We need to convert npub to hex pubkey for comparison
434 # npub is bech32, pubkey in JSON is hex
435 # For simplicity, we'll match on identifier only (d-tag) since it should be unique per pubkey
436 # Actually, we need both because same identifier can exist for different pubkeys
437
438 # Create a set of "identifier|pubkey_hex" to match against
439 # First, we need to convert npub to hex - but that requires a tool
440 # Alternative: match on identifier only and accept some false positives
441 # Better: use the comparison file which has npub, and match against announcements
442
443 # Let's match on identifier only for now (simpler, may have minor false positives)
444 # Extract just the identifiers from missing announcements
445 local missing_identifiers_file
446 missing_identifiers_file=$(mktemp)
447 cut -d'|' -f1 "$missing_pairs_file" | sort -u > "$missing_identifiers_file"
448
449 # Filter event IDs to only those with matching identifiers
450 while IFS='|' read -r event_id identifier pubkey_hex; do
451 if grep -qFx "$identifier" "$missing_identifiers_file"; then
452 echo "$event_id"
453 fi
454 done < "$missing_event_ids_file.all" | sort -u > "$missing_event_ids_file"
455
456 local event_id_count
457 event_id_count=$(wc -l < "$missing_event_ids_file")
458 event_id_count="${event_id_count//[^0-9]/}"
459 log_info " Found $event_id_count event IDs for missing announcements"
460
461 # Step 3: Filter parse failures to only those event IDs
462 local filtered_file
463 filtered_file=$(mktemp)
464
465 # Copy header lines
466 grep '^#' "$parse_failures_file" > "$filtered_file"
467 321
468 # Add a note about filtering 322 local event_id identifier pubkey_hex
469 echo "# Filtered to missing announcements only (--analysis-root)" >> "$filtered_file"
470 echo "# Analysis root: $analysis_root" >> "$filtered_file"
471 echo "# Missing announcements: $missing_count" >> "$filtered_file"
472 echo "# Matching event IDs: $event_id_count" >> "$filtered_file"
473 323
474 # Filter data lines - only include if event_id is in our list 324 # Extract event_id=VALUE (hex string)
475 local filtered_count=0 325 event_id=$(echo "$line" | grep -oP 'event_id=\K[a-f0-9]+' || echo "")
476 while IFS=$'\t' read -r event_id kind reason repo npub; do
477 # Skip header lines (already copied)
478 [[ "$event_id" =~ ^# ]] && continue
479
480 # Check if this event_id is in our missing list
481 if grep -qFx "$event_id" "$missing_event_ids_file"; then
482 printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub" >> "$filtered_file"
483 filtered_count=$((filtered_count + 1))
484 fi
485 done < "$parse_failures_file"
486 326
487 # Replace original with filtered version 327 # Extract identifier=VALUE (repo name)
488 mv "$filtered_file" "$parse_failures_file" 328 identifier=$(echo "$line" | grep -oP 'identifier=\K[^ ]+' || echo "")
489 329
490 # Cleanup temp files 330 # Extract pubkey=VALUE (hex string)
491 rm -f "$missing_pairs_file" "$missing_event_ids_file" "$missing_event_ids_file.all" "$missing_identifiers_file" 331 pubkey_hex=$(echo "$line" | grep -oP 'pubkey=\K[a-f0-9]+' || echo "")
492 332
493 log_info " Filtered from $(grep -v '^#' "$parse_failures_file" | wc -l | xargs) to $filtered_count parse failures" 333 # Only output if we have all required fields
494 log_success "Filtered to parse failures for missing announcements only" 334 if [[ -n "$event_id" && -n "$identifier" && -n "$pubkey_hex" ]]; then
335 printf '%s\t%s\t%s\n' "$event_id" "$identifier" "$pubkey_hex"
336 fi
495} 337}
496 338
497# Main 339# Main
@@ -509,7 +351,6 @@ main() {
509 since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "") 351 since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "")
510 local until_date="" 352 local until_date=""
511 local dry_run=false 353 local dry_run=false
512 local analysis_root=""
513 354
514 # Parse options 355 # Parse options
515 while [[ $# -gt 0 ]]; do 356 while [[ $# -gt 0 ]]; do
@@ -522,10 +363,6 @@ main() {
522 until_date="$2" 363 until_date="$2"
523 shift 2 364 shift 2
524 ;; 365 ;;
525 --analysis-root)
526 analysis_root="$2"
527 shift 2
528 ;;
529 --dry-run) 366 --dry-run)
530 dry_run=true 367 dry_run=true
531 shift 368 shift
@@ -640,10 +477,11 @@ main() {
640 log_info "Extracting log entries..." 477 log_info "Extracting log entries..."
641 478
642 # Create temp files for intermediate results 479 # Create temp files for intermediate results
643 local temp_stderr temp_parse_fail temp_write_policy_rejection 480 local temp_stderr temp_parse_fail temp_write_policy_rejection temp_rejected_announcement
644 temp_stderr=$(mktemp) 481 temp_stderr=$(mktemp)
645 temp_parse_fail=$(mktemp) 482 temp_parse_fail=$(mktemp)
646 temp_write_policy_rejection=$(mktemp) 483 temp_write_policy_rejection=$(mktemp)
484 temp_rejected_announcement=$(mktemp)
647 485
648 # Extract [PARSE_FAIL] entries directly to temp file (streaming) 486 # Extract [PARSE_FAIL] entries directly to temp file (streaming)
649 log_info " Searching for [PARSE_FAIL] entries..." 487 log_info " Searching for [PARSE_FAIL] entries..."
@@ -661,17 +499,25 @@ main() {
661 log_info " Searching for write policy rejections..." 499 log_info " Searching for write policy rejections..."
662 eval "$journal_cmd" 2>/dev/null | grep 'Event rejected by write policy' | grep 'Invalid announcement' > "$temp_write_policy_rejection" || true 500 eval "$journal_cmd" 2>/dev/null | grep 'Event rejected by write policy' | grep 'Invalid announcement' > "$temp_write_policy_rejection" || true
663 501
502 # Extract "Added rejected announcement" entries for enrichment (streaming)
503 # These contain pubkey and identifier which we use to enrich write policy rejections
504 log_info " Searching for rejected announcement entries (for enrichment)..."
505 eval "$journal_cmd" 2>/dev/null | grep 'Added rejected announcement to two-tier index' > "$temp_rejected_announcement" || true
506
664 rm -f "$temp_stderr" 507 rm -f "$temp_stderr"
665 508
666 # Check if we found anything 509 # Check if we found anything
667 local parse_fail_line_count write_policy_line_count 510 local parse_fail_line_count write_policy_line_count rejected_announcement_line_count
668 parse_fail_line_count=$(wc -l < "$temp_parse_fail") 511 parse_fail_line_count=$(wc -l < "$temp_parse_fail")
669 parse_fail_line_count="${parse_fail_line_count//[^0-9]/}" 512 parse_fail_line_count="${parse_fail_line_count//[^0-9]/}"
670 write_policy_line_count=$(wc -l < "$temp_write_policy_rejection") 513 write_policy_line_count=$(wc -l < "$temp_write_policy_rejection")
671 write_policy_line_count="${write_policy_line_count//[^0-9]/}" 514 write_policy_line_count="${write_policy_line_count//[^0-9]/}"
515 rejected_announcement_line_count=$(wc -l < "$temp_rejected_announcement")
516 rejected_announcement_line_count="${rejected_announcement_line_count//[^0-9]/}"
672 517
673 log_info " Found $parse_fail_line_count [PARSE_FAIL] log lines" 518 log_info " Found $parse_fail_line_count [PARSE_FAIL] log lines"
674 log_info " Found $write_policy_line_count write policy rejection log lines" 519 log_info " Found $write_policy_line_count write policy rejection log lines"
520 log_info " Found $rejected_announcement_line_count rejected announcement log lines (for enrichment)"
675 521
676 local total_invalid_announcement_lines=$write_policy_line_count 522 local total_invalid_announcement_lines=$write_policy_line_count
677 523
@@ -704,7 +550,7 @@ main() {
704 echo "# This is expected if ngit-grasp logging improvements are not yet deployed." 550 echo "# This is expected if ngit-grasp logging improvements are not yet deployed."
705 } > "$output_file" 551 } > "$output_file"
706 552
707 rm -f "$temp_parse_fail" "$temp_write_policy_rejection" 553 rm -f "$temp_parse_fail" "$temp_write_policy_rejection" "$temp_rejected_announcement"
708 log_info "Created empty output file: $output_file" 554 log_info "Created empty output file: $output_file"
709 exit 0 555 exit 0
710 fi 556 fi
@@ -753,7 +599,22 @@ main() {
753 599
754 local invalid_announcement_count=$write_policy_count 600 local invalid_announcement_count=$write_policy_count
755 601
756 rm -f "$temp_parse_fail" "$temp_write_policy_rejection" 602 # Build enrichment lookup table from "Added rejected announcement" entries
603 local enrichment_lookup_file
604 enrichment_lookup_file=$(mktemp)
605
606 log_info " Building enrichment lookup table..."
607 if [[ "$rejected_announcement_line_count" -gt 0 ]]; then
608 while IFS= read -r line; do
609 local parsed
610 parsed=$(parse_rejected_announcement_line "$line")
611 if [[ -n "$parsed" ]]; then
612 echo "$parsed" >> "$enrichment_lookup_file"
613 fi
614 done < "$temp_rejected_announcement"
615 fi
616
617 rm -f "$temp_parse_fail" "$temp_write_policy_rejection" "$temp_rejected_announcement"
757 618
758 # Deduplicate by event_id (first column) - keep first occurrence 619 # Deduplicate by event_id (first column) - keep first occurrence
759 log_info " Deduplicating entries..." 620 log_info " Deduplicating entries..."
@@ -764,17 +625,18 @@ main() {
764 grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file" 625 grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file"
765 mv "$deduped_file" "$output_file" 626 mv "$deduped_file" "$output_file"
766 627
767 # Enrich with repo/npub from announcements.json if analysis root provided 628 # Deduplicate enrichment lookup table by event_id
629 if [[ -s "$enrichment_lookup_file" ]]; then
630 sort -t$'\t' -k1,1 -u "$enrichment_lookup_file" > "$enrichment_lookup_file.deduped"
631 mv "$enrichment_lookup_file.deduped" "$enrichment_lookup_file"
632 fi
633
634 # Enrich with repo/npub from "Added rejected announcement" log entries
768 # This is critical for usability - without it, action-required.txt shows 635 # This is critical for usability - without it, action-required.txt shows
769 # event_id|kind instead of repo|npub, making parse failures unidentifiable 636 # event_id|kind instead of repo|npub, making parse failures unidentifiable
770 if [[ -n "$analysis_root" ]]; then 637 enrich_with_repo_npub "$output_file" "$enrichment_lookup_file"
771 enrich_with_repo_npub "$output_file" "$analysis_root"
772 fi
773 638
774 # Filter to missing announcements only if analysis root provided 639 rm -f "$enrichment_lookup_file"
775 if [[ -n "$analysis_root" ]]; then
776 filter_to_missing_announcements "$output_file" "$analysis_root"
777 fi
778 640
779 # Count final entries (excluding header lines) 641 # Count final entries (excluding header lines)
780 local count 642 local count
@@ -789,15 +651,9 @@ main() {
789 log_info "=== Extraction Summary ===" 651 log_info "=== Extraction Summary ==="
790 log_info "Service: $service" 652 log_info "Service: $service"
791 log_info "Time range: ${since_date:-beginning} to ${until_date:-now}" 653 log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"
792 if [[ -n "$analysis_root" ]]; then
793 log_info "Filtered to: missing announcements only"
794 fi
795 log_success "Extracted $count total entries" 654 log_success "Extracted $count total entries"
796 log_info " - [PARSE_FAIL] entries: $parse_fail_count" 655 log_info " - [PARSE_FAIL] entries: $parse_fail_count"
797 log_info " - Invalid announcement rejections: $invalid_announcement_count" 656 log_info " - Invalid announcement rejections: $invalid_announcement_count"
798 if [[ -n "$analysis_root" ]]; then
799 log_info " (filtered from original extraction)"
800 fi
801 echo "" 657 echo ""
802 log_info "Output file: $output_file" 658 log_info "Output file: $output_file"
803 659