1 files changed, 175 insertions, 3 deletions
diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
index 7870c61..f821834 100755
--- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh
+++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
@@ -125,18 +125,25 @@ usage() {
    echo "  output-dir    Directory to store extracted log data"
    echo ""
    echo "Options:"
-    echo "  --since <date>   Start date (default: 30 days ago)"
+    echo "  --since <date>        Start date (default: 30 days ago)"
-    echo "  --until <date>   End date (default: now)"
+    echo "  --until <date>        End date (default: now)"
-    echo "  --dry-run        Show what would be extracted without writing"
+    echo "  --analysis-root <dir> Filter to only missing announcements from analysis"
+    echo "  --dry-run             Show what would be extracted without writing"
    echo ""
    echo "Examples:"
    echo "  $0 ngit-grasp.service output/logs"
    echo "  $0 ngit-grasp.service output/logs --since '2026-01-01'"
    echo "  $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'"
+    echo "  $0 ngit-grasp.service output/logs --analysis-root /tmp/migration-analysis-20260123"
    echo ""
    echo "Expected log formats:"
    echo "  [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..."
    echo "  Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..."
+    echo ""
+    echo "Filtering with --analysis-root:"
+    echo "  When provided, only parse failures for announcements that are in production"
+    echo "  but missing from the archive will be included. This filters out rejections"
+    echo "  for events from other relays that don't affect the migration."
    exit 1
 }
@@ -204,6 +211,155 @@ parse_write_policy_rejection_line() {
 # the same event to be counted twice. Write policy logs contain the same
 # events, so we don't lose any data by only extracting from that source.
+# Filter parse failures to only those for missing announcements
+# This is used when --analysis-root is provided to scope results to the migration
+#
+# Arguments:
+#   $1 - parse failures file to filter (modified in place)
+#   $2 - analysis root directory containing comparison/ and prod/ subdirs
+#
+# Returns:
+#   0 on every path handled here. When an input file is missing, jq is not
+#   installed, or a step fails, a warning is logged and the parse failures
+#   file is left unfiltered.
+#
+# The function:
+#   1. Reads the identifier (first column) of each missing announcement from
+#      comparison/complete-prod-missing-archive.txt
+#   2. Reads kind 30617 events from prod/raw/announcements.json (NDJSON) and
+#      collects the event IDs whose 'd' tag equals one of those identifiers
+#   3. Keeps only the parse failures whose event ID is in that set
+#
+# Limitation: matching is on identifier only. The comparison file carries
+# npubs (bech32) while the JSON carries hex pubkeys, and no converter is used
+# here, so an identifier shared by several pubkeys matches all of them.
+filter_to_missing_announcements() {
+    local parse_failures_file="$1"
+    local analysis_root="$2"
+
+    local missing_file="$analysis_root/comparison/complete-prod-missing-archive.txt"
+    local prod_announcements="$analysis_root/prod/raw/announcements.json"
+
+    # Validate required files exist
+    if [[ ! -f "$missing_file" ]]; then
+        log_warn "Missing announcements file not found: $missing_file"
+        log_warn "Skipping filter - all parse failures will be included"
+        return 0
+    fi
+
+    if [[ ! -f "$prod_announcements" ]]; then
+        log_warn "Production announcements file not found: $prod_announcements"
+        log_warn "Skipping filter - all parse failures will be included"
+        return 0
+    fi
+
+    # Check if jq is available
+    if ! command -v jq &> /dev/null; then
+        log_warn "jq not found - cannot filter parse failures"
+        log_warn "Install jq or run without --analysis-root"
+        return 0
+    fi
+
+    log_info "Filtering parse failures to missing announcements only..."
+
+    # All intermediate files live in one private directory so that every exit
+    # path can clean up with a single rm.
+    local work_dir
+    work_dir=$(mktemp -d) || {
+        log_warn "Could not create temporary directory"
+        log_warn "Skipping filter - all parse failures will be included"
+        return 0
+    }
+
+    # Step 1: Extract identifiers from missing announcements
+    # Format: identifier | npub | prod=complete | archive=missing
+    # Split on '|' only (not on spaces) so identifiers containing spaces stay
+    # intact, then trim surrounding whitespace. Empty identifiers are skipped.
+    awk -F'|' '{
+            id = $1
+            gsub(/^[ \t]+|[ \t\r]+$/, "", id)
+            if (id != "") print id
+        }' "$missing_file" > "$work_dir/identifiers"
+
+    local missing_count
+    missing_count=$(wc -l < "$work_dir/identifiers")
+    missing_count="${missing_count//[^0-9]/}"
+    log_info "  Found $missing_count missing announcements to filter for"
+
+    # Step 2: Get event IDs from production announcements for these identifiers
+    # Output format: event_id<TAB>identifier
+    # The event ID comes first and a tab is the separator, so a d-tag that
+    # contains '|' or spaces cannot be mis-split. '.tags[]?' and '// empty'
+    # skip events without tags or without a d-tag value instead of failing.
+    # Note: The JSON file is NDJSON (newline-delimited), not an array
+    log_info "  Extracting event IDs from production announcements..."
+    jq -r 'select(.kind == 30617) |
+        .id as $id |
+        (.tags[]? | select(.[0] == "d") | .[1] // empty) as $dtag |
+        "\($id)\t\($dtag)"' "$prod_announcements" > "$work_dir/prod_events" || {
+        log_warn "Failed to parse production announcements JSON"
+        rm -rf -- "$work_dir"
+        return 0
+    }
+
+    # One awk pass: load the wanted identifiers, then print the event ID of
+    # every production event whose identifier is in that set.
+    awk 'FILENAME == ARGV[1] { wanted[$0] = 1; next }
+        {
+            tab = index($0, "\t")
+            if (tab > 0 && (substr($0, tab + 1) in wanted)) {
+                print substr($0, 1, tab - 1)
+            }
+        }' "$work_dir/identifiers" "$work_dir/prod_events" \
+        | sort -u > "$work_dir/event_ids"
+
+    local event_id_count
+    event_id_count=$(wc -l < "$work_dir/event_ids")
+    event_id_count="${event_id_count//[^0-9]/}"
+    log_info "  Found $event_id_count event IDs for missing announcements"
+
+    # Step 3: Filter parse failures to only those event IDs
+    local filtered_file="$work_dir/filtered"
+    {
+        # Copy header lines (grep exits 1 when there are none; not an error)
+        grep '^#' "$parse_failures_file" || true
+
+        # Add a note about filtering
+        printf '%s\n' \
+            "# Filtered to missing announcements only (--analysis-root)" \
+            "# Analysis root: $analysis_root" \
+            "# Missing announcements: $missing_count" \
+            "# Matching event IDs: $event_id_count"
+
+        # Data lines are passed through unchanged when their first
+        # tab-separated field is a wanted event ID, so empty columns are
+        # preserved exactly as extracted.
+        awk -F'\t' 'FILENAME == ARGV[1] { keep[$0] = 1; next }
+            !/^#/ && ($1 in keep)' "$work_dir/event_ids" "$parse_failures_file"
+    } > "$filtered_file"
+
+    # Count before replacing the original so the "from" figure is the
+    # pre-filter total.
+    local original_count filtered_count
+    original_count=$(awk '!/^#/ { n++ } END { print n + 0 }' "$parse_failures_file")
+    filtered_count=$(awk '!/^#/ { n++ } END { print n + 0 }' "$filtered_file")
+
+    # Replace original contents with the filtered version. Copying (rather
+    # than moving the temp file) keeps the output file's existing permissions.
+    if ! cp -- "$filtered_file" "$parse_failures_file"; then
+        log_warn "Could not write filtered results to $parse_failures_file"
+        rm -rf -- "$work_dir"
+        return 0
+    fi
+
+    # Cleanup temp files
+    rm -rf -- "$work_dir"
+
+    log_info "  Filtered from $original_count to $filtered_count parse failures"
+    log_success "Filtered to parse failures for missing announcements only"
+}
 # Main
 main() {
    if [[ $# -lt 2 ]]; then
@@ -219,6 +375,7 @@ main() {
    since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "")
    local until_date=""
    local dry_run=false
+    local analysis_root=""
    
    # Parse options
    while [[ $# -gt 0 ]]; do
@@ -231,6 +388,10 @@ main() {
                until_date="$2"
                shift 2
                ;;
+            --analysis-root)
+                analysis_root="$2"
+                shift 2
+                ;;
            --dry-run)
                dry_run=true
                shift
@@ -469,6 +630,11 @@ main() {
    grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file"
    mv "$deduped_file" "$output_file"
    
+    # Filter to missing announcements only if analysis root provided
+    if [[ -n "$analysis_root" ]]; then
+        filter_to_missing_announcements "$output_file" "$analysis_root"
+    fi
+    
    # Count final entries (excluding header lines)
    local count
    count=$(grep -v '^#' "$output_file" | wc -l)
@@ -482,9 +648,15 @@ main() {
    log_info "=== Extraction Summary ==="
    log_info "Service: $service"
    log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"
+    if [[ -n "$analysis_root" ]]; then
+        log_info "Filtered to: missing announcements only"
+    fi
    log_success "Extracted $count total entries"
    log_info "  - [PARSE_FAIL] entries: $parse_fail_count"
    log_info "  - Invalid announcement rejections: $invalid_announcement_count"
+    if [[ -n "$analysis_root" ]]; then
+        log_info "  (filtered from original extraction)"
+    fi
    echo ""
    log_info "Output file: $output_file"

diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh index 7870c61..f821834 100755 --- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh +++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
@@ -125,18 +125,25 @@ usage() {
125	echo " output-dir Directory to store extracted log data"	125	echo " output-dir Directory to store extracted log data"
126	echo ""	126	echo ""
127	echo "Options:"	127	echo "Options:"
128	echo " --since <date> Start date (default: 30 days ago)"	128	echo " --since <date> Start date (default: 30 days ago)"
129	echo " --until <date> End date (default: now)"	129	echo " --until <date> End date (default: now)"
130	echo " --dry-run Show what would be extracted without writing"	130	echo " --analysis-root <dir> Filter to only missing announcements from analysis"
		131	echo " --dry-run Show what would be extracted without writing"
131	echo ""	132	echo ""
132	echo "Examples:"	133	echo "Examples:"
133	echo " $0 ngit-grasp.service output/logs"	134	echo " $0 ngit-grasp.service output/logs"
134	echo " $0 ngit-grasp.service output/logs --since '2026-01-01'"	135	echo " $0 ngit-grasp.service output/logs --since '2026-01-01'"
135	echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'"	136	echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'"
		137	echo " $0 ngit-grasp.service output/logs --analysis-root /tmp/migration-analysis-20260123"
136	echo ""	138	echo ""
137	echo "Expected log formats:"	139	echo "Expected log formats:"
138	echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..."	140	echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..."
139	echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..."	141	echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..."
		142	echo ""
		143	echo "Filtering with --analysis-root:"
		144	echo " When provided, only parse failures for announcements that are in production"
		145	echo " but missing from the archive will be included. This filters out rejections"
		146	echo " for events from other relays that don't affect the migration."
140	exit 1	147	exit 1
141	}	148	}
142		149
@@ -204,6 +211,155 @@ parse_write_policy_rejection_line() {
204	# the same event to be counted twice. Write policy logs contain the same	211	# the same event to be counted twice. Write policy logs contain the same
205	# events, so we don't lose any data by only extracting from that source.	212	# events, so we don't lose any data by only extracting from that source.
206		213
		214	# Filter parse failures to only those for missing announcements
		215	# This is used when --analysis-root is provided to scope results to the migration
		216	#
		217	# Arguments:
		218	# $1 - parse failures file to filter (modified in place)
		219	# $2 - analysis root directory containing comparison/ and prod/ subdirs
		220	#
		221	# The function:
		222	# 1. Reads missing announcements from comparison/complete-prod-missing-archive.txt
		223	# 2. Extracts pubkey/identifier pairs for those announcements
		224	# 3. Reads production announcements from prod/raw/announcements.json
		225	# 4. Gets event IDs for announcements matching the missing pubkey/identifier pairs
		226	# 5. Filters parse failures to only those event IDs
		227	filter_to_missing_announcements() {
		228	local parse_failures_file="$1"
		229	local analysis_root="$2"
		230
		231	local missing_file="$analysis_root/comparison/complete-prod-missing-archive.txt"
		232	local prod_announcements="$analysis_root/prod/raw/announcements.json"
		233
		234	# Validate required files exist
		235	if [[ ! -f "$missing_file" ]]; then
		236	log_warn "Missing announcements file not found: $missing_file"
		237	log_warn "Skipping filter - all parse failures will be included"
		238	return 0
		239	fi
		240
		241	if [[ ! -f "$prod_announcements" ]]; then
		242	log_warn "Production announcements file not found: $prod_announcements"
		243	log_warn "Skipping filter - all parse failures will be included"
		244	return 0
		245	fi
		246
		247	# Check if jq is available
		248	if ! command -v jq &> /dev/null; then
		249	log_warn "jq not found - cannot filter parse failures"
		250	log_warn "Install jq or run without --analysis-root"
		251	return 0
		252	fi
		253
		254	log_info "Filtering parse failures to missing announcements only..."
		255
		256	# Step 1: Extract pubkey/identifier pairs from missing announcements
		257	# Format: identifier \| npub \| prod=complete \| archive=missing
		258	local missing_pairs_file
		259	missing_pairs_file=$(mktemp)
		260
		261	# Extract identifier and npub, convert npub to hex pubkey for matching
		262	while IFS=' \| ' read -r identifier npub rest; do
		263	# Skip empty lines
		264	[[ -z "$identifier" ]] && continue
		265	# Trim whitespace
		266	identifier=$(echo "$identifier" \| xargs)
		267	npub=$(echo "$npub" \| xargs)
		268	echo "${identifier}\|${npub}"
		269	done < "$missing_file" > "$missing_pairs_file"
		270
		271	local missing_count
		272	missing_count=$(wc -l < "$missing_pairs_file")
		273	missing_count="${missing_count//[^0-9]/}"
		274	log_info " Found $missing_count missing announcements to filter for"
		275
		276	# Step 2: Get event IDs from production announcements for these pairs
		277	# We need to match on 'd' tag (identifier) and pubkey
		278	local missing_event_ids_file
		279	missing_event_ids_file=$(mktemp)
		280
		281	# Create a lookup of identifier\|npub -> event_id from production announcements
		282	# The JSON has: id, pubkey (hex), tags (array with ["d", identifier])
		283	log_info " Extracting event IDs from production announcements..."
		284
		285	# Use jq to extract id, pubkey, and d-tag value, then filter
		286	# Output format: event_id\|identifier\|pubkey_hex
		287	# Note: The JSON file is NDJSON (newline-delimited), not an array
		288	jq -r 'select(.kind == 30617) \|
		289	.id as $id \|
		290	.pubkey as $pubkey \|
		291	(.tags[] \| select(.[0] == "d") \| .[1]) as $dtag \|
		292	"\($id)\|\($dtag)\|\($pubkey)"' "$prod_announcements" > "$missing_event_ids_file.all" 2>/dev/null \|\| {
		293	log_warn "Failed to parse production announcements JSON"
		294	rm -f "$missing_pairs_file" "$missing_event_ids_file" "$missing_event_ids_file.all"
		295	return 0
		296	}
		297
		298	# Now filter to only event IDs for missing announcements
		299	# We need to convert npub to hex pubkey for comparison
		300	# npub is bech32, pubkey in JSON is hex
		301	# For simplicity, we'll match on identifier only (d-tag) since it should be unique per pubkey
		302	# Actually, we need both because same identifier can exist for different pubkeys
		303
		304	# Create a set of "identifier\|pubkey_hex" to match against
		305	# First, we need to convert npub to hex - but that requires a tool
		306	# Alternative: match on identifier only and accept some false positives
		307	# Better: use the comparison file which has npub, and match against announcements
		308
		309	# Let's match on identifier only for now (simpler, may have minor false positives)
		310	# Extract just the identifiers from missing announcements
		311	local missing_identifiers_file
		312	missing_identifiers_file=$(mktemp)
		313	cut -d'\|' -f1 "$missing_pairs_file" \| sort -u > "$missing_identifiers_file"
		314
		315	# Filter event IDs to only those with matching identifiers
		316	while IFS='\|' read -r event_id identifier pubkey_hex; do
		317	if grep -qFx "$identifier" "$missing_identifiers_file"; then
		318	echo "$event_id"
		319	fi
		320	done < "$missing_event_ids_file.all" \| sort -u > "$missing_event_ids_file"
		321
		322	local event_id_count
		323	event_id_count=$(wc -l < "$missing_event_ids_file")
		324	event_id_count="${event_id_count//[^0-9]/}"
		325	log_info " Found $event_id_count event IDs for missing announcements"
		326
		327	# Step 3: Filter parse failures to only those event IDs
		328	local filtered_file
		329	filtered_file=$(mktemp)
		330
		331	# Copy header lines
		332	grep '^#' "$parse_failures_file" > "$filtered_file"
		333
		334	# Add a note about filtering
		335	echo "# Filtered to missing announcements only (--analysis-root)" >> "$filtered_file"
		336	echo "# Analysis root: $analysis_root" >> "$filtered_file"
		337	echo "# Missing announcements: $missing_count" >> "$filtered_file"
		338	echo "# Matching event IDs: $event_id_count" >> "$filtered_file"
		339
		340	# Filter data lines - only include if event_id is in our list
		341	local filtered_count=0
		342	while IFS=$'\t' read -r event_id kind reason repo npub; do
		343	# Skip header lines (already copied)
		344	[[ "$event_id" =~ ^# ]] && continue
		345
		346	# Check if this event_id is in our missing list
		347	if grep -qFx "$event_id" "$missing_event_ids_file"; then
		348	printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub" >> "$filtered_file"
		349	filtered_count=$((filtered_count + 1))
		350	fi
		351	done < "$parse_failures_file"
		352
		353	# Replace original with filtered version
		354	mv "$filtered_file" "$parse_failures_file"
		355
		356	# Cleanup temp files
		357	rm -f "$missing_pairs_file" "$missing_event_ids_file" "$missing_event_ids_file.all" "$missing_identifiers_file"
		358
		359	log_info " Filtered from $(grep -v '^#' "$parse_failures_file" \| wc -l \| xargs) to $filtered_count parse failures"
		360	log_success "Filtered to parse failures for missing announcements only"
		361	}
		362
207	# Main	363	# Main
208	main() {	364	main() {
209	if [[ $# -lt 2 ]]; then	365	if [[ $# -lt 2 ]]; then
@@ -219,6 +375,7 @@ main() {
219	since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null \|\| date -v-30d "+%Y-%m-%d" 2>/dev/null \|\| echo "")	375	since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null \|\| date -v-30d "+%Y-%m-%d" 2>/dev/null \|\| echo "")
220	local until_date=""	376	local until_date=""
221	local dry_run=false	377	local dry_run=false
		378	local analysis_root=""
222		379
223	# Parse options	380	# Parse options
224	while [[ $# -gt 0 ]]; do	381	while [[ $# -gt 0 ]]; do
@@ -231,6 +388,10 @@ main() {
231	until_date="$2"	388	until_date="$2"
232	shift 2	389	shift 2
233	;;	390	;;
		391	--analysis-root)
		392	analysis_root="$2"
		393	shift 2
		394	;;
234	--dry-run)	395	--dry-run)
235	dry_run=true	396	dry_run=true
236	shift	397	shift
@@ -469,6 +630,11 @@ main() {
469	grep -v '^#' "$output_file" \| sort -t$'\t' -k1,1 -u >> "$deduped_file"	630	grep -v '^#' "$output_file" \| sort -t$'\t' -k1,1 -u >> "$deduped_file"
470	mv "$deduped_file" "$output_file"	631	mv "$deduped_file" "$output_file"
471		632
		633	# Filter to missing announcements only if analysis root provided
		634	if [[ -n "$analysis_root" ]]; then
		635	filter_to_missing_announcements "$output_file" "$analysis_root"
		636	fi
		637
472	# Count final entries (excluding header lines)	638	# Count final entries (excluding header lines)
473	local count	639	local count
474	count=$(grep -v '^#' "$output_file" \| wc -l)	640	count=$(grep -v '^#' "$output_file" \| wc -l)
@@ -482,9 +648,15 @@ main() {
482	log_info "=== Extraction Summary ==="	648	log_info "=== Extraction Summary ==="
483	log_info "Service: $service"	649	log_info "Service: $service"
484	log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"	650	log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"
		651	if [[ -n "$analysis_root" ]]; then
		652	log_info "Filtered to: missing announcements only"
		653	fi
485	log_success "Extracted $count total entries"	654	log_success "Extracted $count total entries"
486	log_info " - [PARSE_FAIL] entries: $parse_fail_count"	655	log_info " - [PARSE_FAIL] entries: $parse_fail_count"
487	log_info " - Invalid announcement rejections: $invalid_announcement_count"	656	log_info " - Invalid announcement rejections: $invalid_announcement_count"
		657	if [[ -n "$analysis_root" ]]; then
		658	log_info " (filtered from original extraction)"
		659	fi
488	echo ""	660	echo ""
489	log_info "Output file: $output_file"	661	log_info "Output file: $output_file"
490		662