fix migration script for invalid announcement detection

author: DanConwayDev <DanConwayDev@protonmail.com> 2026-01-27 12:46:05 +0000
committer: DanConwayDev <DanConwayDev@protonmail.com> 2026-01-27 20:38:22 +0000
commit: a7d0d574b9788f71e3add39699b3a409c0f2b492 (patch)
tree: d47ef845ab2cf5183780e8601881d8df22e24e53 /docs/how-to
parent: 49b0df788255848173c01db394a2df29b7c08576 (diff)
1 files changed, 199 insertions, 120 deletions
diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
index d7f9706..d762aae 100755
--- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh
+++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
@@ -155,70 +155,107 @@ usage() {
    exit 1
 }
-# Parse a [PARSE_FAIL] log line and extract fields
+# =============================================================================
-# Input: log line containing [PARSE_FAIL]
+# AWK-BASED BATCH PARSING FUNCTIONS
-# Output: TSV line: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
+# =============================================================================
-parse_parse_fail_line() {
+# These functions use awk for efficient batch processing instead of per-line
-    local line="$1"
+# grep calls. This provides ~400x speedup for large log files.
-    
+# NOTE: requires GNU awk (gawk): the three-argument match() used below is a gawk extension.
-    # Extract fields using grep -oP (Perl regex) or awk
-    # Fields: kind, event_id, reason, repo (optional), npub (optional)
-    
-    local kind event_id reason repo npub
-    
-    # Extract kind=VALUE
-    kind=$(echo "$line" | grep -oP 'kind=\K[0-9]+' || echo "")
-    
-    # Extract event_id=VALUE (hex string, possibly truncated with ...)
-    event_id=$(echo "$line" | grep -oP 'event_id=\K[a-f0-9]+' || echo "")
-    
-    # Extract reason="VALUE" (quoted string)
-    reason=$(echo "$line" | grep -oP 'reason="\K[^"]*' || echo "")
-    
-    # Extract repo=VALUE (optional, unquoted identifier)
-    repo=$(echo "$line" | grep -oP 'repo=\K[^ ]+' || echo "")
-    
-    # Extract npub=VALUE (optional, npub1... format)
-    npub=$(echo "$line" | grep -oP 'npub=\K[^ ]+' || echo "")
-    
-    # Only output if we have the required fields
-    if [[ -n "$kind" && -n "$event_id" && -n "$reason" ]]; then
-        printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub"
-    fi
-}
-# Parse an "Invalid announcement" rejection log line from write policy
-# Input: log line containing "Event rejected by write policy" with "Invalid announcement"
-# Output: TSV line: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
-# Note: repo and npub are empty for these entries (not available in log format)
-parse_write_policy_rejection_line() {
-    local line="$1"
-    
-    local kind event_id reason
-    
-    # Extract event_id=VALUE (hex string)
-    event_id=$(echo "$line" | grep -oP 'event_id=\K[a-f0-9]+' || echo "")
-    
-    # Extract kind=VALUE
-    kind=$(echo "$line" | grep -oP 'kind=\K[0-9]+' || echo "")
-    
-    # Extract reason=VALUE (everything after "reason=")
-    # The reason is unquoted and goes to end of line
-    reason=$(echo "$line" | grep -oP 'reason=\K.*$' || echo "")
-    
-    # Only output if we have the required fields
-    if [[ -n "$kind" && -n "$event_id" && -n "$reason" ]]; then
-        # repo and npub are empty for invalid announcement entries
-        printf '%s\t%s\t%s\t\t\n' "$event_id" "$kind" "$reason"
-    fi
-}
 # NOTE: parse_builder_rejection_line() was removed to fix double-counting bug.
 # Builder logs use bech32 (note1) IDs while write policy logs use hex IDs.
 # Since deduplication only works within each format, extracting both caused
 # the same event to be counted twice. Write policy logs contain the same
 # events, so we don't lose any data by only extracting from that source.
+# Parse [PARSE_FAIL] log lines in batch using awk
+# Input: file containing log lines with [PARSE_FAIL]
+# Output: TSV lines: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
+parse_parse_fail_batch() {
+    local input_file="$1"
+    awk '
+    {
+        # Extract kind=VALUE
+        kind = ""
+        if (match($0, /kind=([0-9]+)/, m)) kind = m[1]
+        
+        # Extract event_id=VALUE (hex string)
+        event_id = ""
+        if (match($0, /event_id=([a-f0-9]+)/, m)) event_id = m[1]
+        
+        # Extract reason="VALUE" (quoted string)
+        reason = ""
+        if (match($0, /reason="([^"]*)"/, m)) reason = m[1]
+        
+        # Extract repo=VALUE (optional)
+        repo = ""
+        if (match($0, /repo=([^ ]+)/, m)) repo = m[1]
+        
+        # Extract npub=VALUE (optional)
+        npub = ""
+        if (match($0, /npub=([^ ]+)/, m)) npub = m[1]
+        
+        # Output if we have required fields
+        if (kind != "" && event_id != "" && reason != "") {
+            print event_id "\t" kind "\t" reason "\t" repo "\t" npub
+        }
+    }
+    ' "$input_file"
+}
+# Parse "Invalid announcement" rejection log lines in batch using awk
+# Input: file containing "Event rejected by write policy" log lines
+# Output: TSV lines: event_id<TAB>kind<TAB>reason<TAB><empty><TAB><empty>
+parse_write_policy_rejection_batch() {
+    local input_file="$1"
+    awk '
+    {
+        # Extract event_id=VALUE (hex string)
+        event_id = ""
+        if (match($0, /event_id=([a-f0-9]+)/, m)) event_id = m[1]
+        
+        # Extract kind=VALUE
+        kind = ""
+        if (match($0, /kind=([0-9]+)/, m)) kind = m[1]
+        
+        # Extract reason=VALUE (everything after "reason=")
+        reason = ""
+        if (match($0, /reason=(.*)$/, m)) reason = m[1]
+        
+        # Output if we have required fields (repo and npub are empty)
+        if (kind != "" && event_id != "" && reason != "") {
+            print event_id "\t" kind "\t" reason "\t\t"
+        }
+    }
+    ' "$input_file"
+}
+# Parse "Added rejected announcement" log lines in batch using awk
+# Input: file containing "Added rejected announcement to two-tier index" log lines
+# Output: TSV lines: event_id<TAB>identifier<TAB>pubkey_hex
+parse_rejected_announcement_batch() {
+    local input_file="$1"
+    awk '
+    {
+        # Extract event_id=VALUE (hex string)
+        event_id = ""
+        if (match($0, /event_id=([a-f0-9]+)/, m)) event_id = m[1]
+        
+        # Extract identifier=VALUE (repo name)
+        identifier = ""
+        if (match($0, /identifier=([^ ]+)/, m)) identifier = m[1]
+        
+        # Extract pubkey=VALUE (hex string)
+        pubkey = ""
+        if (match($0, /pubkey=([a-f0-9]+)/, m)) pubkey = m[1]
+        
+        # Output if we have all required fields
+        if (event_id != "" && identifier != "" && pubkey != "") {
+            print event_id "\t" identifier "\t" pubkey
+        }
+    }
+    ' "$input_file"
+}
 # Enrich parse failures with repo/npub by looking up event_id in "Added rejected announcement" log entries
 # This is critical because "Invalid announcement" rejections only log event_id and kind,
 # not the repo name or npub. Without enrichment, Phase 5 shows event_id|kind instead
@@ -233,6 +270,11 @@ parse_write_policy_rejection_line() {
 #   2. For each parse failure with empty repo/npub, looks up the event_id
 #   3. Populates repo and npub columns from the lookup
 #   4. Converts hex pubkeys to npub format using `nak encode npub` if available
+#
+# OPTIMIZATION: This function uses batch processing for efficiency:
+#   - Uses awk for O(n) join instead of per-line grep (O(n*m))
+#   - Batches all pubkey->npub conversions in a single nak call
+#   - This reduces runtime from minutes to seconds for large datasets
 enrich_with_repo_npub() {
    local parse_failures_file="$1"
    local lookup_file="$2"
@@ -259,52 +301,98 @@ enrich_with_repo_npub() {
    lookup_count="${lookup_count//[^0-9]/}"
    log_info "  Lookup table has $lookup_count entries"
    
-    # Enrich parse failures
+    # STEP 1: Extract unique pubkeys that need conversion
+    # Get pubkeys from lookup file (column 3), deduplicate
+    local unique_pubkeys_file npub_map_file
+    unique_pubkeys_file=$(mktemp)
+    npub_map_file=$(mktemp)
+    
+    cut -f3 "$lookup_file" | sort -u > "$unique_pubkeys_file"
+    local unique_pubkey_count
+    unique_pubkey_count=$(wc -l < "$unique_pubkeys_file")
+    unique_pubkey_count="${unique_pubkey_count//[^0-9]/}"
+    log_info "  Converting $unique_pubkey_count unique pubkeys to npub format..."
+    
+    # STEP 2: Batch convert all pubkeys to npub in a single nak call
+    # nak reads hex pubkeys from stdin (one per line) and outputs npubs
+    if [[ "$can_convert_npub" == true && "$unique_pubkey_count" -gt 0 ]]; then
+        # Create mapping file: pubkey_hex<TAB>npub
+        # nak encode npub reads from stdin and outputs one npub per line
+        paste "$unique_pubkeys_file" <(nak encode npub < "$unique_pubkeys_file" 2>/dev/null) > "$npub_map_file" || {
+        # Fallback to hex pubkeys. NOTE: this only fires when paste itself fails; the exit status of nak inside <(...) is not propagated here.
+            log_warn "  Batch npub conversion failed, using hex pubkeys"
+            awk '{print $1 "\t" $1}' "$unique_pubkeys_file" > "$npub_map_file"
+        }
+    else
+        # No nak available, use hex pubkeys as-is
+        awk '{print $1 "\t" $1}' "$unique_pubkeys_file" > "$npub_map_file"
+    fi
+    
+    rm -f "$unique_pubkeys_file"
+    
+    # STEP 3: Use awk for efficient join (O(n) instead of O(n*m) grep per line)
+    # This joins parse_failures with lookup_file on event_id, then with npub_map on pubkey
    local enriched_file
    enriched_file=$(mktemp)
    
    # Copy header lines
    grep '^#' "$parse_failures_file" > "$enriched_file" 2>/dev/null || true
    
-    # Process data lines
+    # Use awk to perform the join efficiently
-    local enriched_count=0
+    # Input files (order matters: the FNR==1 counter below numbers them, so an empty file would shift the numbering):
-    local total_count=0
+    #   1. npub_map_file: pubkey_hex<TAB>npub
-    while IFS=$'\t' read -r event_id kind reason repo npub; do
+    #   2. lookup_file: event_id<TAB>identifier<TAB>pubkey_hex
-        # Skip header lines (already copied)
+    #   3. parse_failures_file: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
-        [[ "$event_id" =~ ^# ]] && continue
+    awk -F'\t' -v OFS='\t' '
-        
+        # Track which file we are processing
-        total_count=$((total_count + 1))
+        FNR==1 { file_num++ }
-        
-        # If repo and npub are already populated, keep them
-        if [[ -n "$repo" && -n "$npub" ]]; then
-            printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub" >> "$enriched_file"
-            continue
-        fi
-        
-        # Look up event_id in our table (format: event_id<TAB>identifier<TAB>pubkey_hex)
-        local lookup_result
-        lookup_result=$(grep "^${event_id}"$'\t' "$lookup_file" 2>/dev/null | head -1 || echo "")
        
-        if [[ -n "$lookup_result" ]]; then
+        # First file: npub_map (pubkey_hex -> npub)
-            local looked_up_repo looked_up_pubkey_hex looked_up_npub
+        file_num==1 {
-            looked_up_repo=$(echo "$lookup_result" | cut -f2)
+            npub_map[$1] = $2
-            looked_up_pubkey_hex=$(echo "$lookup_result" | cut -f3)
+            next
+        }
+        # Second file: lookup (event_id -> identifier, pubkey_hex)
+        file_num==2 {
+            lookup_repo[$1] = $2
+            lookup_pubkey[$1] = $3
+            next
+        }
+        # Third file: parse_failures
+        /^#/ { next }  # Skip headers (already copied)
+        {
+            event_id = $1
+            kind = $2
+            reason = $3
+            repo = $4
+            npub = $5
            
-            # Convert hex pubkey to npub if nak is available
+            # If repo/npub empty, try to enrich from lookup
-            if [[ "$can_convert_npub" == true && -n "$looked_up_pubkey_hex" ]]; then
+            if (repo == "" && event_id in lookup_repo) {
-                looked_up_npub=$(nak encode npub "$looked_up_pubkey_hex" 2>/dev/null || echo "$looked_up_pubkey_hex")
+                repo = lookup_repo[event_id]
-            else
+            }
-                looked_up_npub="$looked_up_pubkey_hex"
+            if (npub == "" && event_id in lookup_pubkey) {
-            fi
+                pubkey = lookup_pubkey[event_id]
+                if (pubkey in npub_map) {
+                    npub = npub_map[pubkey]
+                } else {
+                    npub = pubkey  # Fallback to hex
+                }
+            }
            
-            # Use looked-up values if original was empty
+            print event_id, kind, reason, repo, npub
-            [[ -z "$repo" ]] && repo="$looked_up_repo"
+        }
-            [[ -z "$npub" ]] && npub="$looked_up_npub"
+    ' "$npub_map_file" "$lookup_file" "$parse_failures_file" >> "$enriched_file"
-            enriched_count=$((enriched_count + 1))
+    
-        fi
+    rm -f "$npub_map_file"
-        
+    
-        printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub" >> "$enriched_file"
+    # Count enriched entries
-    done < "$parse_failures_file"
+    local enriched_count total_count
+    total_count=$(grep -v '^#' "$parse_failures_file" | wc -l)
+    total_count="${total_count//[^0-9]/}"
+    # Count entries that have non-empty repo AND npub after enrichment
+    enriched_count=$(grep -v '^#' "$enriched_file" | awk -F'\t' '$4 != "" && $5 != ""' | wc -l)
+    enriched_count="${enriched_count//[^0-9]/}"
    
    # Replace original with enriched version
    mv "$enriched_file" "$parse_failures_file"
@@ -569,32 +657,29 @@ main() {
        echo "# Note: repo and npub may be empty for some entries"
    } > "$output_file"
    
-    # Parse [PARSE_FAIL] entries
+    # Parse [PARSE_FAIL] entries using batch awk processing
    log_info "  Parsing [PARSE_FAIL] entries..."
    local parse_fail_count=0
    if [[ "$parse_fail_line_count" -gt 0 ]]; then
-        while IFS= read -r line; do
+        parse_parse_fail_batch "$temp_parse_fail" >> "$output_file"
-            local parsed
+        parse_fail_count=$(grep -v '^#' "$output_file" | wc -l)
-            parsed=$(parse_parse_fail_line "$line")
+        parse_fail_count="${parse_fail_count//[^0-9]/}"
-            if [[ -n "$parsed" ]]; then
-                echo "$parsed" >> "$output_file"
-                parse_fail_count=$((parse_fail_count + 1))
-            fi
-        done < "$temp_parse_fail"
    fi
    
-    # Parse write policy rejection entries
+    # Parse write policy rejection entries using batch awk processing
    log_info "  Parsing write policy rejection entries..."
    local write_policy_count=0
    if [[ "$write_policy_line_count" -gt 0 ]]; then
-        while IFS= read -r line; do
+        local before_count
-            local parsed
+        before_count=$(grep -v '^#' "$output_file" 2>/dev/null | wc -l || echo "0")
-            parsed=$(parse_write_policy_rejection_line "$line")
+        before_count="${before_count//[^0-9]/}"
-            if [[ -n "$parsed" ]]; then
+        before_count="${before_count:-0}"
-                echo "$parsed" >> "$output_file"
+        parse_write_policy_rejection_batch "$temp_write_policy_rejection" >> "$output_file"
-                write_policy_count=$((write_policy_count + 1))
+        local after_count
-            fi
+        after_count=$(grep -v '^#' "$output_file" 2>/dev/null | wc -l || echo "0")
-        done < "$temp_write_policy_rejection"
+        after_count="${after_count//[^0-9]/}"
+        after_count="${after_count:-0}"
+        write_policy_count=$((after_count - before_count))
    fi
    
    local invalid_announcement_count=$write_policy_count
@@ -605,13 +690,7 @@ main() {
    
    log_info "  Building enrichment lookup table..."
    if [[ "$rejected_announcement_line_count" -gt 0 ]]; then
-        while IFS= read -r line; do
+        parse_rejected_announcement_batch "$temp_rejected_announcement" > "$enrichment_lookup_file"
-            local parsed
-            parsed=$(parse_rejected_announcement_line "$line")
-            if [[ -n "$parsed" ]]; then
-                echo "$parsed" >> "$enrichment_lookup_file"
-            fi
-        done < "$temp_rejected_announcement"
    fi
    
    rm -f "$temp_parse_fail" "$temp_write_policy_rejection" "$temp_rejected_announcement"
author	DanConwayDev <DanConwayDev@protonmail.com>	2026-01-27 12:46:05 +0000
committer	DanConwayDev <DanConwayDev@protonmail.com>	2026-01-27 20:38:22 +0000
commit	a7d0d574b9788f71e3add39699b3a409c0f2b492 (patch)
tree	d47ef845ab2cf5183780e8601881d8df22e24e53 /docs/how-to
parent	49b0df788255848173c01db394a2df29b7c08576 (diff)

diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
index d7f9706..d762aae 100755
--- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh
+++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
@@ -155,70 +155,107 @@ usage() {
155	exit 1	155	exit 1
156	}	156	}
157		157
158	# Parse a [PARSE_FAIL] log line and extract fields	158	# =============================================================================
159	# Input: log line containing [PARSE_FAIL]	159	# AWK-BASED BATCH PARSING FUNCTIONS
160	# Output: TSV line: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub	160	# =============================================================================
161	parse_parse_fail_line() {	161	# These functions use awk for efficient batch processing instead of per-line
162	local line="$1"	162	# grep calls. This provides ~400x speedup for large log files.
163		163	# NOTE: requires GNU awk (gawk): the three-argument match() used below is a gawk extension.
164	# Extract fields using grep -oP (Perl regex) or awk
165	# Fields: kind, event_id, reason, repo (optional), npub (optional)
166
167	local kind event_id reason repo npub
168
169	# Extract kind=VALUE
170	kind=$(echo "$line" \| grep -oP 'kind=\K[0-9]+' \|\| echo "")
171
172	# Extract event_id=VALUE (hex string, possibly truncated with ...)
173	event_id=$(echo "$line" \| grep -oP 'event_id=\K[a-f0-9]+' \|\| echo "")
174
175	# Extract reason="VALUE" (quoted string)
176	reason=$(echo "$line" \| grep -oP 'reason="\K[^"]*' \|\| echo "")
177
178	# Extract repo=VALUE (optional, unquoted identifier)
179	repo=$(echo "$line" \| grep -oP 'repo=\K[^ ]+' \|\| echo "")
180
181	# Extract npub=VALUE (optional, npub1... format)
182	npub=$(echo "$line" \| grep -oP 'npub=\K[^ ]+' \|\| echo "")
183
184	# Only output if we have the required fields
185	if [[ -n "$kind" && -n "$event_id" && -n "$reason" ]]; then
186	printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub"
187	fi
188	}
189
190	# Parse an "Invalid announcement" rejection log line from write policy
191	# Input: log line containing "Event rejected by write policy" with "Invalid announcement"
192	# Output: TSV line: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
193	# Note: repo and npub are empty for these entries (not available in log format)
194	parse_write_policy_rejection_line() {
195	local line="$1"
196
197	local kind event_id reason
198
199	# Extract event_id=VALUE (hex string)
200	event_id=$(echo "$line" \| grep -oP 'event_id=\K[a-f0-9]+' \|\| echo "")
201
202	# Extract kind=VALUE
203	kind=$(echo "$line" \| grep -oP 'kind=\K[0-9]+' \|\| echo "")
204
205	# Extract reason=VALUE (everything after "reason=")
206	# The reason is unquoted and goes to end of line
207	reason=$(echo "$line" \| grep -oP 'reason=\K.*$' \|\| echo "")
208
209	# Only output if we have the required fields
210	if [[ -n "$kind" && -n "$event_id" && -n "$reason" ]]; then
211	# repo and npub are empty for invalid announcement entries
212	printf '%s\t%s\t%s\t\t\n' "$event_id" "$kind" "$reason"
213	fi
214	}
215
216	# NOTE: parse_builder_rejection_line() was removed to fix double-counting bug.	164	# NOTE: parse_builder_rejection_line() was removed to fix double-counting bug.
217	# Builder logs use bech32 (note1) IDs while write policy logs use hex IDs.	165	# Builder logs use bech32 (note1) IDs while write policy logs use hex IDs.
218	# Since deduplication only works within each format, extracting both caused	166	# Since deduplication only works within each format, extracting both caused
219	# the same event to be counted twice. Write policy logs contain the same	167	# the same event to be counted twice. Write policy logs contain the same
220	# events, so we don't lose any data by only extracting from that source.	168	# events, so we don't lose any data by only extracting from that source.
221		169
		170	# Parse [PARSE_FAIL] log lines in batch using awk
		171	# Input: file containing log lines with [PARSE_FAIL]
		172	# Output: TSV lines: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
		173	parse_parse_fail_batch() {
		174	local input_file="$1"
		175	awk '
		176	{
		177	# Extract kind=VALUE
		178	kind = ""
		179	if (match($0, /kind=([0-9]+)/, m)) kind = m[1]
		180
		181	# Extract event_id=VALUE (hex string)
		182	event_id = ""
		183	if (match($0, /event_id=([a-f0-9]+)/, m)) event_id = m[1]
		184
		185	# Extract reason="VALUE" (quoted string)
		186	reason = ""
		187	if (match($0, /reason="([^"]*)"/, m)) reason = m[1]
		188
		189	# Extract repo=VALUE (optional)
		190	repo = ""
		191	if (match($0, /repo=([^ ]+)/, m)) repo = m[1]
		192
		193	# Extract npub=VALUE (optional)
		194	npub = ""
		195	if (match($0, /npub=([^ ]+)/, m)) npub = m[1]
		196
		197	# Output if we have required fields
		198	if (kind != "" && event_id != "" && reason != "") {
		199	print event_id "\t" kind "\t" reason "\t" repo "\t" npub
		200	}
		201	}
		202	' "$input_file"
		203	}
		204
		205	# Parse "Invalid announcement" rejection log lines in batch using awk
		206	# Input: file containing "Event rejected by write policy" log lines
		207	# Output: TSV lines: event_id<TAB>kind<TAB>reason<TAB><empty><TAB><empty>
		208	parse_write_policy_rejection_batch() {
		209	local input_file="$1"
		210	awk '
		211	{
		212	# Extract event_id=VALUE (hex string)
		213	event_id = ""
		214	if (match($0, /event_id=([a-f0-9]+)/, m)) event_id = m[1]
		215
		216	# Extract kind=VALUE
		217	kind = ""
		218	if (match($0, /kind=([0-9]+)/, m)) kind = m[1]
		219
		220	# Extract reason=VALUE (everything after "reason=")
		221	reason = ""
		222	if (match($0, /reason=(.*)$/, m)) reason = m[1]
		223
		224	# Output if we have required fields (repo and npub are empty)
		225	if (kind != "" && event_id != "" && reason != "") {
		226	print event_id "\t" kind "\t" reason "\t\t"
		227	}
		228	}
		229	' "$input_file"
		230	}
		231
		232	# Parse "Added rejected announcement" log lines in batch using awk
		233	# Input: file containing "Added rejected announcement to two-tier index" log lines
		234	# Output: TSV lines: event_id<TAB>identifier<TAB>pubkey_hex
		235	parse_rejected_announcement_batch() {
		236	local input_file="$1"
		237	awk '
		238	{
		239	# Extract event_id=VALUE (hex string)
		240	event_id = ""
		241	if (match($0, /event_id=([a-f0-9]+)/, m)) event_id = m[1]
		242
		243	# Extract identifier=VALUE (repo name)
		244	identifier = ""
		245	if (match($0, /identifier=([^ ]+)/, m)) identifier = m[1]
		246
		247	# Extract pubkey=VALUE (hex string)
		248	pubkey = ""
		249	if (match($0, /pubkey=([a-f0-9]+)/, m)) pubkey = m[1]
		250
		251	# Output if we have all required fields
		252	if (event_id != "" && identifier != "" && pubkey != "") {
		253	print event_id "\t" identifier "\t" pubkey
		254	}
		255	}
		256	' "$input_file"
		257	}
		258
222	# Enrich parse failures with repo/npub by looking up event_id in "Added rejected announcement" log entries	259	# Enrich parse failures with repo/npub by looking up event_id in "Added rejected announcement" log entries
223	# This is critical because "Invalid announcement" rejections only log event_id and kind,	260	# This is critical because "Invalid announcement" rejections only log event_id and kind,
224	# not the repo name or npub. Without enrichment, Phase 5 shows event_id\|kind instead	261	# not the repo name or npub. Without enrichment, Phase 5 shows event_id\|kind instead
@@ -233,6 +270,11 @@ parse_write_policy_rejection_line() {
233	# 2. For each parse failure with empty repo/npub, looks up the event_id	270	# 2. For each parse failure with empty repo/npub, looks up the event_id
234	# 3. Populates repo and npub columns from the lookup	271	# 3. Populates repo and npub columns from the lookup
235	# 4. Converts hex pubkeys to npub format using `nak encode npub` if available	272	# 4. Converts hex pubkeys to npub format using `nak encode npub` if available
		273	#
		274	# OPTIMIZATION: This function uses batch processing for efficiency:
		275	# - Uses awk for O(n) join instead of per-line grep (O(n*m))
		276	# - Batches all pubkey->npub conversions in a single nak call
		277	# - This reduces runtime from minutes to seconds for large datasets
236	enrich_with_repo_npub() {	278	enrich_with_repo_npub() {
237	local parse_failures_file="$1"	279	local parse_failures_file="$1"
238	local lookup_file="$2"	280	local lookup_file="$2"
@@ -259,52 +301,98 @@ enrich_with_repo_npub() {
259	lookup_count="${lookup_count//[^0-9]/}"	301	lookup_count="${lookup_count//[^0-9]/}"
260	log_info " Lookup table has $lookup_count entries"	302	log_info " Lookup table has $lookup_count entries"
261		303
262	# Enrich parse failures	304	# STEP 1: Extract unique pubkeys that need conversion
		305	# Get pubkeys from lookup file (column 3), deduplicate
		306	local unique_pubkeys_file npub_map_file
		307	unique_pubkeys_file=$(mktemp)
		308	npub_map_file=$(mktemp)
		309
		310	cut -f3 "$lookup_file" \| sort -u > "$unique_pubkeys_file"
		311	local unique_pubkey_count
		312	unique_pubkey_count=$(wc -l < "$unique_pubkeys_file")
		313	unique_pubkey_count="${unique_pubkey_count//[^0-9]/}"
		314	log_info " Converting $unique_pubkey_count unique pubkeys to npub format..."
		315
		316	# STEP 2: Batch convert all pubkeys to npub in a single nak call
		317	# nak reads hex pubkeys from stdin (one per line) and outputs npubs
		318	if [[ "$can_convert_npub" == true && "$unique_pubkey_count" -gt 0 ]]; then
		319	# Create mapping file: pubkey_hex<TAB>npub
		320	# nak encode npub reads from stdin and outputs one npub per line
		321	paste "$unique_pubkeys_file" <(nak encode npub < "$unique_pubkeys_file" 2>/dev/null) > "$npub_map_file" \|\| {
		322	# Fallback to hex pubkeys. NOTE: this only fires when paste itself fails; the exit status of nak inside <(...) is not propagated here.
		323	log_warn " Batch npub conversion failed, using hex pubkeys"
		324	awk '{print $1 "\t" $1}' "$unique_pubkeys_file" > "$npub_map_file"
		325	}
		326	else
		327	# No nak available, use hex pubkeys as-is
		328	awk '{print $1 "\t" $1}' "$unique_pubkeys_file" > "$npub_map_file"
		329	fi
		330
		331	rm -f "$unique_pubkeys_file"
		332
		333	# STEP 3: Use awk for efficient join (O(n) instead of O(n*m) grep per line)
		334	# This joins parse_failures with lookup_file on event_id, then with npub_map on pubkey
263	local enriched_file	335	local enriched_file
264	enriched_file=$(mktemp)	336	enriched_file=$(mktemp)
265		337
266	# Copy header lines	338	# Copy header lines
267	grep '^#' "$parse_failures_file" > "$enriched_file" 2>/dev/null \|\| true	339	grep '^#' "$parse_failures_file" > "$enriched_file" 2>/dev/null \|\| true
268		340
269	# Process data lines	341	# Use awk to perform the join efficiently
270	local enriched_count=0	342	# Input files (order matters: the FNR==1 counter below numbers them, so an empty file would shift the numbering):
271	local total_count=0	343	# 1. npub_map_file: pubkey_hex<TAB>npub
272	while IFS=$'\t' read -r event_id kind reason repo npub; do	344	# 2. lookup_file: event_id<TAB>identifier<TAB>pubkey_hex
273	# Skip header lines (already copied)	345	# 3. parse_failures_file: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
274	[[ "$event_id" =~ ^# ]] && continue	346	awk -F'\t' -v OFS='\t' '
275		347	# Track which file we are processing
276	total_count=$((total_count + 1))	348	FNR==1 { file_num++ }
277
278	# If repo and npub are already populated, keep them
279	if [[ -n "$repo" && -n "$npub" ]]; then
280	printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub" >> "$enriched_file"
281	continue
282	fi
283
284	# Look up event_id in our table (format: event_id<TAB>identifier<TAB>pubkey_hex)
285	local lookup_result
286	lookup_result=$(grep "^${event_id}"$'\t' "$lookup_file" 2>/dev/null \| head -1 \|\| echo "")
287		349
288	if [[ -n "$lookup_result" ]]; then	350	# First file: npub_map (pubkey_hex -> npub)
289	local looked_up_repo looked_up_pubkey_hex looked_up_npub	351	file_num==1 {
290	looked_up_repo=$(echo "$lookup_result" \| cut -f2)	352	npub_map[$1] = $2
291	looked_up_pubkey_hex=$(echo "$lookup_result" \| cut -f3)	353	next
		354	}
		355	# Second file: lookup (event_id -> identifier, pubkey_hex)
		356	file_num==2 {
		357	lookup_repo[$1] = $2
		358	lookup_pubkey[$1] = $3
		359	next
		360	}
		361	# Third file: parse_failures
		362	/^#/ { next } # Skip headers (already copied)
		363	{
		364	event_id = $1
		365	kind = $2
		366	reason = $3
		367	repo = $4
		368	npub = $5
292		369
293	# Convert hex pubkey to npub if nak is available	370	# If repo/npub empty, try to enrich from lookup
294	if [[ "$can_convert_npub" == true && -n "$looked_up_pubkey_hex" ]]; then	371	if (repo == "" && event_id in lookup_repo) {
295	looked_up_npub=$(nak encode npub "$looked_up_pubkey_hex" 2>/dev/null \|\| echo "$looked_up_pubkey_hex")	372	repo = lookup_repo[event_id]
296	else	373	}
297	looked_up_npub="$looked_up_pubkey_hex"	374	if (npub == "" && event_id in lookup_pubkey) {
298	fi	375	pubkey = lookup_pubkey[event_id]
		376	if (pubkey in npub_map) {
		377	npub = npub_map[pubkey]
		378	} else {
		379	npub = pubkey # Fallback to hex
		380	}
		381	}
299		382
300	# Use looked-up values if original was empty	383	print event_id, kind, reason, repo, npub
301	[[ -z "$repo" ]] && repo="$looked_up_repo"	384	}
302	[[ -z "$npub" ]] && npub="$looked_up_npub"	385	' "$npub_map_file" "$lookup_file" "$parse_failures_file" >> "$enriched_file"
303	enriched_count=$((enriched_count + 1))	386
304	fi	387	rm -f "$npub_map_file"
305		388
306	printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub" >> "$enriched_file"	389	# Count enriched entries
307	done < "$parse_failures_file"	390	local enriched_count total_count
		391	total_count=$(grep -v '^#' "$parse_failures_file" \| wc -l)
		392	total_count="${total_count//[^0-9]/}"
		393	# Count entries that have non-empty repo AND npub after enrichment
		394	enriched_count=$(grep -v '^#' "$enriched_file" \| awk -F'\t' '$4 != "" && $5 != ""' \| wc -l)
		395	enriched_count="${enriched_count//[^0-9]/}"
308		396
309	# Replace original with enriched version	397	# Replace original with enriched version
310	mv "$enriched_file" "$parse_failures_file"	398	mv "$enriched_file" "$parse_failures_file"
@@ -569,32 +657,29 @@ main() {
569	echo "# Note: repo and npub may be empty for some entries"	657	echo "# Note: repo and npub may be empty for some entries"
570	} > "$output_file"	658	} > "$output_file"
571		659
572	# Parse [PARSE_FAIL] entries	660	# Parse [PARSE_FAIL] entries using batch awk processing
573	log_info " Parsing [PARSE_FAIL] entries..."	661	log_info " Parsing [PARSE_FAIL] entries..."
574	local parse_fail_count=0	662	local parse_fail_count=0
575	if [[ "$parse_fail_line_count" -gt 0 ]]; then	663	if [[ "$parse_fail_line_count" -gt 0 ]]; then
576	while IFS= read -r line; do	664	parse_parse_fail_batch "$temp_parse_fail" >> "$output_file"
577	local parsed	665	parse_fail_count=$(grep -v '^#' "$output_file" \| wc -l)
578	parsed=$(parse_parse_fail_line "$line")	666	parse_fail_count="${parse_fail_count//[^0-9]/}"
579	if [[ -n "$parsed" ]]; then
580	echo "$parsed" >> "$output_file"
581	parse_fail_count=$((parse_fail_count + 1))
582	fi
583	done < "$temp_parse_fail"
584	fi	667	fi
585		668
586	# Parse write policy rejection entries	669	# Parse write policy rejection entries using batch awk processing
587	log_info " Parsing write policy rejection entries..."	670	log_info " Parsing write policy rejection entries..."
588	local write_policy_count=0	671	local write_policy_count=0
589	if [[ "$write_policy_line_count" -gt 0 ]]; then	672	if [[ "$write_policy_line_count" -gt 0 ]]; then
590	while IFS= read -r line; do	673	local before_count
591	local parsed	674	before_count=$(grep -v '^#' "$output_file" 2>/dev/null \| wc -l \|\| echo "0")
592	parsed=$(parse_write_policy_rejection_line "$line")	675	before_count="${before_count//[^0-9]/}"
593	if [[ -n "$parsed" ]]; then	676	before_count="${before_count:-0}"
594	echo "$parsed" >> "$output_file"	677	parse_write_policy_rejection_batch "$temp_write_policy_rejection" >> "$output_file"
595	write_policy_count=$((write_policy_count + 1))	678	local after_count
596	fi	679	after_count=$(grep -v '^#' "$output_file" 2>/dev/null \| wc -l \|\| echo "0")
597	done < "$temp_write_policy_rejection"	680	after_count="${after_count//[^0-9]/}"
		681	after_count="${after_count:-0}"
		682	write_policy_count=$((after_count - before_count))
598	fi	683	fi
599		684
600	local invalid_announcement_count=$write_policy_count	685	local invalid_announcement_count=$write_policy_count
@@ -605,13 +690,7 @@ main() {
605		690
606	log_info " Building enrichment lookup table..."	691	log_info " Building enrichment lookup table..."
607	if [[ "$rejected_announcement_line_count" -gt 0 ]]; then	692	if [[ "$rejected_announcement_line_count" -gt 0 ]]; then
608	while IFS= read -r line; do	693	parse_rejected_announcement_batch "$temp_rejected_announcement" > "$enrichment_lookup_file"
609	local parsed
610	parsed=$(parse_rejected_announcement_line "$line")
611	if [[ -n "$parsed" ]]; then
612	echo "$parsed" >> "$enrichment_lookup_file"
613	fi
614	done < "$temp_rejected_announcement"
615	fi	694	fi
616		695
617	rm -f "$temp_parse_fail" "$temp_write_policy_rejection" "$temp_rejected_announcement"	696	rm -f "$temp_parse_fail" "$temp_write_policy_rejection" "$temp_rejected_announcement"