From a7d0d574b9788f71e3add39699b3a409c0f2b492 Mon Sep 17 00:00:00 2001
From: DanConwayDev <DanConwayDev@protonmail.com>
Date: Tue, 27 Jan 2026 12:46:05 +0000
Subject: fix migration script for invalid announcement detection

---
 .../migration-scripts/30-extract-parse-failures.sh | 319 +++++++++++++--------
 1 file changed, 199 insertions(+), 120 deletions(-)

(limited to 'docs/how-to/migration-scripts/30-extract-parse-failures.sh')
diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
index d7f9706..d762aae 100755
--- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh
+++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
@@ -155,70 +155,107 @@ usage() {
     exit 1
 }
 
-# Parse a [PARSE_FAIL] log line and extract fields
-# Input: log line containing [PARSE_FAIL]
-# Output: TSV line: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
-parse_parse_fail_line() {
-    local line="$1"
-    
-    # Extract fields using grep -oP (Perl regex) or awk
-    # Fields: kind, event_id, reason, repo (optional), npub (optional)
-    
-    local kind event_id reason repo npub
-    
-    # Extract kind=VALUE
-    kind=$(echo "$line" | grep -oP 'kind=\K[0-9]+' || echo "")
-    
-    # Extract event_id=VALUE (hex string, possibly truncated with ...)
-    event_id=$(echo "$line" | grep -oP 'event_id=\K[a-f0-9]+' || echo "")
-    
-    # Extract reason="VALUE" (quoted string)
-    reason=$(echo "$line" | grep -oP 'reason="\K[^"]*' || echo "")
-    
-    # Extract repo=VALUE (optional, unquoted identifier)
-    repo=$(echo "$line" | grep -oP 'repo=\K[^ ]+' || echo "")
-    
-    # Extract npub=VALUE (optional, npub1... format)
-    npub=$(echo "$line" | grep -oP 'npub=\K[^ ]+' || echo "")
-    
-    # Only output if we have the required fields
-    if [[ -n "$kind" && -n "$event_id" && -n "$reason" ]]; then
-        printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub"
-    fi
-}
-
-# Parse an "Invalid announcement" rejection log line from write policy
-# Input: log line containing "Event rejected by write policy" with "Invalid announcement"
-# Output: TSV line: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
-# Note: repo and npub are empty for these entries (not available in log format)
-parse_write_policy_rejection_line() {
-    local line="$1"
-    
-    local kind event_id reason
-    
-    # Extract event_id=VALUE (hex string)
-    event_id=$(echo "$line" | grep -oP 'event_id=\K[a-f0-9]+' || echo "")
-    
-    # Extract kind=VALUE
-    kind=$(echo "$line" | grep -oP 'kind=\K[0-9]+' || echo "")
-    
-    # Extract reason=VALUE (everything after "reason=")
-    # The reason is unquoted and goes to end of line
-    reason=$(echo "$line" | grep -oP 'reason=\K.*$' || echo "")
-    
-    # Only output if we have the required fields
-    if [[ -n "$kind" && -n "$event_id" && -n "$reason" ]]; then
-        # repo and npub are empty for invalid announcement entries
-        printf '%s\t%s\t%s\t\t\n' "$event_id" "$kind" "$reason"
-    fi
-}
-
+# =============================================================================
+# AWK-BASED BATCH PARSING FUNCTIONS
+# =============================================================================
+# These functions use awk for efficient batch processing instead of per-line
+# grep calls. This provides ~400x speedup for large log files.
+#
 # NOTE: parse_builder_rejection_line() was removed to fix double-counting bug.
 # Builder logs use bech32 (note1) IDs while write policy logs use hex IDs.
 # Since deduplication only works within each format, extracting both caused
 # the same event to be counted twice. Write policy logs contain the same
 # events, so we don't lose any data by only extracting from that source.
 
+# Parse [PARSE_FAIL] log lines in batch using awk
+# Input: file containing log lines with [PARSE_FAIL]
+# Output: TSV lines: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
+parse_parse_fail_batch() {
+    local input_file="$1"
+    awk '
+    {
+        # Extract kind=VALUE
+        kind = ""
+        if (match($0, /kind=([0-9]+)/, m)) kind = m[1]
+        
+        # Extract event_id=VALUE (hex string)
+        event_id = ""
+        if (match($0, /event_id=([a-f0-9]+)/, m)) event_id = m[1]
+        
+        # Extract reason="VALUE" (quoted string)
+        reason = ""
+        if (match($0, /reason="([^"]*)"/, m)) reason = m[1]
+        
+        # Extract repo=VALUE (optional)
+        repo = ""
+        if (match($0, /repo=([^ ]+)/, m)) repo = m[1]
+        
+        # Extract npub=VALUE (optional)
+        npub = ""
+        if (match($0, /npub=([^ ]+)/, m)) npub = m[1]
+        
+        # Output if we have required fields
+        if (kind != "" && event_id != "" && reason != "") {
+            print event_id "\t" kind "\t" reason "\t" repo "\t" npub
+        }
+    }
+    ' "$input_file"
+}
+
+# Parse "Invalid announcement" rejection log lines in batch using awk
+# Input: file containing "Event rejected by write policy" log lines
+# Output: TSV lines: event_id<TAB>kind<TAB>reason<TAB><empty><TAB><empty>
+parse_write_policy_rejection_batch() {
+    local input_file="$1"
+    awk '
+    {
+        # Extract event_id=VALUE (hex string)
+        event_id = ""
+        if (match($0, /event_id=([a-f0-9]+)/, m)) event_id = m[1]
+        
+        # Extract kind=VALUE
+        kind = ""
+        if (match($0, /kind=([0-9]+)/, m)) kind = m[1]
+        
+        # Extract reason=VALUE (everything after "reason=")
+        reason = ""
+        if (match($0, /reason=(.*)$/, m)) reason = m[1]
+        
+        # Output if we have required fields (repo and npub are empty)
+        if (kind != "" && event_id != "" && reason != "") {
+            print event_id "\t" kind "\t" reason "\t\t"
+        }
+    }
+    ' "$input_file"
+}
+
+# Parse "Added rejected announcement" log lines in batch using awk
+# Input: file containing "Added rejected announcement to two-tier index" log lines
+# Output: TSV lines: event_id<TAB>identifier<TAB>pubkey_hex
+parse_rejected_announcement_batch() {
+    local input_file="$1"
+    awk '
+    {
+        # Extract event_id=VALUE (hex string)
+        event_id = ""
+        if (match($0, /event_id=([a-f0-9]+)/, m)) event_id = m[1]
+        
+        # Extract identifier=VALUE (repo name)
+        identifier = ""
+        if (match($0, /identifier=([^ ]+)/, m)) identifier = m[1]
+        
+        # Extract pubkey=VALUE (hex string)
+        pubkey = ""
+        if (match($0, /pubkey=([a-f0-9]+)/, m)) pubkey = m[1]
+        
+        # Output if we have all required fields
+        if (event_id != "" && identifier != "" && pubkey != "") {
+            print event_id "\t" identifier "\t" pubkey
+        }
+    }
+    ' "$input_file"
+}
+
 # Enrich parse failures with repo/npub by looking up event_id in "Added rejected announcement" log entries
 # This is critical because "Invalid announcement" rejections only log event_id and kind,
 # not the repo name or npub. Without enrichment, Phase 5 shows event_id|kind instead
@@ -233,6 +270,11 @@ parse_write_policy_rejection_line() {
 #   2. For each parse failure with empty repo/npub, looks up the event_id
 #   3. Populates repo and npub columns from the lookup
 #   4. Converts hex pubkeys to npub format using `nak encode npub` if available
+#
+# OPTIMIZATION: This function uses batch processing for efficiency:
+#   - Uses awk for O(n) join instead of per-line grep (O(n*m))
+#   - Batches all pubkey->npub conversions in a single nak call
+#   - This reduces runtime from minutes to seconds for large datasets
 enrich_with_repo_npub() {
     local parse_failures_file="$1"
     local lookup_file="$2"
@@ -259,52 +301,98 @@ enrich_with_repo_npub() {
     lookup_count="${lookup_count//[^0-9]/}"
     log_info "  Lookup table has $lookup_count entries"
     
-    # Enrich parse failures
+    # STEP 1: Extract unique pubkeys that need conversion
+    # Get pubkeys from lookup file (column 3), deduplicate
+    local unique_pubkeys_file npub_map_file
+    unique_pubkeys_file=$(mktemp)
+    npub_map_file=$(mktemp)
+    
+    cut -f3 "$lookup_file" | sort -u > "$unique_pubkeys_file"
+    local unique_pubkey_count
+    unique_pubkey_count=$(wc -l < "$unique_pubkeys_file")
+    unique_pubkey_count="${unique_pubkey_count//[^0-9]/}"
+    log_info "  Converting $unique_pubkey_count unique pubkeys to npub format..."
+    
+    # STEP 2: Batch convert all pubkeys to npub in a single nak call
+    # nak reads hex pubkeys from stdin (one per line) and outputs npubs
+    if [[ "$can_convert_npub" == true && "$unique_pubkey_count" -gt 0 ]]; then
+        # Create mapping file: pubkey_hex<TAB>npub
+        # nak encode npub reads from stdin and outputs one npub per line
+        paste "$unique_pubkeys_file" <(nak encode npub < "$unique_pubkeys_file" 2>/dev/null) > "$npub_map_file" || {
+            # Fallback: if batch conversion fails, use hex pubkeys
+            log_warn "  Batch npub conversion failed, using hex pubkeys"
+            awk '{print $1 "\t" $1}' "$unique_pubkeys_file" > "$npub_map_file"
+        }
+    else
+        # No nak available, use hex pubkeys as-is
+        awk '{print $1 "\t" $1}' "$unique_pubkeys_file" > "$npub_map_file"
+    fi
+    
+    rm -f "$unique_pubkeys_file"
+    
+    # STEP 3: Use awk for efficient join (O(n) instead of O(n*m) grep per line)
+    # This joins parse_failures with lookup_file on event_id, then with npub_map on pubkey
     local enriched_file
     enriched_file=$(mktemp)
     
     # Copy header lines
     grep '^#' "$parse_failures_file" > "$enriched_file" 2>/dev/null || true
     
-    # Process data lines
-    local enriched_count=0
-    local total_count=0
-    while IFS=$'\t' read -r event_id kind reason repo npub; do
-        # Skip header lines (already copied)
-        [[ "$event_id" =~ ^# ]] && continue
-        
-        total_count=$((total_count + 1))
-        
-        # If repo and npub are already populated, keep them
-        if [[ -n "$repo" && -n "$npub" ]]; then
-            printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub" >> "$enriched_file"
-            continue
-        fi
-        
-        # Look up event_id in our table (format: event_id<TAB>identifier<TAB>pubkey_hex)
-        local lookup_result
-        lookup_result=$(grep "^${event_id}"$'\t' "$lookup_file" 2>/dev/null | head -1 || echo "")
+    # Use awk to perform the join efficiently
+    # Input files (order matters for ARGIND):
+    #   1. npub_map_file: pubkey_hex<TAB>npub
+    #   2. lookup_file: event_id<TAB>identifier<TAB>pubkey_hex
+    #   3. parse_failures_file: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
+    awk -F'\t' -v OFS='\t' '
+        # Track which file we are processing
+        FNR==1 { file_num++ }
         
-        if [[ -n "$lookup_result" ]]; then
-            local looked_up_repo looked_up_pubkey_hex looked_up_npub
-            looked_up_repo=$(echo "$lookup_result" | cut -f2)
-            looked_up_pubkey_hex=$(echo "$lookup_result" | cut -f3)
+        # First file: npub_map (pubkey_hex -> npub)
+        file_num==1 {
+            npub_map[$1] = $2
+            next
+        }
+        # Second file: lookup (event_id -> identifier, pubkey_hex)
+        file_num==2 {
+            lookup_repo[$1] = $2
+            lookup_pubkey[$1] = $3
+            next
+        }
+        # Third file: parse_failures
+        /^#/ { next }  # Skip headers (already copied)
+        {
+            event_id = $1
+            kind = $2
+            reason = $3
+            repo = $4
+            npub = $5
             
-            # Convert hex pubkey to npub if nak is available
-            if [[ "$can_convert_npub" == true && -n "$looked_up_pubkey_hex" ]]; then
-                looked_up_npub=$(nak encode npub "$looked_up_pubkey_hex" 2>/dev/null || echo "$looked_up_pubkey_hex")
-            else
-                looked_up_npub="$looked_up_pubkey_hex"
-            fi
+            # If repo/npub empty, try to enrich from lookup
+            if (repo == "" && event_id in lookup_repo) {
+                repo = lookup_repo[event_id]
+            }
+            if (npub == "" && event_id in lookup_pubkey) {
+                pubkey = lookup_pubkey[event_id]
+                if (pubkey in npub_map) {
+                    npub = npub_map[pubkey]
+                } else {
+                    npub = pubkey  # Fallback to hex
+                }
+            }
             
-            # Use looked-up values if original was empty
-            [[ -z "$repo" ]] && repo="$looked_up_repo"
-            [[ -z "$npub" ]] && npub="$looked_up_npub"
-            enriched_count=$((enriched_count + 1))
-        fi
-        
-        printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub" >> "$enriched_file"
-    done < "$parse_failures_file"
+            print event_id, kind, reason, repo, npub
+        }
+    ' "$npub_map_file" "$lookup_file" "$parse_failures_file" >> "$enriched_file"
+    
+    rm -f "$npub_map_file"
+    
+    # Count enriched entries
+    local enriched_count total_count
+    total_count=$(grep -v '^#' "$parse_failures_file" | wc -l)
+    total_count="${total_count//[^0-9]/}"
+    # Count entries that have non-empty repo AND npub after enrichment
+    enriched_count=$(grep -v '^#' "$enriched_file" | awk -F'\t' '$4 != "" && $5 != ""' | wc -l)
+    enriched_count="${enriched_count//[^0-9]/}"
     
     # Replace original with enriched version
     mv "$enriched_file" "$parse_failures_file"
@@ -569,32 +657,29 @@ main() {
         echo "# Note: repo and npub may be empty for some entries"
     } > "$output_file"
     
-    # Parse [PARSE_FAIL] entries
+    # Parse [PARSE_FAIL] entries using batch awk processing
     log_info "  Parsing [PARSE_FAIL] entries..."
     local parse_fail_count=0
     if [[ "$parse_fail_line_count" -gt 0 ]]; then
-        while IFS= read -r line; do
-            local parsed
-            parsed=$(parse_parse_fail_line "$line")
-            if [[ -n "$parsed" ]]; then
-                echo "$parsed" >> "$output_file"
-                parse_fail_count=$((parse_fail_count + 1))
-            fi
-        done < "$temp_parse_fail"
+        parse_parse_fail_batch "$temp_parse_fail" >> "$output_file"
+        parse_fail_count=$(grep -v '^#' "$output_file" | wc -l)
+        parse_fail_count="${parse_fail_count//[^0-9]/}"
     fi
     
-    # Parse write policy rejection entries
+    # Parse write policy rejection entries using batch awk processing
     log_info "  Parsing write policy rejection entries..."
     local write_policy_count=0
     if [[ "$write_policy_line_count" -gt 0 ]]; then
-        while IFS= read -r line; do
-            local parsed
-            parsed=$(parse_write_policy_rejection_line "$line")
-            if [[ -n "$parsed" ]]; then
-                echo "$parsed" >> "$output_file"
-                write_policy_count=$((write_policy_count + 1))
-            fi
-        done < "$temp_write_policy_rejection"
+        local before_count
+        before_count=$(grep -v '^#' "$output_file" 2>/dev/null | wc -l || echo "0")
+        before_count="${before_count//[^0-9]/}"
+        before_count="${before_count:-0}"
+        parse_write_policy_rejection_batch "$temp_write_policy_rejection" >> "$output_file"
+        local after_count
+        after_count=$(grep -v '^#' "$output_file" 2>/dev/null | wc -l || echo "0")
+        after_count="${after_count//[^0-9]/}"
+        after_count="${after_count:-0}"
+        write_policy_count=$((after_count - before_count))
     fi
     
     local invalid_announcement_count=$write_policy_count
@@ -605,13 +690,7 @@ main() {
     
     log_info "  Building enrichment lookup table..."
     if [[ "$rejected_announcement_line_count" -gt 0 ]]; then
-        while IFS= read -r line; do
-            local parsed
-            parsed=$(parse_rejected_announcement_line "$line")
-            if [[ -n "$parsed" ]]; then
-                echo "$parsed" >> "$enrichment_lookup_file"
-            fi
-        done < "$temp_rejected_announcement"
+        parse_rejected_announcement_batch "$temp_rejected_announcement" > "$enrichment_lookup_file"
     fi
     
     rm -f "$temp_parse_fail" "$temp_write_policy_rejection" "$temp_rejected_announcement"
-- 
cgit v1.2.3