upleb.uk

Public git repos — served from a NIP-34 GRASP relay at git.upleb.uk

summary | refs | log | tree | commit | diff
path: root/docs/how-to
diff options
context:
space:
mode:
author: DanConwayDev <DanConwayDev@protonmail.com> 2026-01-27 12:46:05 +0000
committer: DanConwayDev <DanConwayDev@protonmail.com> 2026-01-27 20:38:22 +0000
commit: a7d0d574b9788f71e3add39699b3a409c0f2b492 (patch)
tree: d47ef845ab2cf5183780e8601881d8df22e24e53 /docs/how-to
parent: 49b0df788255848173c01db394a2df29b7c08576 (diff)
fix migration script for invalid announcement detection
Diffstat (limited to 'docs/how-to')
-rwxr-xr-x docs/how-to/migration-scripts/30-extract-parse-failures.sh 319
1 files changed, 199 insertions, 120 deletions
diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
index d7f9706..d762aae 100755
--- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh
+++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
@@ -155,70 +155,107 @@ usage() {
155 exit 1 155 exit 1
156} 156}
157 157
158# Parse a [PARSE_FAIL] log line and extract fields 158# =============================================================================
159# Input: log line containing [PARSE_FAIL] 159# AWK-BASED BATCH PARSING FUNCTIONS
160# Output: TSV line: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub 160# =============================================================================
161parse_parse_fail_line() { 161# These functions use awk for efficient batch processing instead of per-line
162 local line="$1" 162# grep calls. This provides ~400x speedup for large log files.
163 163#
164 # Extract fields using grep -oP (Perl regex) or awk
165 # Fields: kind, event_id, reason, repo (optional), npub (optional)
166
167 local kind event_id reason repo npub
168
169 # Extract kind=VALUE
170 kind=$(echo "$line" | grep -oP 'kind=\K[0-9]+' || echo "")
171
172 # Extract event_id=VALUE (hex string, possibly truncated with ...)
173 event_id=$(echo "$line" | grep -oP 'event_id=\K[a-f0-9]+' || echo "")
174
175 # Extract reason="VALUE" (quoted string)
176 reason=$(echo "$line" | grep -oP 'reason="\K[^"]*' || echo "")
177
178 # Extract repo=VALUE (optional, unquoted identifier)
179 repo=$(echo "$line" | grep -oP 'repo=\K[^ ]+' || echo "")
180
181 # Extract npub=VALUE (optional, npub1... format)
182 npub=$(echo "$line" | grep -oP 'npub=\K[^ ]+' || echo "")
183
184 # Only output if we have the required fields
185 if [[ -n "$kind" && -n "$event_id" && -n "$reason" ]]; then
186 printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub"
187 fi
188}
189
# Parse an "Invalid announcement" rejection log line from write policy
# Input:  $1 - log line containing "Event rejected by write policy"
# Output: TSV line on stdout: event_id<TAB>kind<TAB>reason<TAB><empty><TAB><empty>
#         Nothing is printed when event_id, kind or reason is missing.
# Note: repo and npub are always empty for these entries (not available in
#       this log format).
#
# Each field is taken from its FIRST occurrence on the line using the bash
# regex builtin. This keeps every field on a single line even when the
# free-text reason happens to contain another "kind=" or "event_id=" token,
# and avoids depending on GNU grep PCRE support or forking per field.
parse_write_policy_rejection_line() {
    local line="$1"
    local event_id="" kind="" reason=""

    # Patterns are kept in variables so the shell does not treat any part of
    # them as quoted (i.e. literal) text on the right-hand side of =~.
    local re_event_id='event_id=([a-f0-9]+)'
    local re_kind='kind=([0-9]+)'
    local re_reason='reason=(.*)$'

    if [[ "$line" =~ $re_event_id ]]; then
        event_id="${BASH_REMATCH[1]}"
    fi
    if [[ "$line" =~ $re_kind ]]; then
        kind="${BASH_REMATCH[1]}"
    fi
    # The reason is unquoted and runs to the end of the line
    if [[ "$line" =~ $re_reason ]]; then
        reason="${BASH_REMATCH[1]}"
    fi

    # Only output if we have the required fields
    if [[ -n "$kind" && -n "$event_id" && -n "$reason" ]]; then
        printf '%s\t%s\t%s\t\t\n' "$event_id" "$kind" "$reason"
    fi
}
215
216# NOTE: parse_builder_rejection_line() was removed to fix double-counting bug. 164# NOTE: parse_builder_rejection_line() was removed to fix double-counting bug.
217# Builder logs use bech32 (note1) IDs while write policy logs use hex IDs. 165# Builder logs use bech32 (note1) IDs while write policy logs use hex IDs.
218# Since deduplication only works within each format, extracting both caused 166# Since deduplication only works within each format, extracting both caused
219# the same event to be counted twice. Write policy logs contain the same 167# the same event to be counted twice. Write policy logs contain the same
220# events, so we don't lose any data by only extracting from that source. 168# events, so we don't lose any data by only extracting from that source.
221 169
# Parse [PARSE_FAIL] log lines in batch using awk
# Input:  $1 - file containing log lines with [PARSE_FAIL]
# Output: TSV lines on stdout: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
#   - kind, event_id and reason are required; lines missing any are dropped
#   - repo and npub are optional and emitted as empty columns when absent
# Portability: only POSIX awk features are used (two-argument match() with
# RSTART/RLENGTH) instead of the gawk-only three-argument match(), so this
# works under gawk, mawk, BusyBox awk and BSD awk.
parse_parse_fail_batch() {
    local input_file="$1"
    awk '
    # Return the text matched by value_re directly after key, taken from the
    # leftmost place where key followed by value_re occurs; "" if none.
    # key must not contain regex metacharacters.
    function value_of(line, key, value_re,    klen) {
        if (!match(line, key value_re)) return ""
        klen = length(key)
        return substr(line, RSTART + klen, RLENGTH - klen)
    }
    {
        kind     = value_of($0, "kind=", "[0-9]+")
        event_id = value_of($0, "event_id=", "[a-f0-9]+")

        # reason is a double-quoted string. The closing quote is part of the
        # match (an unterminated quote yields no reason) and is stripped here.
        reason = value_of($0, "reason=\"", "[^\"]*\"")
        reason = substr(reason, 1, length(reason) - 1)

        # Optional fields
        repo = value_of($0, "repo=", "[^ ]+")
        npub = value_of($0, "npub=", "[^ ]+")

        # Output if we have required fields
        if (kind != "" && event_id != "" && reason != "") {
            print event_id "\t" kind "\t" reason "\t" repo "\t" npub
        }
    }
    ' "$input_file"
}
204
# Parse "Invalid announcement" rejection log lines in batch using awk
# Input:  $1 - file containing "Event rejected by write policy" log lines
# Output: TSV lines on stdout: event_id<TAB>kind<TAB>reason<TAB><empty><TAB><empty>
#   - lines missing event_id, kind or reason are dropped
#   - reason is unquoted and runs from the leftmost "reason=" to end of line
# Portability: only POSIX awk features are used (two-argument match() with
# RSTART/RLENGTH) instead of the gawk-only three-argument match(), so this
# works under gawk, mawk, BusyBox awk and BSD awk.
parse_write_policy_rejection_batch() {
    local input_file="$1"
    awk '
    # Return the text matched by value_re directly after key, taken from the
    # leftmost place where key followed by value_re occurs; "" if none.
    # key must not contain regex metacharacters.
    function value_of(line, key, value_re,    klen) {
        if (!match(line, key value_re)) return ""
        klen = length(key)
        return substr(line, RSTART + klen, RLENGTH - klen)
    }
    {
        event_id = value_of($0, "event_id=", "[a-f0-9]+")
        kind     = value_of($0, "kind=", "[0-9]+")

        # Everything after "reason=" up to end of line (may contain spaces)
        reason = value_of($0, "reason=", ".*$")

        # Output if we have required fields (repo and npub are empty)
        if (kind != "" && event_id != "" && reason != "") {
            print event_id "\t" kind "\t" reason "\t\t"
        }
    }
    ' "$input_file"
}
231
# Parse "Added rejected announcement" log lines in batch using awk
# Input:  $1 - file containing "Added rejected announcement to two-tier index" log lines
# Output: TSV lines on stdout: event_id<TAB>identifier<TAB>pubkey_hex
#   - all three fields are required; lines missing any are dropped
# Portability: only POSIX awk features are used (two-argument match() with
# RSTART/RLENGTH) instead of the gawk-only three-argument match(), so this
# works under gawk, mawk, BusyBox awk and BSD awk.
parse_rejected_announcement_batch() {
    local input_file="$1"
    awk '
    # Return the text matched by value_re directly after key, taken from the
    # leftmost place where key followed by value_re occurs; "" if none.
    # key must not contain regex metacharacters.
    function value_of(line, key, value_re,    klen) {
        if (!match(line, key value_re)) return ""
        klen = length(key)
        return substr(line, RSTART + klen, RLENGTH - klen)
    }
    {
        event_id   = value_of($0, "event_id=", "[a-f0-9]+")
        identifier = value_of($0, "identifier=", "[^ ]+")
        pubkey     = value_of($0, "pubkey=", "[a-f0-9]+")

        # Output if we have all required fields
        if (event_id != "" && identifier != "" && pubkey != "") {
            print event_id "\t" identifier "\t" pubkey
        }
    }
    ' "$input_file"
}
258
222# Enrich parse failures with repo/npub by looking up event_id in "Added rejected announcement" log entries 259# Enrich parse failures with repo/npub by looking up event_id in "Added rejected announcement" log entries
223# This is critical because "Invalid announcement" rejections only log event_id and kind, 260# This is critical because "Invalid announcement" rejections only log event_id and kind,
224# not the repo name or npub. Without enrichment, Phase 5 shows event_id|kind instead 261# not the repo name or npub. Without enrichment, Phase 5 shows event_id|kind instead
@@ -233,6 +270,11 @@ parse_write_policy_rejection_line() {
233# 2. For each parse failure with empty repo/npub, looks up the event_id 270# 2. For each parse failure with empty repo/npub, looks up the event_id
234# 3. Populates repo and npub columns from the lookup 271# 3. Populates repo and npub columns from the lookup
235# 4. Converts hex pubkeys to npub format using `nak encode npub` if available 272# 4. Converts hex pubkeys to npub format using `nak encode npub` if available
273#
274# OPTIMIZATION: This function uses batch processing for efficiency:
275# - Uses awk for O(n) join instead of per-line grep (O(n*m))
276# - Batches all pubkey->npub conversions in a single nak call
277# - This reduces runtime from minutes to seconds for large datasets
236enrich_with_repo_npub() { 278enrich_with_repo_npub() {
237 local parse_failures_file="$1" 279 local parse_failures_file="$1"
238 local lookup_file="$2" 280 local lookup_file="$2"
@@ -259,52 +301,98 @@ enrich_with_repo_npub() {
259 lookup_count="${lookup_count//[^0-9]/}" 301 lookup_count="${lookup_count//[^0-9]/}"
260 log_info " Lookup table has $lookup_count entries" 302 log_info " Lookup table has $lookup_count entries"
261 303
262 # Enrich parse failures 304 # STEP 1: Extract unique pubkeys that need conversion
305 # Get pubkeys from lookup file (column 3), deduplicate
306 local unique_pubkeys_file npub_map_file
307 unique_pubkeys_file=$(mktemp)
308 npub_map_file=$(mktemp)
309
310 cut -f3 "$lookup_file" | sort -u > "$unique_pubkeys_file"
311 local unique_pubkey_count
312 unique_pubkey_count=$(wc -l < "$unique_pubkeys_file")
313 unique_pubkey_count="${unique_pubkey_count//[^0-9]/}"
314 log_info " Converting $unique_pubkey_count unique pubkeys to npub format..."
315
316 # STEP 2: Batch convert all pubkeys to npub in a single nak call
317 # nak reads hex pubkeys from stdin (one per line) and outputs npubs
318 if [[ "$can_convert_npub" == true && "$unique_pubkey_count" -gt 0 ]]; then
319 # Create mapping file: pubkey_hex<TAB>npub
320 # nak encode npub reads from stdin and outputs one npub per line
321 paste "$unique_pubkeys_file" <(nak encode npub < "$unique_pubkeys_file" 2>/dev/null) > "$npub_map_file" || {
322 # Fallback: if batch conversion fails, use hex pubkeys
323 log_warn " Batch npub conversion failed, using hex pubkeys"
324 awk '{print $1 "\t" $1}' "$unique_pubkeys_file" > "$npub_map_file"
325 }
326 else
327 # No nak available, use hex pubkeys as-is
328 awk '{print $1 "\t" $1}' "$unique_pubkeys_file" > "$npub_map_file"
329 fi
330
331 rm -f "$unique_pubkeys_file"
332
333 # STEP 3: Use awk for efficient join (O(n) instead of O(n*m) grep per line)
334 # This joins parse_failures with lookup_file on event_id, then with npub_map on pubkey
263 local enriched_file 335 local enriched_file
264 enriched_file=$(mktemp) 336 enriched_file=$(mktemp)
265 337
266 # Copy header lines 338 # Copy header lines
267 grep '^#' "$parse_failures_file" > "$enriched_file" 2>/dev/null || true 339 grep '^#' "$parse_failures_file" > "$enriched_file" 2>/dev/null || true
268 340
269 # Process data lines 341 # Use awk to perform the join efficiently
270 local enriched_count=0 342 # Input files (order matters for ARGIND):
271 local total_count=0 343 # 1. npub_map_file: pubkey_hex<TAB>npub
272 while IFS=$'\t' read -r event_id kind reason repo npub; do 344 # 2. lookup_file: event_id<TAB>identifier<TAB>pubkey_hex
273 # Skip header lines (already copied) 345 # 3. parse_failures_file: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub
274 [[ "$event_id" =~ ^# ]] && continue 346 awk -F'\t' -v OFS='\t' '
275 347 # Track which file we are processing
276 total_count=$((total_count + 1)) 348 FNR==1 { file_num++ }
277
278 # If repo and npub are already populated, keep them
279 if [[ -n "$repo" && -n "$npub" ]]; then
280 printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub" >> "$enriched_file"
281 continue
282 fi
283
284 # Look up event_id in our table (format: event_id<TAB>identifier<TAB>pubkey_hex)
285 local lookup_result
286 lookup_result=$(grep "^${event_id}"$'\t' "$lookup_file" 2>/dev/null | head -1 || echo "")
287 349
288 if [[ -n "$lookup_result" ]]; then 350 # First file: npub_map (pubkey_hex -> npub)
289 local looked_up_repo looked_up_pubkey_hex looked_up_npub 351 file_num==1 {
290 looked_up_repo=$(echo "$lookup_result" | cut -f2) 352 npub_map[$1] = $2
291 looked_up_pubkey_hex=$(echo "$lookup_result" | cut -f3) 353 next
354 }
355 # Second file: lookup (event_id -> identifier, pubkey_hex)
356 file_num==2 {
357 lookup_repo[$1] = $2
358 lookup_pubkey[$1] = $3
359 next
360 }
361 # Third file: parse_failures
362 /^#/ { next } # Skip headers (already copied)
363 {
364 event_id = $1
365 kind = $2
366 reason = $3
367 repo = $4
368 npub = $5
292 369
293 # Convert hex pubkey to npub if nak is available 370 # If repo/npub empty, try to enrich from lookup
294 if [[ "$can_convert_npub" == true && -n "$looked_up_pubkey_hex" ]]; then 371 if (repo == "" && event_id in lookup_repo) {
295 looked_up_npub=$(nak encode npub "$looked_up_pubkey_hex" 2>/dev/null || echo "$looked_up_pubkey_hex") 372 repo = lookup_repo[event_id]
296 else 373 }
297 looked_up_npub="$looked_up_pubkey_hex" 374 if (npub == "" && event_id in lookup_pubkey) {
298 fi 375 pubkey = lookup_pubkey[event_id]
376 if (pubkey in npub_map) {
377 npub = npub_map[pubkey]
378 } else {
379 npub = pubkey # Fallback to hex
380 }
381 }
299 382
300 # Use looked-up values if original was empty 383 print event_id, kind, reason, repo, npub
301 [[ -z "$repo" ]] && repo="$looked_up_repo" 384 }
302 [[ -z "$npub" ]] && npub="$looked_up_npub" 385 ' "$npub_map_file" "$lookup_file" "$parse_failures_file" >> "$enriched_file"
303 enriched_count=$((enriched_count + 1)) 386
304 fi 387 rm -f "$npub_map_file"
305 388
306 printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub" >> "$enriched_file" 389 # Count enriched entries
307 done < "$parse_failures_file" 390 local enriched_count total_count
391 total_count=$(grep -v '^#' "$parse_failures_file" | wc -l)
392 total_count="${total_count//[^0-9]/}"
393 # Count entries that have non-empty repo AND npub after enrichment
394 enriched_count=$(grep -v '^#' "$enriched_file" | awk -F'\t' '$4 != "" && $5 != ""' | wc -l)
395 enriched_count="${enriched_count//[^0-9]/}"
308 396
309 # Replace original with enriched version 397 # Replace original with enriched version
310 mv "$enriched_file" "$parse_failures_file" 398 mv "$enriched_file" "$parse_failures_file"
@@ -569,32 +657,29 @@ main() {
569 echo "# Note: repo and npub may be empty for some entries" 657 echo "# Note: repo and npub may be empty for some entries"
570 } > "$output_file" 658 } > "$output_file"
571 659
572 # Parse [PARSE_FAIL] entries 660 # Parse [PARSE_FAIL] entries using batch awk processing
573 log_info " Parsing [PARSE_FAIL] entries..." 661 log_info " Parsing [PARSE_FAIL] entries..."
574 local parse_fail_count=0 662 local parse_fail_count=0
575 if [[ "$parse_fail_line_count" -gt 0 ]]; then 663 if [[ "$parse_fail_line_count" -gt 0 ]]; then
576 while IFS= read -r line; do 664 parse_parse_fail_batch "$temp_parse_fail" >> "$output_file"
577 local parsed 665 parse_fail_count=$(grep -v '^#' "$output_file" | wc -l)
578 parsed=$(parse_parse_fail_line "$line") 666 parse_fail_count="${parse_fail_count//[^0-9]/}"
579 if [[ -n "$parsed" ]]; then
580 echo "$parsed" >> "$output_file"
581 parse_fail_count=$((parse_fail_count + 1))
582 fi
583 done < "$temp_parse_fail"
584 fi 667 fi
585 668
586 # Parse write policy rejection entries 669 # Parse write policy rejection entries using batch awk processing
587 log_info " Parsing write policy rejection entries..." 670 log_info " Parsing write policy rejection entries..."
588 local write_policy_count=0 671 local write_policy_count=0
589 if [[ "$write_policy_line_count" -gt 0 ]]; then 672 if [[ "$write_policy_line_count" -gt 0 ]]; then
590 while IFS= read -r line; do 673 local before_count
591 local parsed 674 before_count=$(grep -v '^#' "$output_file" 2>/dev/null | wc -l || echo "0")
592 parsed=$(parse_write_policy_rejection_line "$line") 675 before_count="${before_count//[^0-9]/}"
593 if [[ -n "$parsed" ]]; then 676 before_count="${before_count:-0}"
594 echo "$parsed" >> "$output_file" 677 parse_write_policy_rejection_batch "$temp_write_policy_rejection" >> "$output_file"
595 write_policy_count=$((write_policy_count + 1)) 678 local after_count
596 fi 679 after_count=$(grep -v '^#' "$output_file" 2>/dev/null | wc -l || echo "0")
597 done < "$temp_write_policy_rejection" 680 after_count="${after_count//[^0-9]/}"
681 after_count="${after_count:-0}"
682 write_policy_count=$((after_count - before_count))
598 fi 683 fi
599 684
600 local invalid_announcement_count=$write_policy_count 685 local invalid_announcement_count=$write_policy_count
@@ -605,13 +690,7 @@ main() {
605 690
606 log_info " Building enrichment lookup table..." 691 log_info " Building enrichment lookup table..."
607 if [[ "$rejected_announcement_line_count" -gt 0 ]]; then 692 if [[ "$rejected_announcement_line_count" -gt 0 ]]; then
608 while IFS= read -r line; do 693 parse_rejected_announcement_batch "$temp_rejected_announcement" > "$enrichment_lookup_file"
609 local parsed
610 parsed=$(parse_rejected_announcement_line "$line")
611 if [[ -n "$parsed" ]]; then
612 echo "$parsed" >> "$enrichment_lookup_file"
613 fi
614 done < "$temp_rejected_announcement"
615 fi 694 fi
616 695
617 rm -f "$temp_parse_fail" "$temp_write_policy_rejection" "$temp_rejected_announcement" 696 rm -f "$temp_parse_fail" "$temp_write_policy_rejection" "$temp_rejected_announcement"