diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-23 17:19:48 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-27 20:38:07 +0000 |
| commit | 26e3f24e491ac0b9a61eaa2831de250b68bd9d96 (patch) | |
| tree | a07a4cd7dffe1dd09f4bc8af9b7cc2e699e6db12 /docs | |
| parent | cbb0e768641a6ca0cbd7e7013437cc71b920004d (diff) | |
Fix double-counting bug in parse failures extraction
The script was counting the same invalid announcement twice because:
- Write policy logs use hex event IDs
- Builder logs use note1 (bech32) event IDs
- Deduplication only worked within each format
Fix: Only extract from write policy logs (hex IDs) to avoid the
format mismatch. Builder logs contain the same events, so we don't
lose any data.
Result: 446 entries → 223 unique invalid announcements (correct count)
Diffstat (limited to 'docs')
| -rwxr-xr-x | docs/how-to/migration-scripts/30-extract-parse-failures.sh | 70 |
1 files changed, 17 insertions, 53 deletions
diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh index 114a44d..7870c61 100755 --- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh +++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh | |||
| @@ -39,8 +39,10 @@ | |||
| 39 | # 2. "Invalid announcement" rejections (write policy): | 39 | # 2. "Invalid announcement" rejections (write policy): |
| 40 | # Event rejected by write policy event_id=abc123... relay=wss://... kind=30617 reason=Invalid announcement: multiple clone tags found... | 40 | # Event rejected by write policy event_id=abc123... relay=wss://... kind=30617 reason=Invalid announcement: multiple clone tags found... |
| 41 | # | 41 | # |
| 42 | # 3. "Rejected repository announcement" (builder): | 42 | # NOTE: Builder logs ("Rejected repository announcement note1xxx:") are NOT extracted |
| 43 | # Rejected repository announcement note1xxx: Invalid announcement: multiple clone tags found... | 43 | # because they use bech32 (note1) IDs while write policy logs use hex IDs. Extracting |
| 44 | # both would cause double-counting since deduplication only works within each format. | ||
| 45 | # Write policy logs contain the same events, so we don't lose any data. | ||
| 44 | # | 46 | # |
| 45 | # Required fields: kind, event_id, reason | 47 | # Required fields: kind, event_id, reason |
| 46 | # Optional fields: repo, npub (may not be available for all entry types) | 48 | # Optional fields: repo, npub (may not be available for all entry types) |
| @@ -135,7 +137,6 @@ usage() { | |||
| 135 | echo "Expected log formats:" | 137 | echo "Expected log formats:" |
| 136 | echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..." | 138 | echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..." |
| 137 | echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..." | 139 | echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..." |
| 138 | echo " Rejected repository announcement note1xxx: Invalid announcement: ..." | ||
| 139 | exit 1 | 140 | exit 1 |
| 140 | } | 141 | } |
| 141 | 142 | ||
| @@ -197,28 +198,11 @@ parse_write_policy_rejection_line() { | |||
| 197 | fi | 198 | fi |
| 198 | } | 199 | } |
| 199 | 200 | ||
| 200 | # Parse a "Rejected repository announcement" log line from builder | 201 | # NOTE: parse_builder_rejection_line() was removed to fix double-counting bug. |
| 201 | # Input: log line containing "Rejected repository announcement <note_id>: Invalid announcement:" | 202 | # Builder logs use bech32 (note1) IDs while write policy logs use hex IDs. |
| 202 | # Output: TSV line: event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub | 203 | # Since deduplication only works within each format, extracting both caused |
| 203 | # Note: The note_id is bech32 encoded, we need to extract it | 204 | # the same event to be counted twice. Write policy logs contain the same |
| 204 | parse_builder_rejection_line() { | 205 | # events, so we don't lose any data by only extracting from that source. |
| 205 | local line="$1" | ||
| 206 | |||
| 207 | local note_id reason | ||
| 208 | |||
| 209 | # Extract note_id (note1...) from "Rejected repository announcement note1xxx:" | ||
| 210 | note_id=$(echo "$line" | grep -oP 'Rejected repository announcement \Knote1[a-z0-9]+' || echo "") | ||
| 211 | |||
| 212 | # Extract reason (everything after the note_id and colon) | ||
| 213 | reason=$(echo "$line" | grep -oP 'Rejected repository announcement note1[a-z0-9]+: \K.*$' || echo "") | ||
| 214 | |||
| 215 | # Only output if we have the required fields | ||
| 216 | # Kind is always 30617 for announcements | ||
| 217 | if [[ -n "$note_id" && -n "$reason" ]]; then | ||
| 218 | # Use note_id as event_id (bech32 format), kind=30617, repo and npub empty | ||
| 219 | printf '%s\t%s\t%s\t\t\n' "$note_id" "30617" "$reason" | ||
| 220 | fi | ||
| 221 | } | ||
| 222 | 206 | ||
| 223 | # Main | 207 | # Main |
| 224 | main() { | 208 | main() { |
| @@ -361,11 +345,10 @@ main() { | |||
| 361 | log_info "Extracting log entries..." | 345 | log_info "Extracting log entries..." |
| 362 | 346 | ||
| 363 | # Create temp files for intermediate results | 347 | # Create temp files for intermediate results |
| 364 | local temp_stderr temp_parse_fail temp_write_policy_rejection temp_builder_rejection | 348 | local temp_stderr temp_parse_fail temp_write_policy_rejection |
| 365 | temp_stderr=$(mktemp) | 349 | temp_stderr=$(mktemp) |
| 366 | temp_parse_fail=$(mktemp) | 350 | temp_parse_fail=$(mktemp) |
| 367 | temp_write_policy_rejection=$(mktemp) | 351 | temp_write_policy_rejection=$(mktemp) |
| 368 | temp_builder_rejection=$(mktemp) | ||
| 369 | 352 | ||
| 370 | # Extract [PARSE_FAIL] entries directly to temp file (streaming) | 353 | # Extract [PARSE_FAIL] entries directly to temp file (streaming) |
| 371 | log_info " Searching for [PARSE_FAIL] entries..." | 354 | log_info " Searching for [PARSE_FAIL] entries..." |
| @@ -378,29 +361,24 @@ main() { | |||
| 378 | fi | 361 | fi |
| 379 | 362 | ||
| 380 | # Extract "Event rejected by write policy" with "Invalid announcement" (streaming) | 363 | # Extract "Event rejected by write policy" with "Invalid announcement" (streaming) |
| 364 | # NOTE: We only extract from write policy logs (hex IDs), not builder logs (note1 IDs) | ||
| 365 | # to avoid double-counting. Both log sources contain the same events. | ||
| 381 | log_info " Searching for write policy rejections..." | 366 | log_info " Searching for write policy rejections..." |
| 382 | eval "$journal_cmd" 2>/dev/null | grep 'Event rejected by write policy' | grep 'Invalid announcement' > "$temp_write_policy_rejection" || true | 367 | eval "$journal_cmd" 2>/dev/null | grep 'Event rejected by write policy' | grep 'Invalid announcement' > "$temp_write_policy_rejection" || true |
| 383 | 368 | ||
| 384 | # Extract "Rejected repository announcement" from builder (streaming) | ||
| 385 | log_info " Searching for builder rejections..." | ||
| 386 | eval "$journal_cmd" 2>/dev/null | grep 'Rejected repository announcement' | grep 'Invalid announcement' > "$temp_builder_rejection" || true | ||
| 387 | |||
| 388 | rm -f "$temp_stderr" | 369 | rm -f "$temp_stderr" |
| 389 | 370 | ||
| 390 | # Check if we found anything | 371 | # Check if we found anything |
| 391 | local parse_fail_line_count write_policy_line_count builder_line_count | 372 | local parse_fail_line_count write_policy_line_count |
| 392 | parse_fail_line_count=$(wc -l < "$temp_parse_fail") | 373 | parse_fail_line_count=$(wc -l < "$temp_parse_fail") |
| 393 | parse_fail_line_count="${parse_fail_line_count//[^0-9]/}" | 374 | parse_fail_line_count="${parse_fail_line_count//[^0-9]/}" |
| 394 | write_policy_line_count=$(wc -l < "$temp_write_policy_rejection") | 375 | write_policy_line_count=$(wc -l < "$temp_write_policy_rejection") |
| 395 | write_policy_line_count="${write_policy_line_count//[^0-9]/}" | 376 | write_policy_line_count="${write_policy_line_count//[^0-9]/}" |
| 396 | builder_line_count=$(wc -l < "$temp_builder_rejection") | ||
| 397 | builder_line_count="${builder_line_count//[^0-9]/}" | ||
| 398 | 377 | ||
| 399 | log_info " Found $parse_fail_line_count [PARSE_FAIL] log lines" | 378 | log_info " Found $parse_fail_line_count [PARSE_FAIL] log lines" |
| 400 | log_info " Found $write_policy_line_count write policy rejection log lines" | 379 | log_info " Found $write_policy_line_count write policy rejection log lines" |
| 401 | log_info " Found $builder_line_count builder rejection log lines" | ||
| 402 | 380 | ||
| 403 | local total_invalid_announcement_lines=$((write_policy_line_count + builder_line_count)) | 381 | local total_invalid_announcement_lines=$write_policy_line_count |
| 404 | 382 | ||
| 405 | if [[ "$parse_fail_line_count" -eq 0 && "$total_invalid_announcement_lines" -eq 0 ]]; then | 383 | if [[ "$parse_fail_line_count" -eq 0 && "$total_invalid_announcement_lines" -eq 0 ]]; then |
| 406 | log_warn "No matching entries found in logs." | 384 | log_warn "No matching entries found in logs." |
| @@ -431,7 +409,7 @@ main() { | |||
| 431 | echo "# This is expected if ngit-grasp logging improvements are not yet deployed." | 409 | echo "# This is expected if ngit-grasp logging improvements are not yet deployed." |
| 432 | } > "$output_file" | 410 | } > "$output_file" |
| 433 | 411 | ||
| 434 | rm -f "$temp_parse_fail" "$temp_write_policy_rejection" "$temp_builder_rejection" | 412 | rm -f "$temp_parse_fail" "$temp_write_policy_rejection" |
| 435 | log_info "Created empty output file: $output_file" | 413 | log_info "Created empty output file: $output_file" |
| 436 | exit 0 | 414 | exit 0 |
| 437 | fi | 415 | fi |
| @@ -478,23 +456,9 @@ main() { | |||
| 478 | done < "$temp_write_policy_rejection" | 456 | done < "$temp_write_policy_rejection" |
| 479 | fi | 457 | fi |
| 480 | 458 | ||
| 481 | # Parse builder rejection entries | 459 | local invalid_announcement_count=$write_policy_count |
| 482 | log_info " Parsing builder rejection entries..." | ||
| 483 | local builder_count=0 | ||
| 484 | if [[ "$builder_line_count" -gt 0 ]]; then | ||
| 485 | while IFS= read -r line; do | ||
| 486 | local parsed | ||
| 487 | parsed=$(parse_builder_rejection_line "$line") | ||
| 488 | if [[ -n "$parsed" ]]; then | ||
| 489 | echo "$parsed" >> "$output_file" | ||
| 490 | builder_count=$((builder_count + 1)) | ||
| 491 | fi | ||
| 492 | done < "$temp_builder_rejection" | ||
| 493 | fi | ||
| 494 | |||
| 495 | local invalid_announcement_count=$((write_policy_count + builder_count)) | ||
| 496 | 460 | ||
| 497 | rm -f "$temp_parse_fail" "$temp_write_policy_rejection" "$temp_builder_rejection" | 461 | rm -f "$temp_parse_fail" "$temp_write_policy_rejection" |
| 498 | 462 | ||
| 499 | # Deduplicate by event_id (first column) - keep first occurrence | 463 | # Deduplicate by event_id (first column) - keep first occurrence |
| 500 | log_info " Deduplicating entries..." | 464 | log_info " Deduplicating entries..." |