diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-23 17:38:32 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-27 20:38:08 +0000 |
| commit | 0e00db4decfa779c26c6c7648b2badcc5704e6f8 (patch) | |
| tree | b49213872325475aa546ef6a84770f6fbe0ecaea /docs | |
| parent | 26e3f24e491ac0b9a61eaa2831de250b68bd9d96 (diff) | |
Add --analysis-root filter to parse failures script
Filter parse failures to only those for announcements that are in
production but missing from the archive. This eliminates noise from
rejections of events from other relays that don't affect migration.
Before: 223 parse failures (all rejections from all relays)
After: 18 parse failures (only for missing announcements)
The filter works by:
1. Reading missing announcements from comparison data
2. Extracting event IDs from production announcements JSON
3. Filtering parse failures to only matching event IDs
Diffstat (limited to 'docs')
| -rwxr-xr-x | docs/how-to/migration-scripts/30-extract-parse-failures.sh | 178 |
1 files changed, 175 insertions, 3 deletions
diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh index 7870c61..f821834 100755 --- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh +++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh | |||
| @@ -125,18 +125,25 @@ usage() { | |||
| 125 | echo " output-dir Directory to store extracted log data" | 125 | echo " output-dir Directory to store extracted log data" |
| 126 | echo "" | 126 | echo "" |
| 127 | echo "Options:" | 127 | echo "Options:" |
| 128 | echo " --since <date> Start date (default: 30 days ago)" | 128 | echo " --since <date> Start date (default: 30 days ago)" |
| 129 | echo " --until <date> End date (default: now)" | 129 | echo " --until <date> End date (default: now)" |
| 130 | echo " --dry-run Show what would be extracted without writing" | 130 | echo " --analysis-root <dir> Filter to only missing announcements from analysis" |
| 131 | echo " --dry-run Show what would be extracted without writing" | ||
| 131 | echo "" | 132 | echo "" |
| 132 | echo "Examples:" | 133 | echo "Examples:" |
| 133 | echo " $0 ngit-grasp.service output/logs" | 134 | echo " $0 ngit-grasp.service output/logs" |
| 134 | echo " $0 ngit-grasp.service output/logs --since '2026-01-01'" | 135 | echo " $0 ngit-grasp.service output/logs --since '2026-01-01'" |
| 135 | echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'" | 136 | echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'" |
| 137 | echo " $0 ngit-grasp.service output/logs --analysis-root /tmp/migration-analysis-20260123" | ||
| 136 | echo "" | 138 | echo "" |
| 137 | echo "Expected log formats:" | 139 | echo "Expected log formats:" |
| 138 | echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..." | 140 | echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..." |
| 139 | echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..." | 141 | echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..." |
| 142 | echo "" | ||
| 143 | echo "Filtering with --analysis-root:" | ||
| 144 | echo " When provided, only parse failures for announcements that are in production" | ||
| 145 | echo " but missing from the archive will be included. This filters out rejections" | ||
| 146 | echo " for events from other relays that don't affect the migration." | ||
| 140 | exit 1 | 147 | exit 1 |
| 141 | } | 148 | } |
| 142 | 149 | ||
| @@ -204,6 +211,155 @@ parse_write_policy_rejection_line() { | |||
| 204 | # the same event to be counted twice. Write policy logs contain the same | 211 | # the same event to be counted twice. Write policy logs contain the same |
| 205 | # events, so we don't lose any data by only extracting from that source. | 212 | # events, so we don't lose any data by only extracting from that source. |
| 206 | 213 | ||
| 214 | # Filter parse failures to only those for missing announcements | ||
| 215 | # This is used when --analysis-root is provided to scope results to the migration | ||
| 216 | # | ||
| 217 | # Arguments: | ||
| 218 | # $1 - parse failures file to filter (modified in place) | ||
| 219 | # $2 - analysis root directory containing comparison/ and prod/ subdirs | ||
| 220 | # | ||
| 221 | # The function: | ||
| 222 | # 1. Reads missing announcements from comparison/complete-prod-missing-archive.txt | ||
| 223 | # 2. Extracts pubkey/identifier pairs for those announcements | ||
| 224 | # 3. Reads production announcements from prod/raw/announcements.json | ||
| 225 | # 4. Gets event IDs for announcements whose identifier (d-tag) matches a missing announcement (pubkey is not compared) | ||
| 226 | # 5. Filters parse failures to only those event IDs | ||
| 227 | filter_to_missing_announcements() { | ||
| 228 | local parse_failures_file="$1" | ||
| 229 | local analysis_root="$2" | ||
| 230 | |||
| 231 | local missing_file="$analysis_root/comparison/complete-prod-missing-archive.txt" | ||
| 232 | local prod_announcements="$analysis_root/prod/raw/announcements.json" | ||
| 233 | |||
| 234 | # Validate required files exist | ||
| 235 | if [[ ! -f "$missing_file" ]]; then | ||
| 236 | log_warn "Missing announcements file not found: $missing_file" | ||
| 237 | log_warn "Skipping filter - all parse failures will be included" | ||
| 238 | return 0 | ||
| 239 | fi | ||
| 240 | |||
| 241 | if [[ ! -f "$prod_announcements" ]]; then | ||
| 242 | log_warn "Production announcements file not found: $prod_announcements" | ||
| 243 | log_warn "Skipping filter - all parse failures will be included" | ||
| 244 | return 0 | ||
| 245 | fi | ||
| 246 | |||
| 247 | # Check if jq is available | ||
| 248 | if ! command -v jq &> /dev/null; then | ||
| 249 | log_warn "jq not found - cannot filter parse failures" | ||
| 250 | log_warn "Install jq or run without --analysis-root" | ||
| 251 | return 0 | ||
| 252 | fi | ||
| 253 | |||
| 254 | log_info "Filtering parse failures to missing announcements only..." | ||
| 255 | |||
| 256 | # Step 1: Extract pubkey/identifier pairs from missing announcements | ||
| 257 | # Format: identifier | npub | prod=complete | archive=missing | ||
| 258 | local missing_pairs_file | ||
| 259 | missing_pairs_file=$(mktemp) | ||
| 260 | |||
| 261 | # Extract identifier and npub as "identifier|npub" pairs (the npub is kept as-is; no hex conversion is done here) | ||
| 262 | while IFS=' | ' read -r identifier npub rest; do | ||
| 263 | # Skip empty lines | ||
| 264 | [[ -z "$identifier" ]] && continue | ||
| 265 | # Trim whitespace | ||
| 266 | identifier=$(echo "$identifier" | xargs) | ||
| 267 | npub=$(echo "$npub" | xargs) | ||
| 268 | echo "${identifier}|${npub}" | ||
| 269 | done < "$missing_file" > "$missing_pairs_file" | ||
| 270 | |||
| 271 | local missing_count | ||
| 272 | missing_count=$(wc -l < "$missing_pairs_file") | ||
| 273 | missing_count="${missing_count//[^0-9]/}" | ||
| 274 | log_info " Found $missing_count missing announcements to filter for" | ||
| 275 | |||
| 276 | # Step 2: Get event IDs from production announcements for these pairs | ||
| 277 | # We need to match on 'd' tag (identifier) and pubkey | ||
| 278 | local missing_event_ids_file | ||
| 279 | missing_event_ids_file=$(mktemp) | ||
| 280 | |||
| 281 | # Create a listing of event_id|identifier|pubkey_hex for every production announcement | ||
| 282 | # The JSON has: id, pubkey (hex), tags (array with ["d", identifier]) | ||
| 283 | log_info " Extracting event IDs from production announcements..." | ||
| 284 | |||
| 285 | # Use jq to extract id, pubkey, and d-tag value, then filter | ||
| 286 | # Output format: event_id|identifier|pubkey_hex | ||
| 287 | # Note: The JSON file is NDJSON (newline-delimited), not an array | ||
| 288 | jq -r 'select(.kind == 30617) | | ||
| 289 | .id as $id | | ||
| 290 | .pubkey as $pubkey | | ||
| 291 | (.tags[] | select(.[0] == "d") | .[1]) as $dtag | | ||
| 292 | "\($id)|\($dtag)|\($pubkey)"' "$prod_announcements" > "$missing_event_ids_file.all" 2>/dev/null || { | ||
| 293 | log_warn "Failed to parse production announcements JSON" | ||
| 294 | rm -f "$missing_pairs_file" "$missing_event_ids_file" "$missing_event_ids_file.all" | ||
| 295 | return 0 | ||
| 296 | } | ||
| 297 | |||
| 298 | # Now filter to only event IDs for missing announcements | ||
| 299 | # Ideally we would match on both identifier and pubkey, because the same | ||
| 300 | # identifier (d-tag) can exist for different pubkeys. | ||
| 301 | # However, the comparison file has npub (bech32) while the JSON has pubkey in hex, | ||
| 302 | # so comparing them needs a conversion step that is not available here. | ||
| 303 | |||
| 304 | # Create a set of "identifier|pubkey_hex" to match against | ||
| 305 | # First, we need to convert npub to hex - but that requires a tool | ||
| 306 | # Alternative: match on identifier only and accept some false positives | ||
| 307 | # Better: use the comparison file which has npub, and match against announcements | ||
| 308 | |||
| 309 | # Let's match on identifier only for now (simpler, may have minor false positives) | ||
| 310 | # Extract just the identifiers from missing announcements | ||
| 311 | local missing_identifiers_file | ||
| 312 | missing_identifiers_file=$(mktemp) | ||
| 313 | cut -d'|' -f1 "$missing_pairs_file" | sort -u > "$missing_identifiers_file" | ||
| 314 | |||
| 315 | # Filter event IDs to only those with matching identifiers | ||
| 316 | while IFS='|' read -r event_id identifier pubkey_hex; do | ||
| 317 | if grep -qFx "$identifier" "$missing_identifiers_file"; then | ||
| 318 | echo "$event_id" | ||
| 319 | fi | ||
| 320 | done < "$missing_event_ids_file.all" | sort -u > "$missing_event_ids_file" | ||
| 321 | |||
| 322 | local event_id_count | ||
| 323 | event_id_count=$(wc -l < "$missing_event_ids_file") | ||
| 324 | event_id_count="${event_id_count//[^0-9]/}" | ||
| 325 | log_info " Found $event_id_count event IDs for missing announcements" | ||
| 326 | |||
| 327 | # Step 3: Filter parse failures to only those event IDs | ||
| 328 | local filtered_file | ||
| 329 | filtered_file=$(mktemp) | ||
| 330 | |||
| 331 | # Copy header lines | ||
| 332 | grep '^#' "$parse_failures_file" > "$filtered_file" | ||
| 333 | |||
| 334 | # Add a note about filtering | ||
| 335 | echo "# Filtered to missing announcements only (--analysis-root)" >> "$filtered_file" | ||
| 336 | echo "# Analysis root: $analysis_root" >> "$filtered_file" | ||
| 337 | echo "# Missing announcements: $missing_count" >> "$filtered_file" | ||
| 338 | echo "# Matching event IDs: $event_id_count" >> "$filtered_file" | ||
| 339 | |||
| 340 | # Filter data lines - only include if event_id is in our list | ||
| 341 | local filtered_count=0 | ||
| 342 | while IFS=$'\t' read -r event_id kind reason repo npub; do | ||
| 343 | # Skip header lines (already copied) | ||
| 344 | [[ "$event_id" =~ ^# ]] && continue | ||
| 345 | |||
| 346 | # Check if this event_id is in our missing list | ||
| 347 | if grep -qFx "$event_id" "$missing_event_ids_file"; then | ||
| 348 | printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub" >> "$filtered_file" | ||
| 349 | filtered_count=$((filtered_count + 1)) | ||
| 350 | fi | ||
| 351 | done < "$parse_failures_file" | ||
| 352 | |||
| 353 | # Replace original with filtered version | ||
| 354 | mv "$filtered_file" "$parse_failures_file" | ||
| 355 | |||
| 356 | # Cleanup temp files | ||
| 357 | rm -f "$missing_pairs_file" "$missing_event_ids_file" "$missing_event_ids_file.all" "$missing_identifiers_file" | ||
| 358 | |||
| 359 | log_info " Filtered from $(grep -v '^#' "$parse_failures_file" | wc -l | xargs) to $filtered_count parse failures" | ||
| 360 | log_success "Filtered to parse failures for missing announcements only" | ||
| 361 | } | ||
| 362 | |||
| 207 | # Main | 363 | # Main |
| 208 | main() { | 364 | main() { |
| 209 | if [[ $# -lt 2 ]]; then | 365 | if [[ $# -lt 2 ]]; then |
| @@ -219,6 +375,7 @@ main() { | |||
| 219 | since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "") | 375 | since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "") |
| 220 | local until_date="" | 376 | local until_date="" |
| 221 | local dry_run=false | 377 | local dry_run=false |
| 378 | local analysis_root="" | ||
| 222 | 379 | ||
| 223 | # Parse options | 380 | # Parse options |
| 224 | while [[ $# -gt 0 ]]; do | 381 | while [[ $# -gt 0 ]]; do |
| @@ -231,6 +388,10 @@ main() { | |||
| 231 | until_date="$2" | 388 | until_date="$2" |
| 232 | shift 2 | 389 | shift 2 |
| 233 | ;; | 390 | ;; |
| 391 | --analysis-root) | ||
| 392 | analysis_root="$2" | ||
| 393 | shift 2 | ||
| 394 | ;; | ||
| 234 | --dry-run) | 395 | --dry-run) |
| 235 | dry_run=true | 396 | dry_run=true |
| 236 | shift | 397 | shift |
| @@ -469,6 +630,11 @@ main() { | |||
| 469 | grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file" | 630 | grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file" |
| 470 | mv "$deduped_file" "$output_file" | 631 | mv "$deduped_file" "$output_file" |
| 471 | 632 | ||
| 633 | # Filter to missing announcements only if analysis root provided | ||
| 634 | if [[ -n "$analysis_root" ]]; then | ||
| 635 | filter_to_missing_announcements "$output_file" "$analysis_root" | ||
| 636 | fi | ||
| 637 | |||
| 472 | # Count final entries (excluding header lines) | 638 | # Count final entries (excluding header lines) |
| 473 | local count | 639 | local count |
| 474 | count=$(grep -v '^#' "$output_file" | wc -l) | 640 | count=$(grep -v '^#' "$output_file" | wc -l) |
| @@ -482,9 +648,15 @@ main() { | |||
| 482 | log_info "=== Extraction Summary ===" | 648 | log_info "=== Extraction Summary ===" |
| 483 | log_info "Service: $service" | 649 | log_info "Service: $service" |
| 484 | log_info "Time range: ${since_date:-beginning} to ${until_date:-now}" | 650 | log_info "Time range: ${since_date:-beginning} to ${until_date:-now}" |
| 651 | if [[ -n "$analysis_root" ]]; then | ||
| 652 | log_info "Filtered to: missing announcements only" | ||
| 653 | fi | ||
| 485 | log_success "Extracted $count total entries" | 654 | log_success "Extracted $count total entries" |
| 486 | log_info " - [PARSE_FAIL] entries: $parse_fail_count" | 655 | log_info " - [PARSE_FAIL] entries: $parse_fail_count" |
| 487 | log_info " - Invalid announcement rejections: $invalid_announcement_count" | 656 | log_info " - Invalid announcement rejections: $invalid_announcement_count" |
| 657 | if [[ -n "$analysis_root" ]]; then | ||
| 658 | log_info " (filtered from original extraction)" | ||
| 659 | fi | ||
| 488 | echo "" | 660 | echo "" |
| 489 | log_info "Output file: $output_file" | 661 | log_info "Output file: $output_file" |
| 490 | 662 | ||