diff options
Diffstat (limited to 'docs/how-to/migration-scripts')
| -rwxr-xr-x | docs/how-to/migration-scripts/30-extract-parse-failures.sh | 332 |
1 files changed, 94 insertions, 238 deletions
diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh index f86e9f8..d7f9706 100755 --- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh +++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh | |||
| @@ -24,6 +24,12 @@ | |||
| 24 | # --until <date> End date for log extraction (default: now) | 24 | # --until <date> End date for log extraction (default: now) |
| 25 | # --dry-run Show what would be extracted without writing files | 25 | # --dry-run Show what would be extracted without writing files |
| 26 | # | 26 | # |
| 27 | # ENRICHMENT: | ||
| 28 | # The script automatically enriches parse failures with repo/npub information | ||
| 29 | # by extracting from "Added rejected announcement" log entries which include | ||
| 30 | # pubkey and identifier fields. Hex pubkeys are converted to npub format using | ||
| 31 | # `nak encode npub <hex-pubkey>` if the nak tool is available. | ||
| 32 | # | ||
| 27 | # OUTPUT: | 33 | # OUTPUT: |
| 28 | # <output-dir>/parse-failures.txt | 34 | # <output-dir>/parse-failures.txt |
| 29 | # | 35 | # |
| @@ -31,7 +37,7 @@ | |||
| 31 | # event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub | 37 | # event_id<TAB>kind<TAB>reason<TAB>repo<TAB>npub |
| 32 | # | 38 | # |
| 33 | # EXPECTED LOG FORMATS: | 39 | # EXPECTED LOG FORMATS: |
| 34 | # The script looks for two types of log entries: | 40 | # The script looks for three types of log entries: |
| 35 | # | 41 | # |
| 36 | # 1. Structured [PARSE_FAIL] entries: | 42 | # 1. Structured [PARSE_FAIL] entries: |
| 37 | # 2026-01-22T10:30:45Z ngit-grasp[1234]: [PARSE_FAIL] kind=30618 event_id=abc123... reason="invalid refs format" repo=myrepo npub=npub1... | 43 | # 2026-01-22T10:30:45Z ngit-grasp[1234]: [PARSE_FAIL] kind=30618 event_id=abc123... reason="invalid refs format" repo=myrepo npub=npub1... |
| @@ -39,13 +45,17 @@ | |||
| 39 | # 2. "Invalid announcement" rejections (write policy): | 45 | # 2. "Invalid announcement" rejections (write policy): |
| 40 | # Event rejected by write policy event_id=abc123... relay=wss://... kind=30617 reason=Invalid announcement: multiple clone tags found... | 46 | # Event rejected by write policy event_id=abc123... relay=wss://... kind=30617 reason=Invalid announcement: multiple clone tags found... |
| 41 | # | 47 | # |
| 48 | # 3. "Added rejected announcement" entries (for enrichment): | ||
| 49 | # Added rejected announcement to two-tier index event_id=abc123... kind=30617 identifier=myrepo pubkey=hex... | ||
| 50 | # These entries provide pubkey and identifier for enriching write policy rejections. | ||
| 51 | # | ||
| 42 | # NOTE: Builder logs ("Rejected repository announcement note1xxx:") are NOT extracted | 52 | # NOTE: Builder logs ("Rejected repository announcement note1xxx:") are NOT extracted |
| 43 | # because they use bech32 (note1) IDs while write policy logs use hex IDs. Extracting | 53 | # because they use bech32 (note1) IDs while write policy logs use hex IDs. Extracting |
| 44 | # both would cause double-counting since deduplication only works within each format. | 54 | # both would cause double-counting since deduplication only works within each format. |
| 45 | # Write policy logs contain the same events, so we don't lose any data. | 55 | # Write policy logs contain the same events, so we don't lose any data. |
| 46 | # | 56 | # |
| 47 | # Required fields: kind, event_id, reason | 57 | # Required fields: kind, event_id, reason |
| 48 | # Optional fields: repo, npub (may not be available for all entry types) | 58 | # Enrichment fields: repo (identifier), npub (converted from hex pubkey) |
| 49 | # | 59 | # |
| 50 | # DEPENDENCY: | 60 | # DEPENDENCY: |
| 51 | # This script requires logging improvements in ngit-grasp to emit structured | 61 | # This script requires logging improvements in ngit-grasp to emit structured |
| @@ -127,23 +137,21 @@ usage() { | |||
| 127 | echo "Options:" | 137 | echo "Options:" |
| 128 | echo " --since <date> Start date (default: 30 days ago)" | 138 | echo " --since <date> Start date (default: 30 days ago)" |
| 129 | echo " --until <date> End date (default: now)" | 139 | echo " --until <date> End date (default: now)" |
| 130 | echo " --analysis-root <dir> Filter to only missing announcements from analysis" | ||
| 131 | echo " --dry-run Show what would be extracted without writing" | 140 | echo " --dry-run Show what would be extracted without writing" |
| 132 | echo "" | 141 | echo "" |
| 133 | echo "Examples:" | 142 | echo "Examples:" |
| 134 | echo " $0 ngit-grasp.service output/logs" | 143 | echo " $0 ngit-grasp.service output/logs" |
| 135 | echo " $0 ngit-grasp.service output/logs --since '2026-01-01'" | 144 | echo " $0 ngit-grasp.service output/logs --since '2026-01-01'" |
| 136 | echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'" | 145 | echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'" |
| 137 | echo " $0 ngit-grasp.service output/logs --analysis-root /tmp/migration-analysis-20260123" | ||
| 138 | echo "" | 146 | echo "" |
| 139 | echo "Expected log formats:" | 147 | echo "Expected log formats:" |
| 140 | echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..." | 148 | echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..." |
| 141 | echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..." | 149 | echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..." |
| 142 | echo "" | 150 | echo "" |
| 143 | echo "Filtering with --analysis-root:" | 151 | echo "Enrichment:" |
| 144 | echo " When provided, only parse failures for announcements that are in production" | 152 | echo " Parse failures are automatically enriched with repo/npub from" |
| 145 | echo " but missing from the archive will be included. This filters out rejections" | 153 | echo " 'Added rejected announcement' log entries. Hex pubkeys are converted" |
| 146 | echo " for events from other relays that don't affect the migration." | 154 | echo " to npub format using 'nak encode npub' if available." |
| 147 | exit 1 | 155 | exit 1 |
| 148 | } | 156 | } |
| 149 | 157 | ||
| @@ -211,96 +219,52 @@ parse_write_policy_rejection_line() { | |||
| 211 | # the same event to be counted twice. Write policy logs contain the same | 219 | # the same event to be counted twice. Write policy logs contain the same |
| 212 | # events, so we don't lose any data by only extracting from that source. | 220 | # events, so we don't lose any data by only extracting from that source. |
| 213 | 221 | ||
| 214 | # Enrich parse failures with repo/npub by looking up event_id in announcements.json | 222 | # Enrich parse failures with repo/npub by looking up event_id in "Added rejected announcement" log entries |
| 215 | # This is critical because "Invalid announcement" rejections only log event_id and kind, | 223 | # This is critical because "Invalid announcement" rejections only log event_id and kind, |
| 216 | # not the repo name or npub. Without enrichment, Phase 5 shows event_id|kind instead | 224 | # not the repo name or npub. Without enrichment, Phase 5 shows event_id|kind instead |
| 217 | # of repo|npub in action-required.txt, making the output unusable. | 225 | # of repo|npub in action-required.txt, making the output unusable. |
| 218 | # | 226 | # |
| 219 | # Arguments: | 227 | # Arguments: |
| 220 | # $1 - parse failures file to enrich (modified in place) | 228 | # $1 - parse failures file to enrich (modified in place) |
| 221 | # $2 - analysis root directory containing prod/raw/announcements.json | 229 | # $2 - lookup file containing event_id -> identifier|pubkey mappings from logs |
| 222 | # | 230 | # |
| 223 | # The function: | 231 | # The function: |
| 224 | # 1. Builds a lookup table from announcements.json: event_id -> repo|npub | 232 | # 1. Uses the lookup table built from "Added rejected announcement" log entries |
| 225 | # 2. For each parse failure with empty repo/npub, looks up the event_id | 233 | # 2. For each parse failure with empty repo/npub, looks up the event_id |
| 226 | # 3. Populates repo and npub columns from the lookup | 234 | # 3. Populates repo and npub columns from the lookup |
| 235 | # 4. Converts hex pubkeys to npub format using `nak encode npub` if available | ||
| 227 | enrich_with_repo_npub() { | 236 | enrich_with_repo_npub() { |
| 228 | local parse_failures_file="$1" | 237 | local parse_failures_file="$1" |
| 229 | local analysis_root="$2" | 238 | local lookup_file="$2" |
| 230 | |||
| 231 | local prod_announcements="$analysis_root/prod/raw/announcements.json" | ||
| 232 | |||
| 233 | # Validate required file exists | ||
| 234 | if [[ ! -f "$prod_announcements" ]]; then | ||
| 235 | log_warn "Production announcements file not found: $prod_announcements" | ||
| 236 | log_warn "Skipping enrichment - repo/npub columns will remain empty" | ||
| 237 | return 0 | ||
| 238 | fi | ||
| 239 | 239 | ||
| 240 | # Check if jq is available | 240 | # Validate lookup file exists and has content |
| 241 | if ! command -v jq &> /dev/null; then | 241 | if [[ ! -f "$lookup_file" ]] || [[ ! -s "$lookup_file" ]]; then |
| 242 | log_warn "jq not found - cannot enrich parse failures with repo/npub" | 242 | log_warn "No enrichment data available - repo/npub columns will remain empty" |
| 243 | log_warn "Install jq or run without --analysis-root" | ||
| 244 | return 0 | 243 | return 0 |
| 245 | fi | 244 | fi |
| 246 | 245 | ||
| 247 | log_info "Enriching parse failures with repo/npub from announcements..." | 246 | log_info "Enriching parse failures with repo/npub from log entries..." |
| 248 | |||
| 249 | # Step 1: Build lookup table from announcements.json | ||
| 250 | # Output format: event_id<TAB>repo<TAB>npub | ||
| 251 | local lookup_file | ||
| 252 | lookup_file=$(mktemp) | ||
| 253 | |||
| 254 | # Extract id, d-tag (repo identifier), and pubkey from announcements | ||
| 255 | # Convert pubkey to npub using bech32 encoding | ||
| 256 | # Note: We use a simple hex-to-npub conversion via external tool if available, | ||
| 257 | # otherwise we'll use the hex pubkey (Phase 5 can still match on it) | ||
| 258 | log_info " Building event_id -> repo/npub lookup table..." | ||
| 259 | |||
| 260 | # First, extract the raw data: id, d-tag, pubkey (hex) | ||
| 261 | jq -r 'select(.kind == 30617) | | ||
| 262 | .id as $id | | ||
| 263 | .pubkey as $pubkey | | ||
| 264 | ((.tags[] | select(.[0] == "d") | .[1]) // "") as $dtag | | ||
| 265 | "\($id)\t\($dtag)\t\($pubkey)"' "$prod_announcements" > "$lookup_file.raw" 2>/dev/null || { | ||
| 266 | log_warn "Failed to parse production announcements JSON" | ||
| 267 | rm -f "$lookup_file" "$lookup_file.raw" | ||
| 268 | return 0 | ||
| 269 | } | ||
| 270 | 247 | ||
| 271 | # Convert hex pubkeys to npub format | 248 | # Check if we have nak for pubkey->npub conversion |
| 272 | # Check if we have a tool to do bech32 encoding (nak, nostr-tool, etc.) | ||
| 273 | local can_convert_npub=false | 249 | local can_convert_npub=false |
| 274 | if command -v nak &> /dev/null; then | 250 | if command -v nak &> /dev/null; then |
| 275 | can_convert_npub=true | 251 | can_convert_npub=true |
| 276 | log_info " Using 'nak' for pubkey->npub conversion" | 252 | log_info " Using 'nak' for pubkey->npub conversion" |
| 253 | else | ||
| 254 | log_warn " 'nak' not found - will use hex pubkeys instead of npub" | ||
| 277 | fi | 255 | fi |
| 278 | 256 | ||
| 279 | # Process the lookup file, converting pubkeys to npubs if possible | ||
| 280 | while IFS=$'\t' read -r event_id repo pubkey_hex; do | ||
| 281 | local npub | ||
| 282 | if [[ "$can_convert_npub" == true && -n "$pubkey_hex" ]]; then | ||
| 283 | # Use nak to encode pubkey as npub | ||
| 284 | npub=$(nak encode npub "$pubkey_hex" 2>/dev/null || echo "") | ||
| 285 | fi | ||
| 286 | # Fall back to hex pubkey if conversion failed | ||
| 287 | [[ -z "$npub" ]] && npub="$pubkey_hex" | ||
| 288 | printf '%s\t%s\t%s\n' "$event_id" "$repo" "$npub" | ||
| 289 | done < "$lookup_file.raw" > "$lookup_file" | ||
| 290 | |||
| 291 | rm -f "$lookup_file.raw" | ||
| 292 | |||
| 293 | local lookup_count | 257 | local lookup_count |
| 294 | lookup_count=$(wc -l < "$lookup_file") | 258 | lookup_count=$(wc -l < "$lookup_file") |
| 295 | lookup_count="${lookup_count//[^0-9]/}" | 259 | lookup_count="${lookup_count//[^0-9]/}" |
| 296 | log_info " Built lookup table with $lookup_count announcements" | 260 | log_info " Lookup table has $lookup_count entries" |
| 297 | 261 | ||
| 298 | # Step 2: Enrich parse failures | 262 | # Enrich parse failures |
| 299 | local enriched_file | 263 | local enriched_file |
| 300 | enriched_file=$(mktemp) | 264 | enriched_file=$(mktemp) |
| 301 | 265 | ||
| 302 | # Copy header lines | 266 | # Copy header lines |
| 303 | grep '^#' "$parse_failures_file" > "$enriched_file" | 267 | grep '^#' "$parse_failures_file" > "$enriched_file" 2>/dev/null || true |
| 304 | 268 | ||
| 305 | # Process data lines | 269 | # Process data lines |
| 306 | local enriched_count=0 | 270 | local enriched_count=0 |
| @@ -317,14 +281,21 @@ enrich_with_repo_npub() { | |||
| 317 | continue | 281 | continue |
| 318 | fi | 282 | fi |
| 319 | 283 | ||
| 320 | # Look up event_id in our table | 284 | # Look up event_id in our table (format: event_id<TAB>identifier<TAB>pubkey_hex) |
| 321 | local lookup_result | 285 | local lookup_result |
| 322 | lookup_result=$(grep "^${event_id}"$'\t' "$lookup_file" 2>/dev/null | head -1 || echo "") | 286 | lookup_result=$(grep "^${event_id}"$'\t' "$lookup_file" 2>/dev/null | head -1 || echo "") |
| 323 | 287 | ||
| 324 | if [[ -n "$lookup_result" ]]; then | 288 | if [[ -n "$lookup_result" ]]; then |
| 325 | local looked_up_repo looked_up_npub | 289 | local looked_up_repo looked_up_pubkey_hex looked_up_npub |
| 326 | looked_up_repo=$(echo "$lookup_result" | cut -f2) | 290 | looked_up_repo=$(echo "$lookup_result" | cut -f2) |
| 327 | looked_up_npub=$(echo "$lookup_result" | cut -f3) | 291 | looked_up_pubkey_hex=$(echo "$lookup_result" | cut -f3) |
| 292 | |||
| 293 | # Convert hex pubkey to npub if nak is available | ||
| 294 | if [[ "$can_convert_npub" == true && -n "$looked_up_pubkey_hex" ]]; then | ||
| 295 | looked_up_npub=$(nak encode npub "$looked_up_pubkey_hex" 2>/dev/null || echo "$looked_up_pubkey_hex") | ||
| 296 | else | ||
| 297 | looked_up_npub="$looked_up_pubkey_hex" | ||
| 298 | fi | ||
| 328 | 299 | ||
| 329 | # Use looked-up values if original was empty | 300 | # Use looked-up values if original was empty |
| 330 | [[ -z "$repo" ]] && repo="$looked_up_repo" | 301 | [[ -z "$repo" ]] && repo="$looked_up_repo" |
| @@ -338,160 +309,31 @@ enrich_with_repo_npub() { | |||
| 338 | # Replace original with enriched version | 309 | # Replace original with enriched version |
| 339 | mv "$enriched_file" "$parse_failures_file" | 310 | mv "$enriched_file" "$parse_failures_file" |
| 340 | 311 | ||
| 341 | # Cleanup | ||
| 342 | rm -f "$lookup_file" | ||
| 343 | |||
| 344 | log_info " Enriched $enriched_count of $total_count parse failures with repo/npub" | 312 | log_info " Enriched $enriched_count of $total_count parse failures with repo/npub" |
| 345 | log_success "Enrichment complete" | 313 | log_success "Enrichment complete" |
| 346 | } | 314 | } |
| 347 | 315 | ||
| 348 | # Filter parse failures to only those for missing announcements | 316 | # Parse "Added rejected announcement" log entries to build enrichment lookup table |
| 349 | # This is used when --analysis-root is provided to scope results to the migration | 317 | # Input: log line containing "Added rejected announcement to two-tier index" |
| 350 | # | 318 | # Output: TSV line: event_id<TAB>identifier<TAB>pubkey_hex |
| 351 | # Arguments: | 319 | parse_rejected_announcement_line() { |
| 352 | # $1 - parse failures file to filter (modified in place) | 320 | local line="$1" |
| 353 | # $2 - analysis root directory containing comparison/ and prod/ subdirs | ||
| 354 | # | ||
| 355 | # The function: | ||
| 356 | # 1. Reads missing announcements from comparison/complete-prod-missing-archive.txt | ||
| 357 | # 2. Extracts pubkey/identifier pairs for those announcements | ||
| 358 | # 3. Reads production announcements from prod/raw/announcements.json | ||
| 359 | # 4. Gets event IDs for announcements matching the missing pubkey/identifier pairs | ||
| 360 | # 5. Filters parse failures to only those event IDs | ||
| 361 | filter_to_missing_announcements() { | ||
| 362 | local parse_failures_file="$1" | ||
| 363 | local analysis_root="$2" | ||
| 364 | |||
| 365 | local missing_file="$analysis_root/comparison/complete-prod-missing-archive.txt" | ||
| 366 | local prod_announcements="$analysis_root/prod/raw/announcements.json" | ||
| 367 | |||
| 368 | # Validate required files exist | ||
| 369 | if [[ ! -f "$missing_file" ]]; then | ||
| 370 | log_warn "Missing announcements file not found: $missing_file" | ||
| 371 | log_warn "Skipping filter - all parse failures will be included" | ||
| 372 | return 0 | ||
| 373 | fi | ||
| 374 | |||
| 375 | if [[ ! -f "$prod_announcements" ]]; then | ||
| 376 | log_warn "Production announcements file not found: $prod_announcements" | ||
| 377 | log_warn "Skipping filter - all parse failures will be included" | ||
| 378 | return 0 | ||
| 379 | fi | ||
| 380 | |||
| 381 | # Check if jq is available | ||
| 382 | if ! command -v jq &> /dev/null; then | ||
| 383 | log_warn "jq not found - cannot filter parse failures" | ||
| 384 | log_warn "Install jq or run without --analysis-root" | ||
| 385 | return 0 | ||
| 386 | fi | ||
| 387 | |||
| 388 | log_info "Filtering parse failures to missing announcements only..." | ||
| 389 | |||
| 390 | # Step 1: Extract pubkey/identifier pairs from missing announcements | ||
| 391 | # Format: identifier | npub | prod=complete | archive=missing | ||
| 392 | local missing_pairs_file | ||
| 393 | missing_pairs_file=$(mktemp) | ||
| 394 | |||
| 395 | # Extract identifier and npub, convert npub to hex pubkey for matching | ||
| 396 | while IFS=' | ' read -r identifier npub rest; do | ||
| 397 | # Skip empty lines | ||
| 398 | [[ -z "$identifier" ]] && continue | ||
| 399 | # Trim whitespace | ||
| 400 | identifier=$(echo "$identifier" | xargs) | ||
| 401 | npub=$(echo "$npub" | xargs) | ||
| 402 | echo "${identifier}|${npub}" | ||
| 403 | done < "$missing_file" > "$missing_pairs_file" | ||
| 404 | |||
| 405 | local missing_count | ||
| 406 | missing_count=$(wc -l < "$missing_pairs_file") | ||
| 407 | missing_count="${missing_count//[^0-9]/}" | ||
| 408 | log_info " Found $missing_count missing announcements to filter for" | ||
| 409 | |||
| 410 | # Step 2: Get event IDs from production announcements for these pairs | ||
| 411 | # We need to match on 'd' tag (identifier) and pubkey | ||
| 412 | local missing_event_ids_file | ||
| 413 | missing_event_ids_file=$(mktemp) | ||
| 414 | |||
| 415 | # Create a lookup of identifier|npub -> event_id from production announcements | ||
| 416 | # The JSON has: id, pubkey (hex), tags (array with ["d", identifier]) | ||
| 417 | log_info " Extracting event IDs from production announcements..." | ||
| 418 | |||
| 419 | # Use jq to extract id, pubkey, and d-tag value, then filter | ||
| 420 | # Output format: event_id|identifier|pubkey_hex | ||
| 421 | # Note: The JSON file is NDJSON (newline-delimited), not an array | ||
| 422 | jq -r 'select(.kind == 30617) | | ||
| 423 | .id as $id | | ||
| 424 | .pubkey as $pubkey | | ||
| 425 | (.tags[] | select(.[0] == "d") | .[1]) as $dtag | | ||
| 426 | "\($id)|\($dtag)|\($pubkey)"' "$prod_announcements" > "$missing_event_ids_file.all" 2>/dev/null || { | ||
| 427 | log_warn "Failed to parse production announcements JSON" | ||
| 428 | rm -f "$missing_pairs_file" "$missing_event_ids_file" "$missing_event_ids_file.all" | ||
| 429 | return 0 | ||
| 430 | } | ||
| 431 | |||
| 432 | # Now filter to only event IDs for missing announcements | ||
| 433 | # We need to convert npub to hex pubkey for comparison | ||
| 434 | # npub is bech32, pubkey in JSON is hex | ||
| 435 | # For simplicity, we'll match on identifier only (d-tag) since it should be unique per pubkey | ||
| 436 | # Actually, we need both because same identifier can exist for different pubkeys | ||
| 437 | |||
| 438 | # Create a set of "identifier|pubkey_hex" to match against | ||
| 439 | # First, we need to convert npub to hex - but that requires a tool | ||
| 440 | # Alternative: match on identifier only and accept some false positives | ||
| 441 | # Better: use the comparison file which has npub, and match against announcements | ||
| 442 | |||
| 443 | # Let's match on identifier only for now (simpler, may have minor false positives) | ||
| 444 | # Extract just the identifiers from missing announcements | ||
| 445 | local missing_identifiers_file | ||
| 446 | missing_identifiers_file=$(mktemp) | ||
| 447 | cut -d'|' -f1 "$missing_pairs_file" | sort -u > "$missing_identifiers_file" | ||
| 448 | |||
| 449 | # Filter event IDs to only those with matching identifiers | ||
| 450 | while IFS='|' read -r event_id identifier pubkey_hex; do | ||
| 451 | if grep -qFx "$identifier" "$missing_identifiers_file"; then | ||
| 452 | echo "$event_id" | ||
| 453 | fi | ||
| 454 | done < "$missing_event_ids_file.all" | sort -u > "$missing_event_ids_file" | ||
| 455 | |||
| 456 | local event_id_count | ||
| 457 | event_id_count=$(wc -l < "$missing_event_ids_file") | ||
| 458 | event_id_count="${event_id_count//[^0-9]/}" | ||
| 459 | log_info " Found $event_id_count event IDs for missing announcements" | ||
| 460 | |||
| 461 | # Step 3: Filter parse failures to only those event IDs | ||
| 462 | local filtered_file | ||
| 463 | filtered_file=$(mktemp) | ||
| 464 | |||
| 465 | # Copy header lines | ||
| 466 | grep '^#' "$parse_failures_file" > "$filtered_file" | ||
| 467 | 321 | ||
| 468 | # Add a note about filtering | 322 | local event_id identifier pubkey_hex |
| 469 | echo "# Filtered to missing announcements only (--analysis-root)" >> "$filtered_file" | ||
| 470 | echo "# Analysis root: $analysis_root" >> "$filtered_file" | ||
| 471 | echo "# Missing announcements: $missing_count" >> "$filtered_file" | ||
| 472 | echo "# Matching event IDs: $event_id_count" >> "$filtered_file" | ||
| 473 | 323 | ||
| 474 | # Filter data lines - only include if event_id is in our list | 324 | # Extract event_id=VALUE (hex string) |
| 475 | local filtered_count=0 | 325 | event_id=$(echo "$line" | grep -oP 'event_id=\K[a-f0-9]+' || echo "") |
| 476 | while IFS=$'\t' read -r event_id kind reason repo npub; do | ||
| 477 | # Skip header lines (already copied) | ||
| 478 | [[ "$event_id" =~ ^# ]] && continue | ||
| 479 | |||
| 480 | # Check if this event_id is in our missing list | ||
| 481 | if grep -qFx "$event_id" "$missing_event_ids_file"; then | ||
| 482 | printf '%s\t%s\t%s\t%s\t%s\n' "$event_id" "$kind" "$reason" "$repo" "$npub" >> "$filtered_file" | ||
| 483 | filtered_count=$((filtered_count + 1)) | ||
| 484 | fi | ||
| 485 | done < "$parse_failures_file" | ||
| 486 | 326 | ||
| 487 | # Replace original with filtered version | 327 | # Extract identifier=VALUE (repo name) |
| 488 | mv "$filtered_file" "$parse_failures_file" | 328 | identifier=$(echo "$line" | grep -oP 'identifier=\K[^ ]+' || echo "") |
| 489 | 329 | ||
| 490 | # Cleanup temp files | 330 | # Extract pubkey=VALUE (hex string) |
| 491 | rm -f "$missing_pairs_file" "$missing_event_ids_file" "$missing_event_ids_file.all" "$missing_identifiers_file" | 331 | pubkey_hex=$(echo "$line" | grep -oP 'pubkey=\K[a-f0-9]+' || echo "") |
| 492 | 332 | ||
| 493 | log_info " Filtered from $(grep -v '^#' "$parse_failures_file" | wc -l | xargs) to $filtered_count parse failures" | 333 | # Only output if we have all required fields |
| 494 | log_success "Filtered to parse failures for missing announcements only" | 334 | if [[ -n "$event_id" && -n "$identifier" && -n "$pubkey_hex" ]]; then |
| 335 | printf '%s\t%s\t%s\n' "$event_id" "$identifier" "$pubkey_hex" | ||
| 336 | fi | ||
| 495 | } | 337 | } |
| 496 | 338 | ||
| 497 | # Main | 339 | # Main |
| @@ -509,7 +351,6 @@ main() { | |||
| 509 | since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "") | 351 | since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "") |
| 510 | local until_date="" | 352 | local until_date="" |
| 511 | local dry_run=false | 353 | local dry_run=false |
| 512 | local analysis_root="" | ||
| 513 | 354 | ||
| 514 | # Parse options | 355 | # Parse options |
| 515 | while [[ $# -gt 0 ]]; do | 356 | while [[ $# -gt 0 ]]; do |
| @@ -522,10 +363,6 @@ main() { | |||
| 522 | until_date="$2" | 363 | until_date="$2" |
| 523 | shift 2 | 364 | shift 2 |
| 524 | ;; | 365 | ;; |
| 525 | --analysis-root) | ||
| 526 | analysis_root="$2" | ||
| 527 | shift 2 | ||
| 528 | ;; | ||
| 529 | --dry-run) | 366 | --dry-run) |
| 530 | dry_run=true | 367 | dry_run=true |
| 531 | shift | 368 | shift |
| @@ -640,10 +477,11 @@ main() { | |||
| 640 | log_info "Extracting log entries..." | 477 | log_info "Extracting log entries..." |
| 641 | 478 | ||
| 642 | # Create temp files for intermediate results | 479 | # Create temp files for intermediate results |
| 643 | local temp_stderr temp_parse_fail temp_write_policy_rejection | 480 | local temp_stderr temp_parse_fail temp_write_policy_rejection temp_rejected_announcement |
| 644 | temp_stderr=$(mktemp) | 481 | temp_stderr=$(mktemp) |
| 645 | temp_parse_fail=$(mktemp) | 482 | temp_parse_fail=$(mktemp) |
| 646 | temp_write_policy_rejection=$(mktemp) | 483 | temp_write_policy_rejection=$(mktemp) |
| 484 | temp_rejected_announcement=$(mktemp) | ||
| 647 | 485 | ||
| 648 | # Extract [PARSE_FAIL] entries directly to temp file (streaming) | 486 | # Extract [PARSE_FAIL] entries directly to temp file (streaming) |
| 649 | log_info " Searching for [PARSE_FAIL] entries..." | 487 | log_info " Searching for [PARSE_FAIL] entries..." |
| @@ -661,17 +499,25 @@ main() { | |||
| 661 | log_info " Searching for write policy rejections..." | 499 | log_info " Searching for write policy rejections..." |
| 662 | eval "$journal_cmd" 2>/dev/null | grep 'Event rejected by write policy' | grep 'Invalid announcement' > "$temp_write_policy_rejection" || true | 500 | eval "$journal_cmd" 2>/dev/null | grep 'Event rejected by write policy' | grep 'Invalid announcement' > "$temp_write_policy_rejection" || true |
| 663 | 501 | ||
| 502 | # Extract "Added rejected announcement" entries for enrichment (streaming) | ||
| 503 | # These contain pubkey and identifier which we use to enrich write policy rejections | ||
| 504 | log_info " Searching for rejected announcement entries (for enrichment)..." | ||
| 505 | eval "$journal_cmd" 2>/dev/null | grep 'Added rejected announcement to two-tier index' > "$temp_rejected_announcement" || true | ||
| 506 | |||
| 664 | rm -f "$temp_stderr" | 507 | rm -f "$temp_stderr" |
| 665 | 508 | ||
| 666 | # Check if we found anything | 509 | # Check if we found anything |
| 667 | local parse_fail_line_count write_policy_line_count | 510 | local parse_fail_line_count write_policy_line_count rejected_announcement_line_count |
| 668 | parse_fail_line_count=$(wc -l < "$temp_parse_fail") | 511 | parse_fail_line_count=$(wc -l < "$temp_parse_fail") |
| 669 | parse_fail_line_count="${parse_fail_line_count//[^0-9]/}" | 512 | parse_fail_line_count="${parse_fail_line_count//[^0-9]/}" |
| 670 | write_policy_line_count=$(wc -l < "$temp_write_policy_rejection") | 513 | write_policy_line_count=$(wc -l < "$temp_write_policy_rejection") |
| 671 | write_policy_line_count="${write_policy_line_count//[^0-9]/}" | 514 | write_policy_line_count="${write_policy_line_count//[^0-9]/}" |
| 515 | rejected_announcement_line_count=$(wc -l < "$temp_rejected_announcement") | ||
| 516 | rejected_announcement_line_count="${rejected_announcement_line_count//[^0-9]/}" | ||
| 672 | 517 | ||
| 673 | log_info " Found $parse_fail_line_count [PARSE_FAIL] log lines" | 518 | log_info " Found $parse_fail_line_count [PARSE_FAIL] log lines" |
| 674 | log_info " Found $write_policy_line_count write policy rejection log lines" | 519 | log_info " Found $write_policy_line_count write policy rejection log lines" |
| 520 | log_info " Found $rejected_announcement_line_count rejected announcement log lines (for enrichment)" | ||
| 675 | 521 | ||
| 676 | local total_invalid_announcement_lines=$write_policy_line_count | 522 | local total_invalid_announcement_lines=$write_policy_line_count |
| 677 | 523 | ||
| @@ -704,7 +550,7 @@ main() { | |||
| 704 | echo "# This is expected if ngit-grasp logging improvements are not yet deployed." | 550 | echo "# This is expected if ngit-grasp logging improvements are not yet deployed." |
| 705 | } > "$output_file" | 551 | } > "$output_file" |
| 706 | 552 | ||
| 707 | rm -f "$temp_parse_fail" "$temp_write_policy_rejection" | 553 | rm -f "$temp_parse_fail" "$temp_write_policy_rejection" "$temp_rejected_announcement" |
| 708 | log_info "Created empty output file: $output_file" | 554 | log_info "Created empty output file: $output_file" |
| 709 | exit 0 | 555 | exit 0 |
| 710 | fi | 556 | fi |
| @@ -753,7 +599,22 @@ main() { | |||
| 753 | 599 | ||
| 754 | local invalid_announcement_count=$write_policy_count | 600 | local invalid_announcement_count=$write_policy_count |
| 755 | 601 | ||
| 756 | rm -f "$temp_parse_fail" "$temp_write_policy_rejection" | 602 | # Build enrichment lookup table from "Added rejected announcement" entries |
| 603 | local enrichment_lookup_file | ||
| 604 | enrichment_lookup_file=$(mktemp) | ||
| 605 | |||
| 606 | log_info " Building enrichment lookup table..." | ||
| 607 | if [[ "$rejected_announcement_line_count" -gt 0 ]]; then | ||
| 608 | while IFS= read -r line; do | ||
| 609 | local parsed | ||
| 610 | parsed=$(parse_rejected_announcement_line "$line") | ||
| 611 | if [[ -n "$parsed" ]]; then | ||
| 612 | echo "$parsed" >> "$enrichment_lookup_file" | ||
| 613 | fi | ||
| 614 | done < "$temp_rejected_announcement" | ||
| 615 | fi | ||
| 616 | |||
| 617 | rm -f "$temp_parse_fail" "$temp_write_policy_rejection" "$temp_rejected_announcement" | ||
| 757 | 618 | ||
| 758 | # Deduplicate by event_id (first column) - keep first occurrence | 619 | # Deduplicate by event_id (first column) - keep first occurrence |
| 759 | log_info " Deduplicating entries..." | 620 | log_info " Deduplicating entries..." |
| @@ -764,17 +625,18 @@ main() { | |||
| 764 | grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file" | 625 | grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file" |
| 765 | mv "$deduped_file" "$output_file" | 626 | mv "$deduped_file" "$output_file" |
| 766 | 627 | ||
| 767 | # Enrich with repo/npub from announcements.json if analysis root provided | 628 | # Deduplicate enrichment lookup table by event_id |
| 629 | if [[ -s "$enrichment_lookup_file" ]]; then | ||
| 630 | sort -t$'\t' -k1,1 -u "$enrichment_lookup_file" > "$enrichment_lookup_file.deduped" | ||
| 631 | mv "$enrichment_lookup_file.deduped" "$enrichment_lookup_file" | ||
| 632 | fi | ||
| 633 | |||
| 634 | # Enrich with repo/npub from "Added rejected announcement" log entries | ||
| 768 | # This is critical for usability - without it, action-required.txt shows | 635 | # This is critical for usability - without it, action-required.txt shows |
| 769 | # event_id|kind instead of repo|npub, making parse failures unidentifiable | 636 | # event_id|kind instead of repo|npub, making parse failures unidentifiable |
| 770 | if [[ -n "$analysis_root" ]]; then | 637 | enrich_with_repo_npub "$output_file" "$enrichment_lookup_file" |
| 771 | enrich_with_repo_npub "$output_file" "$analysis_root" | ||
| 772 | fi | ||
| 773 | 638 | ||
| 774 | # Filter to missing announcements only if analysis root provided | 639 | rm -f "$enrichment_lookup_file" |
| 775 | if [[ -n "$analysis_root" ]]; then | ||
| 776 | filter_to_missing_announcements "$output_file" "$analysis_root" | ||
| 777 | fi | ||
| 778 | 640 | ||
| 779 | # Count final entries (excluding header lines) | 641 | # Count final entries (excluding header lines) |
| 780 | local count | 642 | local count |
| @@ -789,15 +651,9 @@ main() { | |||
| 789 | log_info "=== Extraction Summary ===" | 651 | log_info "=== Extraction Summary ===" |
| 790 | log_info "Service: $service" | 652 | log_info "Service: $service" |
| 791 | log_info "Time range: ${since_date:-beginning} to ${until_date:-now}" | 653 | log_info "Time range: ${since_date:-beginning} to ${until_date:-now}" |
| 792 | if [[ -n "$analysis_root" ]]; then | ||
| 793 | log_info "Filtered to: missing announcements only" | ||
| 794 | fi | ||
| 795 | log_success "Extracted $count total entries" | 654 | log_success "Extracted $count total entries" |
| 796 | log_info " - [PARSE_FAIL] entries: $parse_fail_count" | 655 | log_info " - [PARSE_FAIL] entries: $parse_fail_count" |
| 797 | log_info " - Invalid announcement rejections: $invalid_announcement_count" | 656 | log_info " - Invalid announcement rejections: $invalid_announcement_count" |
| 798 | if [[ -n "$analysis_root" ]]; then | ||
| 799 | log_info " (filtered from original extraction)" | ||
| 800 | fi | ||
| 801 | echo "" | 657 | echo "" |
| 802 | log_info "Output file: $output_file" | 658 | log_info "Output file: $output_file" |
| 803 | 659 | ||