diff options
Diffstat (limited to 'docs/how-to/migration-scripts')
| -rwxr-xr-x | docs/how-to/migration-scripts/30-extract-parse-failures.sh | 178 |
1 files changed, 175 insertions, 3 deletions
diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh index 7870c61..f821834 100755 --- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh +++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh | |||
| @@ -125,18 +125,25 @@ usage() { | |||
| 125 | echo " output-dir Directory to store extracted log data" | 125 | echo " output-dir Directory to store extracted log data" |
| 126 | echo "" | 126 | echo "" |
| 127 | echo "Options:" | 127 | echo "Options:" |
| 128 | echo " --since <date> Start date (default: 30 days ago)" | 128 | echo " --since <date> Start date (default: 30 days ago)" |
| 129 | echo " --until <date> End date (default: now)" | 129 | echo " --until <date> End date (default: now)" |
| 130 | echo " --dry-run Show what would be extracted without writing" | 130 | echo " --analysis-root <dir> Filter to only missing announcements from analysis" |
| 131 | echo " --dry-run Show what would be extracted without writing" | ||
| 131 | echo "" | 132 | echo "" |
| 132 | echo "Examples:" | 133 | echo "Examples:" |
| 133 | echo " $0 ngit-grasp.service output/logs" | 134 | echo " $0 ngit-grasp.service output/logs" |
| 134 | echo " $0 ngit-grasp.service output/logs --since '2026-01-01'" | 135 | echo " $0 ngit-grasp.service output/logs --since '2026-01-01'" |
| 135 | echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'" | 136 | echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'" |
| 137 | echo " $0 ngit-grasp.service output/logs --analysis-root /tmp/migration-analysis-20260123" | ||
| 136 | echo "" | 138 | echo "" |
| 137 | echo "Expected log formats:" | 139 | echo "Expected log formats:" |
| 138 | echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..." | 140 | echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..." |
| 139 | echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..." | 141 | echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..." |
| 142 | echo "" | ||
| 143 | echo "Filtering with --analysis-root:" | ||
| 144 | echo " When provided, only parse failures for announcements that are in production" | ||
| 145 | echo " but missing from the archive will be included. This filters out rejections" | ||
| 146 | echo " for events from other relays that don't affect the migration." | ||
| 140 | exit 1 | 147 | exit 1 |
| 141 | } | 148 | } |
| 142 | 149 | ||
| @@ -204,6 +211,155 @@ parse_write_policy_rejection_line() { | |||
| 204 | # the same event to be counted twice. Write policy logs contain the same | 211 | # the same event to be counted twice. Write policy logs contain the same |
| 205 | # events, so we don't lose any data by only extracting from that source. | 212 | # events, so we don't lose any data by only extracting from that source. |
| 206 | 213 | ||
# Filter parse failures to only those for missing announcements.
# Used when --analysis-root is provided to scope results to the migration.
#
# Arguments:
#   $1 - parse failures file to filter (rewritten in place). Lines starting
#        with '#' are headers; data lines are tab-separated with the event ID
#        in the first column.
#   $2 - analysis root directory containing comparison/ and prod/ subdirs
#
# Files read under $2:
#   comparison/complete-prod-missing-archive.txt
#       one "identifier | npub | ..." line per missing announcement
#   prod/raw/announcements.json
#       NDJSON (one event per line); for kind 30617 events the "d" tag value
#       is used as the identifier
#
# Matching is done on the identifier (d-tag) only. The missing list carries an
# npub (bech32) while the JSON carries a hex pubkey, and converting between
# them needs an extra tool, so an identical identifier published by a
# different pubkey is also accepted (minor false positives).
#
# Returns:
#   0 when the file was filtered, or when filtering was skipped with a warning
#     (input file missing, jq missing, JSON unparsable, no temp directory).
#   1 when the filtered result could not be written back to $1.
filter_to_missing_announcements() {
    local parse_failures_file="$1"
    local analysis_root="$2"

    local missing_file="$analysis_root/comparison/complete-prod-missing-archive.txt"
    local prod_announcements="$analysis_root/prod/raw/announcements.json"

    # Validate required files exist
    if [[ ! -f "$missing_file" ]]; then
        log_warn "Missing announcements file not found: $missing_file"
        log_warn "Skipping filter - all parse failures will be included"
        return 0
    fi

    if [[ ! -f "$prod_announcements" ]]; then
        log_warn "Production announcements file not found: $prod_announcements"
        log_warn "Skipping filter - all parse failures will be included"
        return 0
    fi

    # Check if jq is available
    if ! command -v jq &> /dev/null; then
        log_warn "jq not found - cannot filter parse failures"
        log_warn "Install jq or run without --analysis-root"
        return 0
    fi

    log_info "Filtering parse failures to missing announcements only..."

    # All intermediate files live in one private directory so every exit path
    # can clean up with a single rm.
    local work_dir
    work_dir=$(mktemp -d) || {
        log_warn "Could not create temporary directory"
        log_warn "Skipping filter - all parse failures will be included"
        return 0
    }

    # Step 1: Collect the identifiers of the missing announcements.
    # The identifier is everything before the first '|', trimmed, so
    # identifiers containing spaces survive intact. Input is fed on stdin so a
    # path containing '=' is never mistaken for an awk variable assignment.
    awk -F'|' '
        {
            id = $1
            gsub(/^[ \t]+|[ \t]+$/, "", id)
            if (id != "") print id
        }
    ' < "$missing_file" > "$work_dir/identifiers.all"

    local missing_count
    missing_count=$(wc -l < "$work_dir/identifiers.all")
    missing_count="${missing_count//[^0-9]/}"
    log_info "  Found $missing_count missing announcements to filter for"

    sort -u "$work_dir/identifiers.all" > "$work_dir/identifiers"

    # Step 2: Map production announcements to "event_id<TAB>identifier".
    # A tab separator is used because a d-tag may legitimately contain '|'.
    log_info "  Extracting event IDs from production announcements..."

    if ! jq -r 'select(.kind == 30617)
            | .id as $id
            | (.tags[] | select(.[0] == "d") | .[1]) as $dtag
            | "\($id)\t\($dtag)"' \
            "$prod_announcements" > "$work_dir/events"; then
        log_warn "Failed to parse production announcements JSON"
        rm -rf -- "$work_dir"
        return 0
    fi

    # Keep the event IDs whose identifier is in the missing set. The
    # identifier is everything after the first tab of each event line.
    awk -F'\t' '
        FILENAME == ARGV[1] { wanted[$0] = 1; next }
        {
            dtag = substr($0, index($0, "\t") + 1)
            if (dtag in wanted) print $1
        }
    ' "$work_dir/identifiers" "$work_dir/events" \
        | sort -u > "$work_dir/event_ids"

    local event_id_count
    event_id_count=$(wc -l < "$work_dir/event_ids")
    event_id_count="${event_id_count//[^0-9]/}"
    log_info "  Found $event_id_count event IDs for missing announcements"

    # Step 3: Filter parse failures to only those event IDs.
    # The original count must be taken before the file is replaced.
    # grep -c exits 1 when it counts zero lines; '|| true' keeps that from
    # aborting a script running under 'set -e'.
    local original_count
    original_count=$(grep -c -v '^#' -- "$parse_failures_file" || true)
    original_count="${original_count//[^0-9]/}"

    {
        # Copy header lines (there may be none)
        grep '^#' -- "$parse_failures_file" || true

        # Add a note about filtering
        printf '%s\n' "# Filtered to missing announcements only (--analysis-root)"
        printf '%s\n' "# Analysis root: $analysis_root"
        printf '%s\n' "# Missing announcements: $missing_count"
        printf '%s\n' "# Matching event IDs: $event_id_count"

        # Data lines are passed through byte-for-byte, so empty columns are
        # preserved instead of being collapsed by shell word splitting.
        awk -F'\t' '
            FILENAME == ARGV[1] { keep[$0] = 1; next }
            /^#/ { next }
            ($1 in keep) { print }
        ' "$work_dir/event_ids" - < "$parse_failures_file"
    } > "$work_dir/filtered"

    local filtered_count
    filtered_count=$(grep -c -v '^#' -- "$work_dir/filtered" || true)
    filtered_count="${filtered_count//[^0-9]/}"

    # Overwrite the contents rather than mv-ing the temp file into place, so
    # the results file keeps its own permissions instead of mktemp's 0600.
    if ! cp -- "$work_dir/filtered" "$parse_failures_file"; then
        log_warn "Could not write filtered results to: $parse_failures_file"
        rm -rf -- "$work_dir"
        return 1
    fi

    # Cleanup temp files
    rm -rf -- "$work_dir"

    log_info "  Filtered from $original_count to $filtered_count parse failures"
    log_success "Filtered to parse failures for missing announcements only"
}
| 362 | |||
| 207 | # Main | 363 | # Main |
| 208 | main() { | 364 | main() { |
| 209 | if [[ $# -lt 2 ]]; then | 365 | if [[ $# -lt 2 ]]; then |
| @@ -219,6 +375,7 @@ main() { | |||
| 219 | since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "") | 375 | since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "") |
| 220 | local until_date="" | 376 | local until_date="" |
| 221 | local dry_run=false | 377 | local dry_run=false |
| 378 | local analysis_root="" | ||
| 222 | 379 | ||
| 223 | # Parse options | 380 | # Parse options |
| 224 | while [[ $# -gt 0 ]]; do | 381 | while [[ $# -gt 0 ]]; do |
| @@ -231,6 +388,10 @@ main() { | |||
| 231 | until_date="$2" | 388 | until_date="$2" |
| 232 | shift 2 | 389 | shift 2 |
| 233 | ;; | 390 | ;; |
| 391 | --analysis-root) | ||
| 392 | analysis_root="$2" | ||
| 393 | shift 2 | ||
| 394 | ;; | ||
| 234 | --dry-run) | 395 | --dry-run) |
| 235 | dry_run=true | 396 | dry_run=true |
| 236 | shift | 397 | shift |
| @@ -469,6 +630,11 @@ main() { | |||
| 469 | grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file" | 630 | grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file" |
| 470 | mv "$deduped_file" "$output_file" | 631 | mv "$deduped_file" "$output_file" |
| 471 | 632 | ||
| 633 | # Filter to missing announcements only if analysis root provided | ||
| 634 | if [[ -n "$analysis_root" ]]; then | ||
| 635 | filter_to_missing_announcements "$output_file" "$analysis_root" | ||
| 636 | fi | ||
| 637 | |||
| 472 | # Count final entries (excluding header lines) | 638 | # Count final entries (excluding header lines) |
| 473 | local count | 639 | local count |
| 474 | count=$(grep -v '^#' "$output_file" | wc -l) | 640 | count=$(grep -v '^#' "$output_file" | wc -l) |
| @@ -482,9 +648,15 @@ main() { | |||
| 482 | log_info "=== Extraction Summary ===" | 648 | log_info "=== Extraction Summary ===" |
| 483 | log_info "Service: $service" | 649 | log_info "Service: $service" |
| 484 | log_info "Time range: ${since_date:-beginning} to ${until_date:-now}" | 650 | log_info "Time range: ${since_date:-beginning} to ${until_date:-now}" |
| 651 | if [[ -n "$analysis_root" ]]; then | ||
| 652 | log_info "Filtered to: missing announcements only" | ||
| 653 | fi | ||
| 485 | log_success "Extracted $count total entries" | 654 | log_success "Extracted $count total entries" |
| 486 | log_info " - [PARSE_FAIL] entries: $parse_fail_count" | 655 | log_info " - [PARSE_FAIL] entries: $parse_fail_count" |
| 487 | log_info " - Invalid announcement rejections: $invalid_announcement_count" | 656 | log_info " - Invalid announcement rejections: $invalid_announcement_count" |
| 657 | if [[ -n "$analysis_root" ]]; then | ||
| 658 | log_info " (filtered from original extraction)" | ||
| 659 | fi | ||
| 488 | echo "" | 660 | echo "" |
| 489 | log_info "Output file: $output_file" | 661 | log_info "Output file: $output_file" |
| 490 | 662 | ||