upleb.uk

Public git repos — served from a NIP-34 GRASP relay at git.upleb.uk

summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDanConwayDev <DanConwayDev@protonmail.com>2026-01-23 17:38:32 +0000
committerDanConwayDev <DanConwayDev@protonmail.com>2026-01-27 20:38:08 +0000
commit0e00db4decfa779c26c6c7648b2badcc5704e6f8 (patch)
treeb49213872325475aa546ef6a84770f6fbe0ecaea
parent26e3f24e491ac0b9a61eaa2831de250b68bd9d96 (diff)
Add --analysis-root filter to parse failures script
Filter parse failures to only those for announcements that are in production but missing from the archive. This eliminates noise from rejections of events from other relays that don't affect migration.

Before: 223 parse failures (all rejections from all relays)
After: 18 parse failures (only for missing announcements)

The filter works by:
1. Reading missing announcements from comparison data
2. Extracting event IDs from production announcements JSON
3. Filtering parse failures to only matching event IDs
-rwxr-xr-xdocs/how-to/migration-scripts/30-extract-parse-failures.sh178
1 files changed, 175 insertions, 3 deletions
diff --git a/docs/how-to/migration-scripts/30-extract-parse-failures.sh b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
index 7870c61..f821834 100755
--- a/docs/how-to/migration-scripts/30-extract-parse-failures.sh
+++ b/docs/how-to/migration-scripts/30-extract-parse-failures.sh
@@ -125,18 +125,25 @@ usage() {
125 echo " output-dir Directory to store extracted log data" 125 echo " output-dir Directory to store extracted log data"
126 echo "" 126 echo ""
127 echo "Options:" 127 echo "Options:"
128 echo " --since <date> Start date (default: 30 days ago)" 128 echo " --since <date> Start date (default: 30 days ago)"
129 echo " --until <date> End date (default: now)" 129 echo " --until <date> End date (default: now)"
130 echo " --dry-run Show what would be extracted without writing" 130 echo " --analysis-root <dir> Filter to only missing announcements from analysis"
131 echo " --dry-run Show what would be extracted without writing"
131 echo "" 132 echo ""
132 echo "Examples:" 133 echo "Examples:"
133 echo " $0 ngit-grasp.service output/logs" 134 echo " $0 ngit-grasp.service output/logs"
134 echo " $0 ngit-grasp.service output/logs --since '2026-01-01'" 135 echo " $0 ngit-grasp.service output/logs --since '2026-01-01'"
135 echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'" 136 echo " $0 ngit-grasp.service output/logs --since '2026-01-15' --until '2026-01-22'"
137 echo " $0 ngit-grasp.service output/logs --analysis-root /tmp/migration-analysis-20260123"
136 echo "" 138 echo ""
137 echo "Expected log formats:" 139 echo "Expected log formats:"
138 echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..." 140 echo " [PARSE_FAIL] kind=30618 event_id=abc123 reason=\"...\" repo=myrepo npub=npub1..."
139 echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..." 141 echo " Event rejected by write policy event_id=abc123 ... kind=30617 reason=Invalid announcement: ..."
142 echo ""
143 echo "Filtering with --analysis-root:"
144 echo " When provided, only parse failures for announcements that are in production"
145 echo " but missing from the archive will be included. This filters out rejections"
146 echo " for events from other relays that don't affect the migration."
140 exit 1 147 exit 1
141} 148}
142 149
@@ -204,6 +211,155 @@ parse_write_policy_rejection_line() {
204# the same event to be counted twice. Write policy logs contain the same 211# the same event to be counted twice. Write policy logs contain the same
205# events, so we don't lose any data by only extracting from that source. 212# events, so we don't lose any data by only extracting from that source.
206 213
# Filter parse failures to only those for missing announcements.
# Used when --analysis-root is provided, to scope results to the migration.
#
# Arguments:
#   $1 - parse failures file to filter (rewritten in place; tab-separated
#        rows whose first column is the event ID, '#' lines are headers)
#   $2 - analysis root directory containing comparison/ and prod/ subdirs
#
# Steps:
#   1. Read the identifiers of missing announcements from
#      comparison/complete-prod-missing-archive.txt
#      (line format: identifier | npub | prod=complete | archive=missing)
#   2. Read kind 30617 events from prod/raw/announcements.json (NDJSON) and
#      collect the IDs of those whose "d" tag is one of the identifiers
#   3. Keep only the parse failure rows whose event ID is in that set
#
# Matching is on identifier (d-tag) only. The comparison file carries a
# bech32 npub while the JSON carries a hex pubkey, and converting between
# them needs an extra tool, so the pubkey is not compared. The same
# identifier published by a different pubkey may therefore produce a few
# false positives.
#
# Returns:
#   0 always. When an input file or jq is missing, or the JSON cannot be
#   parsed, a warning is logged and the parse failures file is left
#   unfiltered.
filter_to_missing_announcements() {
    local parse_failures_file="$1"
    local analysis_root="$2"

    local missing_file="$analysis_root/comparison/complete-prod-missing-archive.txt"
    local prod_announcements="$analysis_root/prod/raw/announcements.json"

    # Validate required files exist
    if [[ ! -f "$missing_file" ]]; then
        log_warn "Missing announcements file not found: $missing_file"
        log_warn "Skipping filter - all parse failures will be included"
        return 0
    fi

    if [[ ! -f "$prod_announcements" ]]; then
        log_warn "Production announcements file not found: $prod_announcements"
        log_warn "Skipping filter - all parse failures will be included"
        return 0
    fi

    # Check if jq is available
    if ! command -v jq &> /dev/null; then
        log_warn "jq not found - cannot filter parse failures"
        log_warn "Install jq or run without --analysis-root"
        return 0
    fi

    log_info "Filtering parse failures to missing announcements only..."

    # All scratch files live in one private directory so that every exit
    # path can clean up with a single rm.
    local work_dir
    work_dir=$(mktemp -d) || {
        log_warn "Could not create a temporary directory"
        log_warn "Skipping filter - all parse failures will be included"
        return 0
    }

    local missing_identifiers="$work_dir/missing-identifiers.txt"
    local prod_events="$work_dir/prod-events.tsv"
    local missing_event_ids="$work_dir/missing-event-ids.txt"
    local matched_rows="$work_dir/matched-rows.tsv"
    local filtered_file="$work_dir/filtered.tsv"

    # Step 1: Extract the identifier (first '|' column) of each missing
    # announcement. Splitting on '|' alone and trimming afterwards keeps
    # identifiers that contain spaces intact.
    awk -F'|' '{
        id = $1
        gsub(/^[ \t]+|[ \t]+$/, "", id)
        if (id != "") print id
    }' < "$missing_file" > "$missing_identifiers"

    local missing_count
    missing_count=$(wc -l < "$missing_identifiers")
    missing_count="${missing_count//[^0-9]/}"
    log_info "  Found $missing_count missing announcements to filter for"

    # Step 2: Get event IDs from production announcements for those
    # identifiers. The JSON file is NDJSON (one event per line), not an array.
    # Output format: event_id<TAB>identifier
    log_info "  Extracting event IDs from production announcements..."

    # jq's stderr is left visible so the reason for a failure is not lost.
    if ! jq -r 'select(.kind == 30617)
        | .id as $id
        | (.tags[] | select(.[0] == "d") | .[1]) as $dtag
        | "\($id)\t\($dtag)"' "$prod_announcements" > "$prod_events"; then
        log_warn "Failed to parse production announcements JSON"
        rm -rf -- "$work_dir"
        return 0
    fi

    # FILENAME == ARGV[1] (rather than NR == FNR) stays correct when the
    # first file is empty. Everything after the first tab is the identifier.
    awk '
        FILENAME == ARGV[1] { wanted[$0] = 1; next }
        {
            tab = index($0, "\t")
            if (tab > 0 && (substr($0, tab + 1) in wanted)) {
                print substr($0, 1, tab - 1)
            }
        }
    ' "$missing_identifiers" "$prod_events" | sort -u > "$missing_event_ids"

    local event_id_count
    event_id_count=$(wc -l < "$missing_event_ids")
    event_id_count="${event_id_count//[^0-9]/}"
    log_info "  Found $event_id_count event IDs for missing announcements"

    # Step 3: Filter parse failures to only those event IDs.
    # Count the data rows BEFORE the file is rewritten so the summary can
    # report a real before/after pair.
    local original_count
    original_count=$(grep -c -v '^#' "$parse_failures_file" || true)
    original_count="${original_count//[^0-9]/}"
    original_count="${original_count:-0}"

    # Rows are emitted verbatim. Splitting on a literal tab in awk keeps empty
    # columns in place, unlike `IFS=$'\t' read`, which collapses adjacent tabs.
    # The user-supplied path is fed on stdin so awk never parses it as an
    # operand.
    awk -F'\t' '
        FILENAME == ARGV[1] { wanted[$0] = 1; next }
        /^#/ { next }
        ($1 in wanted) { print }
    ' "$missing_event_ids" - < "$parse_failures_file" > "$matched_rows"

    local filtered_count
    filtered_count=$(wc -l < "$matched_rows")
    filtered_count="${filtered_count//[^0-9]/}"

    {
        # Copy header lines; a file without any is not an error
        grep '^#' "$parse_failures_file" || true

        # Add a note about filtering
        echo "# Filtered to missing announcements only (--analysis-root)"
        echo "# Analysis root: $analysis_root"
        echo "# Missing announcements: $missing_count"
        echo "# Matching event IDs: $event_id_count"

        cat "$matched_rows"
    } > "$filtered_file"

    # Overwrite the contents instead of moving the temp file into place, so
    # the output file keeps its own permissions rather than mktemp's 0600.
    cat "$filtered_file" > "$parse_failures_file"

    # Cleanup temp files
    rm -rf -- "$work_dir"

    log_info "  Filtered from $original_count to $filtered_count parse failures"
    log_success "Filtered to parse failures for missing announcements only"
}
362
207# Main 363# Main
208main() { 364main() {
209 if [[ $# -lt 2 ]]; then 365 if [[ $# -lt 2 ]]; then
@@ -219,6 +375,7 @@ main() {
219 since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "") 375 since_date=$(date -d "30 days ago" "+%Y-%m-%d" 2>/dev/null || date -v-30d "+%Y-%m-%d" 2>/dev/null || echo "")
220 local until_date="" 376 local until_date=""
221 local dry_run=false 377 local dry_run=false
378 local analysis_root=""
222 379
223 # Parse options 380 # Parse options
224 while [[ $# -gt 0 ]]; do 381 while [[ $# -gt 0 ]]; do
@@ -231,6 +388,10 @@ main() {
231 until_date="$2" 388 until_date="$2"
232 shift 2 389 shift 2
233 ;; 390 ;;
391 --analysis-root)
392 analysis_root="$2"
393 shift 2
394 ;;
234 --dry-run) 395 --dry-run)
235 dry_run=true 396 dry_run=true
236 shift 397 shift
@@ -469,6 +630,11 @@ main() {
469 grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file" 630 grep -v '^#' "$output_file" | sort -t$'\t' -k1,1 -u >> "$deduped_file"
470 mv "$deduped_file" "$output_file" 631 mv "$deduped_file" "$output_file"
471 632
633 # Filter to missing announcements only if analysis root provided
634 if [[ -n "$analysis_root" ]]; then
635 filter_to_missing_announcements "$output_file" "$analysis_root"
636 fi
637
472 # Count final entries (excluding header lines) 638 # Count final entries (excluding header lines)
473 local count 639 local count
474 count=$(grep -v '^#' "$output_file" | wc -l) 640 count=$(grep -v '^#' "$output_file" | wc -l)
@@ -482,9 +648,15 @@ main() {
482 log_info "=== Extraction Summary ===" 648 log_info "=== Extraction Summary ==="
483 log_info "Service: $service" 649 log_info "Service: $service"
484 log_info "Time range: ${since_date:-beginning} to ${until_date:-now}" 650 log_info "Time range: ${since_date:-beginning} to ${until_date:-now}"
651 if [[ -n "$analysis_root" ]]; then
652 log_info "Filtered to: missing announcements only"
653 fi
485 log_success "Extracted $count total entries" 654 log_success "Extracted $count total entries"
486 log_info " - [PARSE_FAIL] entries: $parse_fail_count" 655 log_info " - [PARSE_FAIL] entries: $parse_fail_count"
487 log_info " - Invalid announcement rejections: $invalid_announcement_count" 656 log_info " - Invalid announcement rejections: $invalid_announcement_count"
657 if [[ -n "$analysis_root" ]]; then
658 log_info " (filtered from original extraction)"
659 fi
488 echo "" 660 echo ""
489 log_info "Output file: $output_file" 661 log_info "Output file: $output_file"
490 662