diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2026-02-18 20:25:02 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2026-02-18 20:25:02 +0000 |
| commit | 061589cd88d0480dc7cb0b9eb19a3910293ceb56 (patch) | |
| tree | 549d33eb3dac9dd7068745015663ef36ad63aaa5 /src | |
| parent | fcff4541e1f36b6575596c353637b25aeae9bdcf (diff) | |
fix: improve mbox patch parser resilience for optional tag fallback
- Use mailparse crate to handle RFC 2047 encoded-words (Q/B encoding)
and RFC 2822 header folding in Subject and From headers
- Fix email signature separator check: use exact match 'line == "-- "'
instead of starts_with to avoid false positives on body lines
- Remove dead/incorrect asctime parsing in committer date extraction;
delete extract_committer_date_from_mbox and set committer_timestamp to
None unconditionally (falls back to author_timestamp)
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib/mbox_parser.rs | 163 |
1 files changed, 135 insertions, 28 deletions
diff --git a/src/lib/mbox_parser.rs b/src/lib/mbox_parser.rs index 40603b1..fd2f8ed 100644 --- a/src/lib/mbox_parser.rs +++ b/src/lib/mbox_parser.rs | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | use anyhow::{Context, Result, bail}; | 1 | use anyhow::{Context, Result, bail}; |
| 2 | use chrono::{DateTime, Datelike}; | 2 | use chrono::DateTime; |
| 3 | use mailparse::{MailHeaderMap, parse_headers}; | ||
| 3 | 4 | ||
| 4 | #[derive(Debug, Clone, PartialEq)] | 5 | #[derive(Debug, Clone, PartialEq)] |
| 5 | pub struct PatchMetadata { | 6 | pub struct PatchMetadata { |
| @@ -17,7 +18,7 @@ pub fn parse_mbox_patch(content: &str) -> Result<PatchMetadata> { | |||
| 17 | let commit_id = extract_commit_id_from_mbox(content)?; | 18 | let commit_id = extract_commit_id_from_mbox(content)?; |
| 18 | let (author_name, author_email) = extract_author_from_from_header(content)?; | 19 | let (author_name, author_email) = extract_author_from_from_header(content)?; |
| 19 | let (author_timestamp, author_offset_minutes) = extract_date_from_header(content)?; | 20 | let (author_timestamp, author_offset_minutes) = extract_date_from_header(content)?; |
| 20 | let committer_timestamp = extract_committer_date_from_mbox(content)?; | 21 | let committer_timestamp = None; |
| 21 | let subject = extract_subject(content)?; | 22 | let subject = extract_subject(content)?; |
| 22 | let body = extract_commit_message_body(content)?; | 23 | let body = extract_commit_message_body(content)?; |
| 23 | 24 | ||
| @@ -48,7 +49,33 @@ fn extract_commit_id_from_mbox(content: &str) -> Result<String> { | |||
| 48 | Ok(parts[1].to_string()) | 49 | Ok(parts[1].to_string()) |
| 49 | } | 50 | } |
| 50 | 51 | ||
| 52 | /// Extract the header section from the mbox content (everything after the first | ||
| 53 | /// line up to the first blank line that ends the headers). | ||
| 54 | fn extract_header_section(content: &str) -> &str { | ||
| 55 | // Skip the mbox envelope line ("From <sha> <date>"), then pass the rest | ||
| 56 | // to mailparse which understands where headers end. | ||
| 57 | let after_envelope = content | ||
| 58 | .find('\n') | ||
| 59 | .map(|pos| &content[pos + 1..]) | ||
| 60 | .unwrap_or(""); | ||
| 61 | // Return only up to (and including) the blank line that terminates headers, | ||
| 62 | // so mailparse doesn't try to parse the diff body. | ||
| 63 | let header_end = after_envelope | ||
| 64 | .find("\n\n") | ||
| 65 | .map(|pos| pos + 2) | ||
| 66 | .unwrap_or(after_envelope.len()); | ||
| 67 | &after_envelope[..header_end] | ||
| 68 | } | ||
| 69 | |||
| 51 | fn extract_author_from_from_header(content: &str) -> Result<(String, String)> { | 70 | fn extract_author_from_from_header(content: &str) -> Result<(String, String)> { |
| 71 | let header_bytes = extract_header_section(content).as_bytes(); | ||
| 72 | if let Ok((headers, _)) = parse_headers(header_bytes) { | ||
| 73 | if let Some(from_value) = headers.get_first_value("From") { | ||
| 74 | return parse_from_header_value(&from_value); | ||
| 75 | } | ||
| 76 | } | ||
| 77 | |||
| 78 | // Fallback: manual search | ||
| 52 | let from_line = content | 79 | let from_line = content |
| 53 | .lines() | 80 | .lines() |
| 54 | .find(|line| line.starts_with("From:")) | 81 | .find(|line| line.starts_with("From:")) |
| @@ -105,34 +132,16 @@ fn parse_rfc2822_date(value: &str) -> Result<(i64, i32)> { | |||
| 105 | Ok((timestamp, offset_minutes)) | 132 | Ok((timestamp, offset_minutes)) |
| 106 | } | 133 | } |
| 107 | 134 | ||
| 108 | fn extract_committer_date_from_mbox(content: &str) -> Result<Option<i64>> { | 135 | fn extract_subject(content: &str) -> Result<String> { |
| 109 | let first_line = content.lines().next().context("patch content is empty")?; | 136 | // Use mailparse to handle RFC 2047 encoded-words and RFC 2822 header folding. |
| 110 | 137 | let header_bytes = extract_header_section(content).as_bytes(); | |
| 111 | let parts: Vec<&str> = first_line.split_whitespace().collect(); | 138 | if let Ok((headers, _)) = parse_headers(header_bytes) { |
| 112 | 139 | if let Some(subject_value) = headers.get_first_value("Subject") { | |
| 113 | if parts.len() >= 6 { | 140 | return Ok(cleanup_subject(&subject_value)); |
| 114 | let date_str = parts[3..6].join(" "); | ||
| 115 | if let Ok(dt) = DateTime::parse_from_rfc2822(&date_str) { | ||
| 116 | return Ok(Some(dt.timestamp())); | ||
| 117 | } | ||
| 118 | } | ||
| 119 | |||
| 120 | if parts.len() >= 7 { | ||
| 121 | let date_str = format!("{} {} {}", parts[3], parts[4], parts[5]); | ||
| 122 | if let Ok(dt) = chrono::DateTime::parse_from_str(&date_str, "%a %b %d") { | ||
| 123 | if let Ok(year) = parts[6].parse::<i32>() { | ||
| 124 | let with_year = dt.with_year(year); | ||
| 125 | if let Some(dt_with_year) = with_year { | ||
| 126 | return Ok(Some(dt_with_year.timestamp())); | ||
| 127 | } | ||
| 128 | } | ||
| 129 | } | 141 | } |
| 130 | } | 142 | } |
| 131 | 143 | ||
| 132 | Ok(None) | 144 | // Fallback: manual single-line extraction. |
| 133 | } | ||
| 134 | |||
| 135 | fn extract_subject(content: &str) -> Result<String> { | ||
| 136 | let subject_line = content | 145 | let subject_line = content |
| 137 | .lines() | 146 | .lines() |
| 138 | .find(|line| line.starts_with("Subject:")) | 147 | .find(|line| line.starts_with("Subject:")) |
| @@ -200,7 +209,10 @@ fn extract_commit_message_body(content: &str) -> Result<String> { | |||
| 200 | break; | 209 | break; |
| 201 | } | 210 | } |
| 202 | 211 | ||
| 203 | if line.starts_with("-- ") || line.starts_with("--\n") { | 212 | // The email signature separator is exactly "-- " (dash dash space, nothing |
| 213 | // after). Lines that merely start with "-- " followed by other text are | ||
| 214 | // body content. | ||
| 215 | if line == "-- " { | ||
| 204 | break; | 216 | break; |
| 205 | } | 217 | } |
| 206 | 218 | ||
| @@ -369,6 +381,58 @@ Body | |||
| 369 | } | 381 | } |
| 370 | 382 | ||
| 371 | #[test] | 383 | #[test] |
| 384 | fn parse_subject_folded_rfc2822() { | ||
| 385 | // RFC 2822 header folding: continuation lines start with whitespace. | ||
| 386 | let patch = "\ | ||
| 387 | From abc123 Mon Sep 17 00:00:00 2001 | ||
| 388 | From: Joe <joe@example.com> | ||
| 389 | Date: Thu, 1 Jan 1970 00:00:00 +0000 | ||
| 390 | Subject: [PATCH] fix: this is a very long commit message subject line | ||
| 391 | that has been folded across two lines by RFC 2822 rules | ||
| 392 | |||
| 393 | Body | ||
| 394 | "; | ||
| 395 | let subject = extract_subject(patch).unwrap(); | ||
| 396 | assert_eq!( | ||
| 397 | subject, | ||
| 398 | "fix: this is a very long commit message subject line that has been folded across two lines by RFC 2822 rules" | ||
| 399 | ); | ||
| 400 | } | ||
| 401 | |||
| 402 | #[test] | ||
| 403 | fn parse_subject_mime_q_encoded() { | ||
| 404 | // RFC 2047 Q-encoding: =?UTF-8?q?...?= | ||
| 405 | let patch = "\ | ||
| 406 | From abc123 Mon Sep 17 00:00:00 2001 | ||
| 407 | From: Joe <joe@example.com> | ||
| 408 | Date: Thu, 1 Jan 1970 00:00:00 +0000 | ||
| 409 | Subject: [PATCH] =?UTF-8?q?fix=3A_add_=E2=9C=93_check?= | ||
| 410 | |||
| 411 | Body | ||
| 412 | "; | ||
| 413 | let subject = extract_subject(patch).unwrap(); | ||
| 414 | // Q-decoded: "fix: add ✓ check" | ||
| 415 | assert_eq!(subject, "fix: add \u{2713} check"); | ||
| 416 | } | ||
| 417 | |||
| 418 | #[test] | ||
| 419 | fn parse_subject_mime_b_encoded() { | ||
| 420 | // RFC 2047 B-encoding: =?UTF-8?b?...?= (base64) | ||
| 421 | // "fix: résumé" base64 encoded | ||
| 422 | let patch = "\ | ||
| 423 | From abc123 Mon Sep 17 00:00:00 2001 | ||
| 424 | From: Joe <joe@example.com> | ||
| 425 | Date: Thu, 1 Jan 1970 00:00:00 +0000 | ||
| 426 | Subject: [PATCH] =?UTF-8?b?Zml4OiByw6lzdW3DqQ==?= | ||
| 427 | |||
| 428 | Body | ||
| 429 | "; | ||
| 430 | let subject = extract_subject(patch).unwrap(); | ||
| 431 | // B-decoded: "fix: résumé" | ||
| 432 | assert_eq!(subject, "fix: r\u{e9}sum\u{e9}"); | ||
| 433 | } | ||
| 434 | |||
| 435 | #[test] | ||
| 372 | fn parse_body() { | 436 | fn parse_body() { |
| 373 | let patch = sample_patch(); | 437 | let patch = sample_patch(); |
| 374 | let body = extract_commit_message_body(&patch).unwrap(); | 438 | let body = extract_commit_message_body(&patch).unwrap(); |
| @@ -395,6 +459,48 @@ diff --git a/file.txt b/file.txt | |||
| 395 | } | 459 | } |
| 396 | 460 | ||
| 397 | #[test] | 461 | #[test] |
| 462 | fn parse_body_stops_at_exact_email_sig_separator() { | ||
| 463 | // "-- " (dash dash space, nothing after) is the email sig separator. | ||
| 464 | let patch = "\ | ||
| 465 | From abc123 Mon Sep 17 00:00:00 2001 | ||
| 466 | From: Joe <joe@example.com> | ||
| 467 | Date: Thu, 1 Jan 1970 00:00:00 +0000 | ||
| 468 | Subject: [PATCH] test | ||
| 469 | |||
| 470 | This is the body. | ||
| 471 | -- | ||
| 472 | libgit2 1.9.1 | ||
| 473 | |||
| 474 | diff --git a/file.txt b/file.txt | ||
| 475 | "; | ||
| 476 | let body = extract_commit_message_body(patch).unwrap(); | ||
| 477 | assert_eq!(body, "This is the body."); | ||
| 478 | } | ||
| 479 | |||
| 480 | #[test] | ||
| 481 | fn parse_body_does_not_stop_at_double_dash_with_text() { | ||
| 482 | // "-- some text" must NOT be treated as an email sig separator. | ||
| 483 | let patch = "\ | ||
| 484 | From abc123 Mon Sep 17 00:00:00 2001 | ||
| 485 | From: Joe <joe@example.com> | ||
| 486 | Date: Thu, 1 Jan 1970 00:00:00 +0000 | ||
| 487 | Subject: [PATCH] test | ||
| 488 | |||
| 489 | This is the body. | ||
| 490 | -- some CLI flag description | ||
| 491 | More body text. | ||
| 492 | |||
| 493 | --- | ||
| 494 | diff --git a/file.txt b/file.txt | ||
| 495 | "; | ||
| 496 | let body = extract_commit_message_body(patch).unwrap(); | ||
| 497 | assert_eq!( | ||
| 498 | body, | ||
| 499 | "This is the body.\n-- some CLI flag description\nMore body text." | ||
| 500 | ); | ||
| 501 | } | ||
| 502 | |||
| 503 | #[test] | ||
| 398 | fn parse_full_metadata() { | 504 | fn parse_full_metadata() { |
| 399 | let patch = sample_patch(); | 505 | let patch = sample_patch(); |
| 400 | let metadata = parse_mbox_patch(&patch).unwrap(); | 506 | let metadata = parse_mbox_patch(&patch).unwrap(); |
| @@ -407,6 +513,7 @@ diff --git a/file.txt b/file.txt | |||
| 407 | assert_eq!(metadata.author_email, "joe.bloggs@pm.me"); | 513 | assert_eq!(metadata.author_email, "joe.bloggs@pm.me"); |
| 408 | assert_eq!(metadata.author_timestamp, 0); | 514 | assert_eq!(metadata.author_timestamp, 0); |
| 409 | assert_eq!(metadata.author_offset_minutes, 0); | 515 | assert_eq!(metadata.author_offset_minutes, 0); |
| 516 | assert_eq!(metadata.committer_timestamp, None); | ||
| 410 | assert_eq!(metadata.subject, "add t2.md"); | 517 | assert_eq!(metadata.subject, "add t2.md"); |
| 411 | assert_eq!( | 518 | assert_eq!( |
| 412 | metadata.body, | 519 | metadata.body, |