upleb.uk

Public git repos — served from a NIP-34 GRASP relay at git.upleb.uk

summary | refs | log | tree | commit | diff
path: root/src/lib/mbox_parser.rs
diff options
context:
space:
mode:
author DanConwayDev <DanConwayDev@protonmail.com> 2026-02-18 20:25:02 +0000
committer DanConwayDev <DanConwayDev@protonmail.com> 2026-02-18 20:25:02 +0000
commit 061589cd88d0480dc7cb0b9eb19a3910293ceb56 (patch)
tree 549d33eb3dac9dd7068745015663ef36ad63aaa5 /src/lib/mbox_parser.rs
parent fcff4541e1f36b6575596c353637b25aeae9bdcf (diff)
fix: improve mbox patch parser resilience for optional tag fallback
- Use the mailparse crate to handle RFC 2047 encoded-words (Q/B encoding) and RFC 2822 header folding in the Subject and From headers.
- Fix the email signature separator check: use the exact match `line == "-- "` instead of `starts_with`, to avoid false positives on body lines that merely begin with "-- ".
- Remove the dead/incorrect asctime parsing in committer date extraction; the committer timestamp is now always `None` (callers fall back to `author_timestamp`).
Diffstat (limited to 'src/lib/mbox_parser.rs')
-rw-r--r-- src/lib/mbox_parser.rs 163
1 file changed, 135 insertions, 28 deletions
diff --git a/src/lib/mbox_parser.rs b/src/lib/mbox_parser.rs
index 40603b1..fd2f8ed 100644
--- a/src/lib/mbox_parser.rs
+++ b/src/lib/mbox_parser.rs
@@ -1,5 +1,6 @@
1use anyhow::{Context, Result, bail}; 1use anyhow::{Context, Result, bail};
2use chrono::{DateTime, Datelike}; 2use chrono::DateTime;
3use mailparse::{MailHeaderMap, parse_headers};
3 4
4#[derive(Debug, Clone, PartialEq)] 5#[derive(Debug, Clone, PartialEq)]
5pub struct PatchMetadata { 6pub struct PatchMetadata {
@@ -17,7 +18,7 @@ pub fn parse_mbox_patch(content: &str) -> Result<PatchMetadata> {
17 let commit_id = extract_commit_id_from_mbox(content)?; 18 let commit_id = extract_commit_id_from_mbox(content)?;
18 let (author_name, author_email) = extract_author_from_from_header(content)?; 19 let (author_name, author_email) = extract_author_from_from_header(content)?;
19 let (author_timestamp, author_offset_minutes) = extract_date_from_header(content)?; 20 let (author_timestamp, author_offset_minutes) = extract_date_from_header(content)?;
20 let committer_timestamp = extract_committer_date_from_mbox(content)?; 21 let committer_timestamp = None;
21 let subject = extract_subject(content)?; 22 let subject = extract_subject(content)?;
22 let body = extract_commit_message_body(content)?; 23 let body = extract_commit_message_body(content)?;
23 24
@@ -48,7 +49,33 @@ fn extract_commit_id_from_mbox(content: &str) -> Result<String> {
48 Ok(parts[1].to_string()) 49 Ok(parts[1].to_string())
49} 50}
50 51
52/// Extract the header section from the mbox content (everything after the first
53/// line up to the first blank line that ends the headers).
54fn extract_header_section(content: &str) -> &str {
55 // Skip the mbox envelope line ("From <sha> <date>"), then pass the rest
56 // to mailparse which understands where headers end.
57 let after_envelope = content
58 .find('\n')
59 .map(|pos| &content[pos + 1..])
60 .unwrap_or("");
61 // Return only up to (and including) the blank line that terminates headers,
62 // so mailparse doesn't try to parse the diff body.
63 let header_end = after_envelope
64 .find("\n\n")
65 .map(|pos| pos + 2)
66 .unwrap_or(after_envelope.len());
67 &after_envelope[..header_end]
68}
69
51fn extract_author_from_from_header(content: &str) -> Result<(String, String)> { 70fn extract_author_from_from_header(content: &str) -> Result<(String, String)> {
71 let header_bytes = extract_header_section(content).as_bytes();
72 if let Ok((headers, _)) = parse_headers(header_bytes) {
73 if let Some(from_value) = headers.get_first_value("From") {
74 return parse_from_header_value(&from_value);
75 }
76 }
77
78 // Fallback: manual search
52 let from_line = content 79 let from_line = content
53 .lines() 80 .lines()
54 .find(|line| line.starts_with("From:")) 81 .find(|line| line.starts_with("From:"))
@@ -105,34 +132,16 @@ fn parse_rfc2822_date(value: &str) -> Result<(i64, i32)> {
105 Ok((timestamp, offset_minutes)) 132 Ok((timestamp, offset_minutes))
106} 133}
107 134
108fn extract_committer_date_from_mbox(content: &str) -> Result<Option<i64>> { 135fn extract_subject(content: &str) -> Result<String> {
109 let first_line = content.lines().next().context("patch content is empty")?; 136 // Use mailparse to handle RFC 2047 encoded-words and RFC 2822 header folding.
110 137 let header_bytes = extract_header_section(content).as_bytes();
111 let parts: Vec<&str> = first_line.split_whitespace().collect(); 138 if let Ok((headers, _)) = parse_headers(header_bytes) {
112 139 if let Some(subject_value) = headers.get_first_value("Subject") {
113 if parts.len() >= 6 { 140 return Ok(cleanup_subject(&subject_value));
114 let date_str = parts[3..6].join(" ");
115 if let Ok(dt) = DateTime::parse_from_rfc2822(&date_str) {
116 return Ok(Some(dt.timestamp()));
117 }
118 }
119
120 if parts.len() >= 7 {
121 let date_str = format!("{} {} {}", parts[3], parts[4], parts[5]);
122 if let Ok(dt) = chrono::DateTime::parse_from_str(&date_str, "%a %b %d") {
123 if let Ok(year) = parts[6].parse::<i32>() {
124 let with_year = dt.with_year(year);
125 if let Some(dt_with_year) = with_year {
126 return Ok(Some(dt_with_year.timestamp()));
127 }
128 }
129 } 141 }
130 } 142 }
131 143
132 Ok(None) 144 // Fallback: manual single-line extraction.
133}
134
135fn extract_subject(content: &str) -> Result<String> {
136 let subject_line = content 145 let subject_line = content
137 .lines() 146 .lines()
138 .find(|line| line.starts_with("Subject:")) 147 .find(|line| line.starts_with("Subject:"))
@@ -200,7 +209,10 @@ fn extract_commit_message_body(content: &str) -> Result<String> {
200 break; 209 break;
201 } 210 }
202 211
203 if line.starts_with("-- ") || line.starts_with("--\n") { 212 // The email signature separator is exactly "-- " (dash dash space, nothing
213 // after). Lines that merely start with "-- " followed by other text are
214 // body content.
215 if line == "-- " {
204 break; 216 break;
205 } 217 }
206 218
@@ -369,6 +381,58 @@ Body
369 } 381 }
370 382
371 #[test] 383 #[test]
384 fn parse_subject_folded_rfc2822() {
385 // RFC 2822 header folding: continuation lines start with whitespace.
386 let patch = "\
387From abc123 Mon Sep 17 00:00:00 2001
388From: Joe <joe@example.com>
389Date: Thu, 1 Jan 1970 00:00:00 +0000
390Subject: [PATCH] fix: this is a very long commit message subject line
391 that has been folded across two lines by RFC 2822 rules
392
393Body
394";
395 let subject = extract_subject(patch).unwrap();
396 assert_eq!(
397 subject,
398 "fix: this is a very long commit message subject line that has been folded across two lines by RFC 2822 rules"
399 );
400 }
401
402 #[test]
403 fn parse_subject_mime_q_encoded() {
404 // RFC 2047 Q-encoding: =?UTF-8?q?...?=
405 let patch = "\
406From abc123 Mon Sep 17 00:00:00 2001
407From: Joe <joe@example.com>
408Date: Thu, 1 Jan 1970 00:00:00 +0000
409Subject: [PATCH] =?UTF-8?q?fix=3A_add_=E2=9C=93_check?=
410
411Body
412";
413 let subject = extract_subject(patch).unwrap();
414 // Q-decoded: "fix: add ✓ check"
415 assert_eq!(subject, "fix: add \u{2713} check");
416 }
417
418 #[test]
419 fn parse_subject_mime_b_encoded() {
420 // RFC 2047 B-encoding: =?UTF-8?b?...?= (base64)
421 // "fix: résumé" base64 encoded
422 let patch = "\
423From abc123 Mon Sep 17 00:00:00 2001
424From: Joe <joe@example.com>
425Date: Thu, 1 Jan 1970 00:00:00 +0000
426Subject: [PATCH] =?UTF-8?b?Zml4OiByw6lzdW3DqQ==?=
427
428Body
429";
430 let subject = extract_subject(patch).unwrap();
431 // B-decoded: "fix: résumé"
432 assert_eq!(subject, "fix: r\u{e9}sum\u{e9}");
433 }
434
435 #[test]
372 fn parse_body() { 436 fn parse_body() {
373 let patch = sample_patch(); 437 let patch = sample_patch();
374 let body = extract_commit_message_body(&patch).unwrap(); 438 let body = extract_commit_message_body(&patch).unwrap();
@@ -395,6 +459,48 @@ diff --git a/file.txt b/file.txt
395 } 459 }
396 460
397 #[test] 461 #[test]
462 fn parse_body_stops_at_exact_email_sig_separator() {
463 // "-- " (dash dash space, nothing after) is the email sig separator.
464 let patch = "\
465From abc123 Mon Sep 17 00:00:00 2001
466From: Joe <joe@example.com>
467Date: Thu, 1 Jan 1970 00:00:00 +0000
468Subject: [PATCH] test
469
470This is the body.
471--
472libgit2 1.9.1
473
474diff --git a/file.txt b/file.txt
475";
476 let body = extract_commit_message_body(patch).unwrap();
477 assert_eq!(body, "This is the body.");
478 }
479
480 #[test]
481 fn parse_body_does_not_stop_at_double_dash_with_text() {
482 // "-- some text" must NOT be treated as an email sig separator.
483 let patch = "\
484From abc123 Mon Sep 17 00:00:00 2001
485From: Joe <joe@example.com>
486Date: Thu, 1 Jan 1970 00:00:00 +0000
487Subject: [PATCH] test
488
489This is the body.
490-- some CLI flag description
491More body text.
492
493---
494diff --git a/file.txt b/file.txt
495";
496 let body = extract_commit_message_body(patch).unwrap();
497 assert_eq!(
498 body,
499 "This is the body.\n-- some CLI flag description\nMore body text."
500 );
501 }
502
503 #[test]
398 fn parse_full_metadata() { 504 fn parse_full_metadata() {
399 let patch = sample_patch(); 505 let patch = sample_patch();
400 let metadata = parse_mbox_patch(&patch).unwrap(); 506 let metadata = parse_mbox_patch(&patch).unwrap();
@@ -407,6 +513,7 @@ diff --git a/file.txt b/file.txt
407 assert_eq!(metadata.author_email, "joe.bloggs@pm.me"); 513 assert_eq!(metadata.author_email, "joe.bloggs@pm.me");
408 assert_eq!(metadata.author_timestamp, 0); 514 assert_eq!(metadata.author_timestamp, 0);
409 assert_eq!(metadata.author_offset_minutes, 0); 515 assert_eq!(metadata.author_offset_minutes, 0);
516 assert_eq!(metadata.committer_timestamp, None);
410 assert_eq!(metadata.subject, "add t2.md"); 517 assert_eq!(metadata.subject, "add t2.md");
411 assert_eq!( 518 assert_eq!(
412 metadata.body, 519 metadata.body,