use anyhow::{Context, Result, bail}; use chrono::DateTime; use mailparse::{MailHeaderMap, parse_headers}; #[derive(Debug, Clone, PartialEq)] pub struct PatchMetadata { pub commit_id: String, pub author_name: String, pub author_email: String, pub author_timestamp: i64, pub author_offset_minutes: i32, pub committer_timestamp: Option, pub subject: String, pub body: String, } pub fn parse_mbox_patch(content: &str) -> Result { let commit_id = extract_commit_id_from_mbox(content)?; let (author_name, author_email) = extract_author_from_from_header(content)?; let (author_timestamp, author_offset_minutes) = extract_date_from_header(content)?; let committer_timestamp = None; let subject = extract_subject(content)?; let body = extract_commit_message_body(content)?; Ok(PatchMetadata { commit_id, author_name, author_email, author_timestamp, author_offset_minutes, committer_timestamp, subject, body, }) } fn extract_commit_id_from_mbox(content: &str) -> Result { if !content.starts_with("From ") { bail!("patch does not start with 'From ' - not a valid mbox format"); } let first_line = content.lines().next().context("patch content is empty")?; let parts: Vec<&str> = first_line.split_whitespace().collect(); if parts.len() < 2 { bail!("mbox 'From ' line does not contain a commit id"); } Ok(parts[1].to_string()) } /// Extract the header section from the mbox content (everything after the first /// line up to the first blank line that ends the headers). fn extract_header_section(content: &str) -> &str { // Skip the mbox envelope line ("From "), then pass the rest // to mailparse which understands where headers end. let after_envelope = content .find('\n') .map(|pos| &content[pos + 1..]) .unwrap_or(""); // Return only up to (and including) the blank line that terminates headers, // so mailparse doesn't try to parse the diff body. let header_end = after_envelope .find("\n\n") .map(|pos| pos + 2) .unwrap_or(after_envelope.len()); &after_envelope[..header_end] } fn extract_author_from_from_header(content: &str) -> Result<(String, String)> { let header_bytes = extract_header_section(content).as_bytes(); if let Ok((headers, _)) = parse_headers(header_bytes) { if let Some(from_value) = headers.get_first_value("From") { return parse_from_header_value(&from_value); } } // Fallback: manual search let from_line = content .lines() .find(|line| line.starts_with("From:")) .context("patch does not contain a 'From:' header")?; let from_value = from_line .strip_prefix("From:") .context("failed to strip 'From:' prefix")? .trim(); parse_from_header_value(from_value) } fn parse_from_header_value(value: &str) -> Result<(String, String)> { if let Some(start) = value.find('<') { if let Some(end) = value.find('>') { let email = value[start + 1..end].to_string(); let name_part = value[..start].trim(); let name = name_part.trim_matches('"').trim().to_string(); return Ok((name, email)); } } if value.contains('@') { let email = value.trim().to_string(); let name = email.split('@').next().unwrap_or("unknown").to_string(); return Ok((name, email)); } bail!("could not parse From header: {}", value) } fn extract_date_from_header(content: &str) -> Result<(i64, i32)> { let date_line = content .lines() .find(|line| line.starts_with("Date:")) .context("patch does not contain a 'Date:' header")?; let date_value = date_line .strip_prefix("Date:") .context("failed to strip 'Date:' prefix")? .trim(); parse_rfc2822_date(date_value) } fn parse_rfc2822_date(value: &str) -> Result<(i64, i32)> { let parsed = DateTime::parse_from_rfc2822(value) .context(format!("failed to parse RFC2822 date: {}", value))?; let timestamp = parsed.timestamp(); let offset_minutes = parsed.offset().local_minus_utc() / 60; Ok((timestamp, offset_minutes)) } fn extract_subject(content: &str) -> Result { // Use mailparse to handle RFC 2047 encoded-words and RFC 2822 header folding. let header_bytes = extract_header_section(content).as_bytes(); if let Ok((headers, _)) = parse_headers(header_bytes) { if let Some(subject_value) = headers.get_first_value("Subject") { return Ok(cleanup_subject(&subject_value)); } } // Fallback: manual single-line extraction. let subject_line = content .lines() .find(|line| line.starts_with("Subject:")) .context("patch does not contain a 'Subject:' header")?; let subject_value = subject_line .strip_prefix("Subject:") .context("failed to strip 'Subject:' prefix")? .trim(); Ok(cleanup_subject(subject_value)) } fn cleanup_subject(subject: &str) -> String { let mut result = subject.to_string(); loop { let trimmed = result.trim(); if trimmed.starts_with("Re:") || trimmed.starts_with("re:") { result = trimmed[3..].trim().to_string(); continue; } if let Some(stripped) = trimmed.strip_prefix(':') { result = stripped.trim().to_string(); continue; } if trimmed.starts_with('[') { if let Some(end) = trimmed.find(']') { result = trimmed[end + 1..].trim().to_string(); continue; } } break; } result } fn extract_commit_message_body(content: &str) -> Result { let mut in_body = false; let mut body_lines: Vec = Vec::new(); let mut found_first_content = false; for line in content.lines() { if !in_body { if line.is_empty() { in_body = true; } continue; } if line.starts_with("diff --git ") || line.starts_with("Index: ") || line.starts_with("--- ") || line.starts_with("From ") { break; } if line.starts_with("---") && line.trim().eq("---") { break; } // The email signature separator is exactly "-- " (dash dash space, nothing // after). Lines that merely start with "-- " followed by other text are // body content. if line == "-- " { break; } if !found_first_content && line.trim().is_empty() { continue; } found_first_content = true; body_lines.push(line.to_string()); } while body_lines.last().is_some_and(|l| l.trim().is_empty()) { body_lines.pop(); } Ok(body_lines.join("\n").trim().to_string()) } pub fn extract_description_from_patch(content: &str) -> Result { let subject = extract_subject(content)?; let body = extract_commit_message_body(content)?; if body.is_empty() { Ok(subject) } else { Ok(format!("{}\n\n{}", subject, body)) } } #[cfg(test)] mod tests { use super::*; fn sample_patch() -> String { "\ From 431b84edc0d2fa118d63faa3c2db9c73d630a5ae Mon Sep 17 00:00:00 2001 From: Joe Bloggs Date: Thu, 1 Jan 1970 00:00:00 +0000 Subject: [PATCH] add t2.md This is the commit message body. It can have multiple lines. --- t2.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 t2.md diff --git a/t2.md b/t2.md new file mode 100644 index 0000000..a66525d --- /dev/null +++ b/t2.md @@ -0,0 +1 @@ +some content1 \\ No newline at end of file -- libgit2 1.9.1 " .to_string() } #[test] fn parse_commit_id() { let patch = sample_patch(); let result = extract_commit_id_from_mbox(&patch).unwrap(); assert_eq!(result, "431b84edc0d2fa118d63faa3c2db9c73d630a5ae"); } #[test] fn parse_author() { let patch = sample_patch(); let (name, email) = extract_author_from_from_header(&patch).unwrap(); assert_eq!(name, "Joe Bloggs"); assert_eq!(email, "joe.bloggs@pm.me"); } #[test] fn parse_author_with_quoted_name() { let patch = "\ From abc123 Mon Sep 17 00:00:00 2001 From: \"John (nickname) Doe\" Date: Thu, 1 Jan 1970 00:00:00 +0000 Subject: test Body "; let (name, email) = extract_author_from_from_header(patch).unwrap(); assert_eq!(name, "John (nickname) Doe"); assert_eq!(email, "john.doe@example.com"); } #[test] fn parse_author_email_only() { let patch = "\ From abc123 Mon Sep 17 00:00:00 2001 From: john.doe@example.com Date: Thu, 1 Jan 1970 00:00:00 +0000 Subject: test Body "; let (name, email) = extract_author_from_from_header(patch).unwrap(); assert_eq!(name, "john.doe"); assert_eq!(email, "john.doe@example.com"); } #[test] fn parse_date() { let patch = sample_patch(); let (timestamp, offset) = extract_date_from_header(&patch).unwrap(); assert_eq!(timestamp, 0); assert_eq!(offset, 0); } #[test] fn parse_date_with_timezone() { let patch = "\ From abc123 Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 1 Jan 1970 00:00:00 +0500 Subject: test Body "; let (timestamp, offset) = extract_date_from_header(patch).unwrap(); assert_eq!(timestamp, -18000); assert_eq!(offset, 300); } #[test] fn parse_subject() { let patch = sample_patch(); let subject = extract_subject(&patch).unwrap(); assert_eq!(subject, "add t2.md"); } #[test] fn parse_subject_with_patch_prefix() { let patch = "\ From abc123 Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 1 Jan 1970 00:00:00 +0000 Subject: [PATCH v2 3/5] fix: important bug Body "; let subject = extract_subject(patch).unwrap(); assert_eq!(subject, "fix: important bug"); } #[test] fn parse_subject_with_re_prefix() { let patch = "\ From abc123 Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 1 Jan 1970 00:00:00 +0000 Subject: Re: [PATCH] fix: important bug Body "; let subject = extract_subject(patch).unwrap(); assert_eq!(subject, "fix: important bug"); } #[test] fn parse_subject_folded_rfc2822() { // RFC 2822 header folding: continuation lines start with whitespace. let patch = "\ From abc123 Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 1 Jan 1970 00:00:00 +0000 Subject: [PATCH] fix: this is a very long commit message subject line that has been folded across two lines by RFC 2822 rules Body "; let subject = extract_subject(patch).unwrap(); assert_eq!( subject, "fix: this is a very long commit message subject line that has been folded across two lines by RFC 2822 rules" ); } #[test] fn parse_subject_mime_q_encoded() { // RFC 2047 Q-encoding: =?UTF-8?q?...?= let patch = "\ From abc123 Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 1 Jan 1970 00:00:00 +0000 Subject: [PATCH] =?UTF-8?q?fix=3A_add_=E2=9C=93_check?= Body "; let subject = extract_subject(patch).unwrap(); // Q-decoded: "fix: add ✓ check" assert_eq!(subject, "fix: add \u{2713} check"); } #[test] fn parse_subject_mime_b_encoded() { // RFC 2047 B-encoding: =?UTF-8?b?...?= (base64) // "fix: résumé" base64 encoded let patch = "\ From abc123 Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 1 Jan 1970 00:00:00 +0000 Subject: [PATCH] =?UTF-8?b?Zml4OiByw6lzdW3DqQ==?= Body "; let subject = extract_subject(patch).unwrap(); // B-decoded: "fix: résumé" assert_eq!(subject, "fix: r\u{e9}sum\u{e9}"); } #[test] fn parse_body() { let patch = sample_patch(); let body = extract_commit_message_body(&patch).unwrap(); assert_eq!( body, "This is the commit message body.\n\nIt can have multiple lines." ); } #[test] fn parse_body_empty() { let patch = "\ From abc123 Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 1 Jan 1970 00:00:00 +0000 Subject: test --- file.txt | 1 + diff --git a/file.txt b/file.txt "; let body = extract_commit_message_body(patch).unwrap(); assert_eq!(body, ""); } #[test] fn parse_body_stops_at_exact_email_sig_separator() { // "-- " (dash dash space, nothing after) is the email sig separator. let patch = "\ From abc123 Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 1 Jan 1970 00:00:00 +0000 Subject: [PATCH] test This is the body. -- libgit2 1.9.1 diff --git a/file.txt b/file.txt "; let body = extract_commit_message_body(patch).unwrap(); assert_eq!(body, "This is the body."); } #[test] fn parse_body_does_not_stop_at_double_dash_with_text() { // "-- some text" must NOT be treated as an email sig separator. let patch = "\ From abc123 Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 1 Jan 1970 00:00:00 +0000 Subject: [PATCH] test This is the body. -- some CLI flag description More body text. --- diff --git a/file.txt b/file.txt "; let body = extract_commit_message_body(patch).unwrap(); assert_eq!( body, "This is the body.\n-- some CLI flag description\nMore body text." ); } #[test] fn parse_full_metadata() { let patch = sample_patch(); let metadata = parse_mbox_patch(&patch).unwrap(); assert_eq!( metadata.commit_id, "431b84edc0d2fa118d63faa3c2db9c73d630a5ae" ); assert_eq!(metadata.author_name, "Joe Bloggs"); assert_eq!(metadata.author_email, "joe.bloggs@pm.me"); assert_eq!(metadata.author_timestamp, 0); assert_eq!(metadata.author_offset_minutes, 0); assert_eq!(metadata.committer_timestamp, None); assert_eq!(metadata.subject, "add t2.md"); assert_eq!( metadata.body, "This is the commit message body.\n\nIt can have multiple lines." ); } #[test] fn extract_description_combines_subject_and_body() { let patch = sample_patch(); let description = extract_description_from_patch(&patch).unwrap(); assert_eq!( description, "add t2.md\n\nThis is the commit message body.\n\nIt can have multiple lines." ); } #[test] fn extract_description_subject_only() { let patch = "\ From abc123 Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 1 Jan 1970 00:00:00 +0000 Subject: [PATCH] simple fix --- file.txt | 1 + "; let description = extract_description_from_patch(patch).unwrap(); assert_eq!(description, "simple fix"); } #[test] fn cleanup_subject_strips_patch_prefixes() { assert_eq!(cleanup_subject("[PATCH] test"), "test"); assert_eq!(cleanup_subject("[PATCH v2] test"), "test"); assert_eq!(cleanup_subject("[PATCH 1/3] test"), "test"); assert_eq!(cleanup_subject("[PATCH v2 1/3] test"), "test"); assert_eq!(cleanup_subject("Re: [PATCH] test"), "test"); assert_eq!(cleanup_subject("re: test"), "test"); assert_eq!(cleanup_subject(":test"), "test"); } }