diff options
| -rw-r--r-- | Cargo.lock | 28 | ||||
| -rw-r--r-- | Cargo.toml | 1 | ||||
| -rw-r--r-- | src/lib/mbox_parser.rs | 163 |
3 files changed, 164 insertions, 28 deletions
| @@ -360,6 +360,16 @@ dependencies = [ | |||
| 360 | ] | 360 | ] |
| 361 | 361 | ||
| 362 | [[package]] | 362 | [[package]] |
| 363 | name = "charset" | ||
| 364 | version = "0.1.5" | ||
| 365 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 366 | checksum = "f1f927b07c74ba84c7e5fe4db2baeb3e996ab2688992e39ac68ce3220a677c7e" | ||
| 367 | dependencies = [ | ||
| 368 | "base64", | ||
| 369 | "encoding_rs", | ||
| 370 | ] | ||
| 371 | |||
| 372 | [[package]] | ||
| 363 | name = "chrono" | 373 | name = "chrono" |
| 364 | version = "0.4.43" | 374 | version = "0.4.43" |
| 365 | source = "registry+https://github.com/rust-lang/crates.io-index" | 375 | source = "registry+https://github.com/rust-lang/crates.io-index" |
| @@ -1430,6 +1440,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" | |||
| 1430 | checksum = "96051b46fc183dc9cd4a223960ef37b9af631b55191852a8274bfef064cda20f" | 1440 | checksum = "96051b46fc183dc9cd4a223960ef37b9af631b55191852a8274bfef064cda20f" |
| 1431 | 1441 | ||
| 1432 | [[package]] | 1442 | [[package]] |
| 1443 | name = "mailparse" | ||
| 1444 | version = "0.16.1" | ||
| 1445 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 1446 | checksum = "60819a97ddcb831a5614eb3b0174f3620e793e97e09195a395bfa948fd68ed2f" | ||
| 1447 | dependencies = [ | ||
| 1448 | "charset", | ||
| 1449 | "data-encoding", | ||
| 1450 | "quoted_printable", | ||
| 1451 | ] | ||
| 1452 | |||
| 1453 | [[package]] | ||
| 1433 | name = "memchr" | 1454 | name = "memchr" |
| 1434 | version = "2.7.6" | 1455 | version = "2.7.6" |
| 1435 | source = "registry+https://github.com/rust-lang/crates.io-index" | 1456 | source = "registry+https://github.com/rust-lang/crates.io-index" |
| @@ -1527,6 +1548,7 @@ dependencies = [ | |||
| 1527 | "futures", | 1548 | "futures", |
| 1528 | "git2", | 1549 | "git2", |
| 1529 | "indicatif", | 1550 | "indicatif", |
| 1551 | "mailparse", | ||
| 1530 | "mockall", | 1552 | "mockall", |
| 1531 | "nostr", | 1553 | "nostr", |
| 1532 | "nostr-connect", | 1554 | "nostr-connect", |
| @@ -2006,6 +2028,12 @@ dependencies = [ | |||
| 2006 | ] | 2028 | ] |
| 2007 | 2029 | ||
| 2008 | [[package]] | 2030 | [[package]] |
| 2031 | name = "quoted_printable" | ||
| 2032 | version = "0.5.1" | ||
| 2033 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| 2034 | checksum = "640c9bd8497b02465aeef5375144c26062e0dcd5939dfcbb0f5db76cb8c17c73" | ||
| 2035 | |||
| 2036 | [[package]] | ||
| 2009 | name = "r-efi" | 2037 | name = "r-efi" |
| 2010 | version = "5.3.0" | 2038 | version = "5.3.0" |
| 2011 | source = "registry+https://github.com/rust-lang/crates.io-index" | 2039 | source = "registry+https://github.com/rust-lang/crates.io-index" |
| @@ -24,6 +24,7 @@ directories = "6.0.0" | |||
| 24 | futures = "0.3.31" | 24 | futures = "0.3.31" |
| 25 | git2 = "0.20.2" | 25 | git2 = "0.20.2" |
| 26 | indicatif = "0.18.0" | 26 | indicatif = "0.18.0" |
| 27 | mailparse = "0.16.1" | ||
| 27 | nostr = { version = "0.44.1", features = ["nip49"] } | 28 | nostr = { version = "0.44.1", features = ["nip49"] } |
| 28 | nostr-connect = "0.44.0" | 29 | nostr-connect = "0.44.0" |
| 29 | nostr-database = "0.44.0" | 30 | nostr-database = "0.44.0" |
diff --git a/src/lib/mbox_parser.rs b/src/lib/mbox_parser.rs index 40603b1..fd2f8ed 100644 --- a/src/lib/mbox_parser.rs +++ b/src/lib/mbox_parser.rs | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | use anyhow::{Context, Result, bail}; | 1 | use anyhow::{Context, Result, bail}; |
| 2 | use chrono::{DateTime, Datelike}; | 2 | use chrono::DateTime; |
| 3 | use mailparse::{MailHeaderMap, parse_headers}; | ||
| 3 | 4 | ||
| 4 | #[derive(Debug, Clone, PartialEq)] | 5 | #[derive(Debug, Clone, PartialEq)] |
| 5 | pub struct PatchMetadata { | 6 | pub struct PatchMetadata { |
| @@ -17,7 +18,7 @@ pub fn parse_mbox_patch(content: &str) -> Result<PatchMetadata> { | |||
| 17 | let commit_id = extract_commit_id_from_mbox(content)?; | 18 | let commit_id = extract_commit_id_from_mbox(content)?; |
| 18 | let (author_name, author_email) = extract_author_from_from_header(content)?; | 19 | let (author_name, author_email) = extract_author_from_from_header(content)?; |
| 19 | let (author_timestamp, author_offset_minutes) = extract_date_from_header(content)?; | 20 | let (author_timestamp, author_offset_minutes) = extract_date_from_header(content)?; |
| 20 | let committer_timestamp = extract_committer_date_from_mbox(content)?; | 21 | let committer_timestamp = None; |
| 21 | let subject = extract_subject(content)?; | 22 | let subject = extract_subject(content)?; |
| 22 | let body = extract_commit_message_body(content)?; | 23 | let body = extract_commit_message_body(content)?; |
| 23 | 24 | ||
| @@ -48,7 +49,33 @@ fn extract_commit_id_from_mbox(content: &str) -> Result<String> { | |||
| 48 | Ok(parts[1].to_string()) | 49 | Ok(parts[1].to_string()) |
| 49 | } | 50 | } |
| 50 | 51 | ||
/// Return the header block of an mbox-formatted patch.
///
/// The mbox envelope line (`From <sha> <date>`, i.e. everything up to and
/// including the first newline) is dropped. The returned slice then runs up to
/// and including the first empty line, which terminates the headers. Both LF
/// and CRLF line endings are recognised.
///
/// * If `content` contains no newline at all, an empty string is returned.
/// * If no empty line follows the envelope, everything after the envelope is
///   returned.
/// * If the line directly after the envelope is already empty, only that
///   empty line is returned; body text is never reported as headers.
fn extract_header_section(content: &str) -> &str {
    // Skip the mbox envelope line; without a newline there is nothing after it.
    let after_envelope = content
        .split_once('\n')
        .map_or("", |(_envelope, rest)| rest);

    // Walk line by line (keeping the terminators) and stop right after the
    // first empty line, so the header parser is never handed the diff body.
    // Summing whole-line lengths keeps `header_end` on a char boundary.
    let mut header_end = 0;
    for line in after_envelope.split_inclusive('\n') {
        header_end += line.len();
        if matches!(line, "\n" | "\r\n") {
            break;
        }
    }

    &after_envelope[..header_end]
}
| 69 | |||
| 51 | fn extract_author_from_from_header(content: &str) -> Result<(String, String)> { | 70 | fn extract_author_from_from_header(content: &str) -> Result<(String, String)> { |
| 71 | let header_bytes = extract_header_section(content).as_bytes(); | ||
| 72 | if let Ok((headers, _)) = parse_headers(header_bytes) { | ||
| 73 | if let Some(from_value) = headers.get_first_value("From") { | ||
| 74 | return parse_from_header_value(&from_value); | ||
| 75 | } | ||
| 76 | } | ||
| 77 | |||
| 78 | // Fallback: manual search | ||
| 52 | let from_line = content | 79 | let from_line = content |
| 53 | .lines() | 80 | .lines() |
| 54 | .find(|line| line.starts_with("From:")) | 81 | .find(|line| line.starts_with("From:")) |
| @@ -105,34 +132,16 @@ fn parse_rfc2822_date(value: &str) -> Result<(i64, i32)> { | |||
| 105 | Ok((timestamp, offset_minutes)) | 132 | Ok((timestamp, offset_minutes)) |
| 106 | } | 133 | } |
| 107 | 134 | ||
| 108 | fn extract_committer_date_from_mbox(content: &str) -> Result<Option<i64>> { | 135 | fn extract_subject(content: &str) -> Result<String> { |
| 109 | let first_line = content.lines().next().context("patch content is empty")?; | 136 | // Use mailparse to handle RFC 2047 encoded-words and RFC 2822 header folding. |
| 110 | 137 | let header_bytes = extract_header_section(content).as_bytes(); | |
| 111 | let parts: Vec<&str> = first_line.split_whitespace().collect(); | 138 | if let Ok((headers, _)) = parse_headers(header_bytes) { |
| 112 | 139 | if let Some(subject_value) = headers.get_first_value("Subject") { | |
| 113 | if parts.len() >= 6 { | 140 | return Ok(cleanup_subject(&subject_value)); |
| 114 | let date_str = parts[3..6].join(" "); | ||
| 115 | if let Ok(dt) = DateTime::parse_from_rfc2822(&date_str) { | ||
| 116 | return Ok(Some(dt.timestamp())); | ||
| 117 | } | ||
| 118 | } | ||
| 119 | |||
| 120 | if parts.len() >= 7 { | ||
| 121 | let date_str = format!("{} {} {}", parts[3], parts[4], parts[5]); | ||
| 122 | if let Ok(dt) = chrono::DateTime::parse_from_str(&date_str, "%a %b %d") { | ||
| 123 | if let Ok(year) = parts[6].parse::<i32>() { | ||
| 124 | let with_year = dt.with_year(year); | ||
| 125 | if let Some(dt_with_year) = with_year { | ||
| 126 | return Ok(Some(dt_with_year.timestamp())); | ||
| 127 | } | ||
| 128 | } | ||
| 129 | } | 141 | } |
| 130 | } | 142 | } |
| 131 | 143 | ||
| 132 | Ok(None) | 144 | // Fallback: manual single-line extraction. |
| 133 | } | ||
| 134 | |||
| 135 | fn extract_subject(content: &str) -> Result<String> { | ||
| 136 | let subject_line = content | 145 | let subject_line = content |
| 137 | .lines() | 146 | .lines() |
| 138 | .find(|line| line.starts_with("Subject:")) | 147 | .find(|line| line.starts_with("Subject:")) |
| @@ -200,7 +209,10 @@ fn extract_commit_message_body(content: &str) -> Result<String> { | |||
| 200 | break; | 209 | break; |
| 201 | } | 210 | } |
| 202 | 211 | ||
| 203 | if line.starts_with("-- ") || line.starts_with("--\n") { | 212 | // The email signature separator is exactly "-- " (dash dash space, nothing |
| 213 | // after). Lines that merely start with "-- " followed by other text are | ||
| 214 | // body content. | ||
| 215 | if line == "-- " { | ||
| 204 | break; | 216 | break; |
| 205 | } | 217 | } |
| 206 | 218 | ||
| @@ -369,6 +381,58 @@ Body | |||
| 369 | } | 381 | } |
| 370 | 382 | ||
#[test]
fn parse_subject_folded_rfc2822() {
    // RFC 2822 header folding: a continuation line starts with whitespace.
    // The fixture is written with explicit `\n` escapes so the leading space
    // on the continuation line is visible in the source and cannot be lost to
    // whitespace clean-up.
    let patch = concat!(
        "From abc123 Mon Sep 17 00:00:00 2001\n",
        "From: Joe <joe@example.com>\n",
        "Date: Thu, 1 Jan 1970 00:00:00 +0000\n",
        "Subject: [PATCH] fix: this is a very long commit message subject line\n",
        " that has been folded across two lines by RFC 2822 rules\n",
        "\n",
        "Body\n",
    );
    let subject = extract_subject(patch).unwrap();
    // The folded header must come back as one logical line.
    assert_eq!(
        subject,
        "fix: this is a very long commit message subject line that has been folded across two lines by RFC 2822 rules"
    );
}
| 401 | |||
#[test]
fn parse_subject_mime_q_encoded() {
    // The subject is an RFC 2047 Q-encoded word (`=?UTF-8?q?...?=`) that
    // decodes to "fix: add ✓ check".
    let fixture = concat!(
        "From abc123 Mon Sep 17 00:00:00 2001\n",
        "From: Joe <joe@example.com>\n",
        "Date: Thu, 1 Jan 1970 00:00:00 +0000\n",
        "Subject: [PATCH] =?UTF-8?q?fix=3A_add_=E2=9C=93_check?=\n",
        "\n",
        "Body\n",
    );
    let decoded = extract_subject(fixture).unwrap();
    assert_eq!(decoded, "fix: add \u{2713} check");
}
| 417 | |||
#[test]
fn parse_subject_mime_b_encoded() {
    // The subject is an RFC 2047 B-encoded (base64) word (`=?UTF-8?b?...?=`)
    // whose payload is "fix: résumé".
    let fixture = concat!(
        "From abc123 Mon Sep 17 00:00:00 2001\n",
        "From: Joe <joe@example.com>\n",
        "Date: Thu, 1 Jan 1970 00:00:00 +0000\n",
        "Subject: [PATCH] =?UTF-8?b?Zml4OiByw6lzdW3DqQ==?=\n",
        "\n",
        "Body\n",
    );
    let decoded = extract_subject(fixture).unwrap();
    assert_eq!(decoded, "fix: r\u{e9}sum\u{e9}");
}
| 434 | |||
| 435 | #[test] | ||
| 372 | fn parse_body() { | 436 | fn parse_body() { |
| 373 | let patch = sample_patch(); | 437 | let patch = sample_patch(); |
| 374 | let body = extract_commit_message_body(&patch).unwrap(); | 438 | let body = extract_commit_message_body(&patch).unwrap(); |
| @@ -395,6 +459,48 @@ diff --git a/file.txt b/file.txt | |||
| 395 | } | 459 | } |
| 396 | 460 | ||
#[test]
fn parse_body_stops_at_exact_email_sig_separator() {
    // "-- " (dash dash space, nothing after) is the email sig separator.
    // The trailing space is significant, so the fixture uses explicit `\n`
    // escapes: a raw multi-line literal would hide it and any tool that trims
    // trailing whitespace would silently turn the line into "--".
    let patch = concat!(
        "From abc123 Mon Sep 17 00:00:00 2001\n",
        "From: Joe <joe@example.com>\n",
        "Date: Thu, 1 Jan 1970 00:00:00 +0000\n",
        "Subject: [PATCH] test\n",
        "\n",
        "This is the body.\n",
        "-- \n",
        "libgit2 1.9.1\n",
        "\n",
        "diff --git a/file.txt b/file.txt\n",
    );
    let body = extract_commit_message_body(patch).unwrap();
    // Everything from the separator onwards is excluded from the body.
    assert_eq!(body, "This is the body.");
}
| 479 | |||
#[test]
fn parse_body_does_not_stop_at_double_dash_with_text() {
    // A line that begins with "-- " but carries further text is ordinary body
    // content and must NOT be treated as the email sig separator.
    let fixture = concat!(
        "From abc123 Mon Sep 17 00:00:00 2001\n",
        "From: Joe <joe@example.com>\n",
        "Date: Thu, 1 Jan 1970 00:00:00 +0000\n",
        "Subject: [PATCH] test\n",
        "\n",
        "This is the body.\n",
        "-- some CLI flag description\n",
        "More body text.\n",
        "\n",
        "---\n",
        "diff --git a/file.txt b/file.txt\n",
    );
    let expected = "This is the body.\n-- some CLI flag description\nMore body text.";
    let parsed = extract_commit_message_body(fixture).unwrap();
    assert_eq!(parsed, expected);
}
| 502 | |||
| 503 | #[test] | ||
| 398 | fn parse_full_metadata() { | 504 | fn parse_full_metadata() { |
| 399 | let patch = sample_patch(); | 505 | let patch = sample_patch(); |
| 400 | let metadata = parse_mbox_patch(&patch).unwrap(); | 506 | let metadata = parse_mbox_patch(&patch).unwrap(); |
| @@ -407,6 +513,7 @@ diff --git a/file.txt b/file.txt | |||
| 407 | assert_eq!(metadata.author_email, "joe.bloggs@pm.me"); | 513 | assert_eq!(metadata.author_email, "joe.bloggs@pm.me"); |
| 408 | assert_eq!(metadata.author_timestamp, 0); | 514 | assert_eq!(metadata.author_timestamp, 0); |
| 409 | assert_eq!(metadata.author_offset_minutes, 0); | 515 | assert_eq!(metadata.author_offset_minutes, 0); |
| 516 | assert_eq!(metadata.committer_timestamp, None); | ||
| 410 | assert_eq!(metadata.subject, "add t2.md"); | 517 | assert_eq!(metadata.subject, "add t2.md"); |
| 411 | assert_eq!( | 518 | assert_eq!( |
| 412 | metadata.body, | 519 | metadata.body, |