diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-07 17:13:17 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-07 17:13:17 +0000 |
| commit | b71111cc25b99acab786ece4607cb60e9cbebae4 (patch) | |
| tree | 71ef2669be50ccbdf61c34dff93a1c819867f83c /src/purgatory/sync | |
| parent | e557d07ddbb1ea1a8ca6604f9ba945f359f54ce7 (diff) | |
feat(sync): extract clone URLs from PR events in purgatory
Add support for extracting clone URLs from PR/PR-Update events (kind 1618/1619)
during purgatory sync, per NIP-34 specification. This enables fetching PR commits
from URLs specified in the PR event itself, not just from repository announcement
clone URLs.
Changes:
- Add collect_pr_clone_urls() to SyncContext trait
- Implement in RealSyncContext: extract clone tags from PR events in purgatory
- Implement in MockSyncContext: configurable PR clone URLs for testing
- Update sync_identifier_next_url to merge PR clone URLs with announcement URLs
- Update get_throttled_domains_with_untried_urls with same merge logic
- Add unit tests for PR clone URL extraction and filtering
Diffstat (limited to 'src/purgatory/sync')
| -rw-r--r-- | src/purgatory/sync/context.rs | 77 | ||||
| -rw-r--r-- | src/purgatory/sync/functions.rs | 206 |
2 files changed, 278 insertions, 5 deletions
diff --git a/src/purgatory/sync/context.rs b/src/purgatory/sync/context.rs index e97b708..2922f10 100644 --- a/src/purgatory/sync/context.rs +++ b/src/purgatory/sync/context.rs | |||
| @@ -63,6 +63,18 @@ impl ProcessResult { | |||
| 63 | /// with mocks. | 63 | /// with mocks. |
| 64 | #[async_trait] | 64 | #[async_trait] |
| 65 | pub trait SyncContext: Send + Sync { | 65 | pub trait SyncContext: Send + Sync { |
| 66 | /// Collect clone URLs from PR events in purgatory for a given identifier. | ||
| 67 | /// | ||
| 68 | /// PR events (kind 1618) and PR Update events (kind 1619) can include `clone` tags | ||
| 69 | /// specifying where the PR commits can be fetched from. This method extracts those | ||
| 70 | /// URLs to supplement the clone URLs from repository announcements. | ||
| 71 | /// | ||
| 72 | /// # Arguments | ||
| 73 | /// * `identifier` - The repository identifier | ||
| 74 | /// | ||
| 75 | /// # Returns | ||
| 76 | /// Set of clone URLs from PR events in purgatory for this identifier | ||
| 77 | fn collect_pr_clone_urls(&self, identifier: &str) -> HashSet<String>; | ||
| 66 | /// Get repository data (announcements, clone URLs, etc.) from the database. | 78 | /// Get repository data (announcements, clone URLs, etc.) from the database. |
| 67 | /// | 79 | /// |
| 68 | /// # Arguments | 80 | /// # Arguments |
| @@ -232,6 +244,30 @@ impl RealSyncContext { | |||
| 232 | 244 | ||
| 233 | #[async_trait] | 245 | #[async_trait] |
| 234 | impl SyncContext for RealSyncContext { | 246 | impl SyncContext for RealSyncContext { |
| 247 | fn collect_pr_clone_urls(&self, identifier: &str) -> HashSet<String> { | ||
| 248 | let mut urls = HashSet::new(); | ||
| 249 | |||
| 250 | for entry in self.purgatory.find_prs_for_identifier(identifier) { | ||
| 251 | if let Some(ref event) = entry.event { | ||
| 252 | for tag in event.tags.iter() { | ||
| 253 | let tag_vec = tag.clone().to_vec(); | ||
| 254 | if tag_vec.len() >= 2 && tag_vec[0] == "clone" { | ||
| 255 | // Clone tags can have multiple URLs: ["clone", "url1", "url2", ...] | ||
| 256 | urls.extend(tag_vec[1..].iter().cloned()); | ||
| 257 | } | ||
| 258 | } | ||
| 259 | } | ||
| 260 | } | ||
| 261 | |||
| 262 | debug!( | ||
| 263 | identifier = %identifier, | ||
| 264 | pr_clone_urls_count = urls.len(), | ||
| 265 | "Collected clone URLs from PR events in purgatory" | ||
| 266 | ); | ||
| 267 | |||
| 268 | urls | ||
| 269 | } | ||
| 270 | |||
| 235 | async fn fetch_repository_data(&self, identifier: &str) -> Result<RepositoryData> { | 271 | async fn fetch_repository_data(&self, identifier: &str) -> Result<RepositoryData> { |
| 236 | crate::git::authorization::fetch_repository_data(&self.database, identifier).await | 272 | crate::git::authorization::fetch_repository_data(&self.database, identifier).await |
| 237 | } | 273 | } |
| @@ -450,9 +486,12 @@ pub mod mock { | |||
| 450 | /// Repository data to return from fetch_repository_data | 486 | /// Repository data to return from fetch_repository_data |
| 451 | repo_data: RwLock<Option<RepositoryData>>, | 487 | repo_data: RwLock<Option<RepositoryData>>, |
| 452 | 488 | ||
| 453 | /// Clone URLs available for the repository | 489 | /// Clone URLs available for the repository (from announcements) |
| 454 | clone_urls: Vec<String>, | 490 | clone_urls: Vec<String>, |
| 455 | 491 | ||
| 492 | /// Clone URLs from PR events in purgatory | ||
| 493 | pr_clone_urls: HashSet<String>, | ||
| 494 | |||
| 456 | /// OIDs still needed (decremented when "fetched") | 495 | /// OIDs still needed (decremented when "fetched") |
| 457 | needed_oids: RwLock<HashSet<String>>, | 496 | needed_oids: RwLock<HashSet<String>>, |
| 458 | 497 | ||
| @@ -490,6 +529,7 @@ pub mod mock { | |||
| 490 | Self { | 529 | Self { |
| 491 | repo_data: RwLock::new(None), | 530 | repo_data: RwLock::new(None), |
| 492 | clone_urls: Vec::new(), | 531 | clone_urls: Vec::new(), |
| 532 | pr_clone_urls: HashSet::new(), | ||
| 493 | needed_oids: RwLock::new(HashSet::new()), | 533 | needed_oids: RwLock::new(HashSet::new()), |
| 494 | url_provides_oids: HashMap::new(), | 534 | url_provides_oids: HashMap::new(), |
| 495 | fetch_log: RwLock::new(Vec::new()), | 535 | fetch_log: RwLock::new(Vec::new()), |
| @@ -501,12 +541,18 @@ pub mod mock { | |||
| 501 | } | 541 | } |
| 502 | } | 542 | } |
| 503 | 543 | ||
| 504 | /// Configure clone URLs for the repository. | 544 | /// Configure clone URLs for the repository (from announcements). |
| 505 | pub fn with_urls(mut self, urls: &[&str]) -> Self { | 545 | pub fn with_urls(mut self, urls: &[&str]) -> Self { |
| 506 | self.clone_urls = urls.iter().map(|s| s.to_string()).collect(); | 546 | self.clone_urls = urls.iter().map(|s| s.to_string()).collect(); |
| 507 | self | 547 | self |
| 508 | } | 548 | } |
| 509 | 549 | ||
| 550 | /// Configure clone URLs from PR events in purgatory. | ||
| 551 | pub fn with_pr_clone_urls(mut self, urls: &[&str]) -> Self { | ||
| 552 | self.pr_clone_urls = urls.iter().map(|s| s.to_string()).collect(); | ||
| 553 | self | ||
| 554 | } | ||
| 555 | |||
| 510 | /// Configure OIDs that are still needed. | 556 | /// Configure OIDs that are still needed. |
| 511 | pub fn with_needed_oids(self, oids: &[&str]) -> Self { | 557 | pub fn with_needed_oids(self, oids: &[&str]) -> Self { |
| 512 | *self.needed_oids.write().unwrap() = oids.iter().map(|s| s.to_string()).collect(); | 558 | *self.needed_oids.write().unwrap() = oids.iter().map(|s| s.to_string()).collect(); |
| @@ -580,6 +626,10 @@ pub mod mock { | |||
| 580 | 626 | ||
| 581 | #[async_trait] | 627 | #[async_trait] |
| 582 | impl SyncContext for MockSyncContext { | 628 | impl SyncContext for MockSyncContext { |
| 629 | fn collect_pr_clone_urls(&self, _identifier: &str) -> HashSet<String> { | ||
| 630 | self.pr_clone_urls.clone() | ||
| 631 | } | ||
| 632 | |||
| 583 | async fn fetch_repository_data(&self, _identifier: &str) -> Result<RepositoryData> { | 633 | async fn fetch_repository_data(&self, _identifier: &str) -> Result<RepositoryData> { |
| 584 | // Return stored repo_data or create a minimal one with clone URLs | 634 | // Return stored repo_data or create a minimal one with clone URLs |
| 585 | if let Some(data) = self.repo_data.read().unwrap().as_ref() { | 635 | if let Some(data) = self.repo_data.read().unwrap().as_ref() { |
| @@ -791,5 +841,28 @@ pub mod mock { | |||
| 791 | mock.set_pending_events(false); | 841 | mock.set_pending_events(false); |
| 792 | assert!(!mock.has_pending_events("test-repo")); | 842 | assert!(!mock.has_pending_events("test-repo")); |
| 793 | } | 843 | } |
| 844 | |||
| 845 | #[test] | ||
| 846 | fn mock_collect_pr_clone_urls_returns_configured_urls() { | ||
| 847 | let mock = MockSyncContext::new().with_pr_clone_urls(&[ | ||
| 848 | "https://fork-server.com/repo.git", | ||
| 849 | "https://another-fork.com/repo.git", | ||
| 850 | ]); | ||
| 851 | |||
| 852 | let urls = mock.collect_pr_clone_urls("any-identifier"); | ||
| 853 | |||
| 854 | assert_eq!(urls.len(), 2); | ||
| 855 | assert!(urls.contains("https://fork-server.com/repo.git")); | ||
| 856 | assert!(urls.contains("https://another-fork.com/repo.git")); | ||
| 857 | } | ||
| 858 | |||
| 859 | #[test] | ||
| 860 | fn mock_collect_pr_clone_urls_empty_by_default() { | ||
| 861 | let mock = MockSyncContext::new(); | ||
| 862 | |||
| 863 | let urls = mock.collect_pr_clone_urls("any-identifier"); | ||
| 864 | |||
| 865 | assert!(urls.is_empty()); | ||
| 866 | } | ||
| 794 | } | 867 | } |
| 795 | } | 868 | } |
diff --git a/src/purgatory/sync/functions.rs b/src/purgatory/sync/functions.rs index 13b2e47..1b9518d 100644 --- a/src/purgatory/sync/functions.rs +++ b/src/purgatory/sync/functions.rs | |||
| @@ -110,13 +110,24 @@ pub async fn sync_identifier_next_url<C: SyncContext + ?Sized>( | |||
| 110 | } | 110 | } |
| 111 | }; | 111 | }; |
| 112 | 112 | ||
| 113 | // 4. Collect clone URLs, excluding our domain | 113 | // 4. Collect clone URLs from announcements AND PR events in purgatory |
| 114 | let our_domain = ctx.our_domain(); | 114 | let our_domain = ctx.our_domain(); |
| 115 | let all_urls: HashSet<String> = repo_data | 115 | |
| 116 | // Get clone URLs from repository announcements | ||
| 117 | let announcement_urls: HashSet<String> = repo_data | ||
| 116 | .announcements | 118 | .announcements |
| 117 | .iter() | 119 | .iter() |
| 118 | .flat_map(|a| a.clone_urls.iter().cloned()) | 120 | .flat_map(|a| a.clone_urls.iter().cloned()) |
| 121 | .collect(); | ||
| 122 | |||
| 123 | // Get clone URLs from PR events in purgatory | ||
| 124 | let pr_urls = ctx.collect_pr_clone_urls(identifier); | ||
| 125 | |||
| 126 | // Merge and filter out our domain | ||
| 127 | let all_urls: HashSet<String> = announcement_urls | ||
| 128 | .union(&pr_urls) | ||
| 119 | .filter(|url| our_domain.map_or(true, |d| !url.contains(d))) | 129 | .filter(|url| our_domain.map_or(true, |d| !url.contains(d))) |
| 130 | .cloned() | ||
| 120 | .collect(); | 131 | .collect(); |
| 121 | 132 | ||
| 122 | if all_urls.is_empty() { | 133 | if all_urls.is_empty() { |
| @@ -206,11 +217,22 @@ pub async fn get_throttled_domains_with_untried_urls<C: SyncContext + ?Sized>( | |||
| 206 | }; | 217 | }; |
| 207 | 218 | ||
| 208 | let our_domain = ctx.our_domain(); | 219 | let our_domain = ctx.our_domain(); |
| 209 | let all_urls: HashSet<String> = repo_data | 220 | |
| 221 | // Get clone URLs from repository announcements | ||
| 222 | let announcement_urls: HashSet<String> = repo_data | ||
| 210 | .announcements | 223 | .announcements |
| 211 | .iter() | 224 | .iter() |
| 212 | .flat_map(|a| a.clone_urls.iter().cloned()) | 225 | .flat_map(|a| a.clone_urls.iter().cloned()) |
| 226 | .collect(); | ||
| 227 | |||
| 228 | // Get clone URLs from PR events in purgatory | ||
| 229 | let pr_urls = ctx.collect_pr_clone_urls(identifier); | ||
| 230 | |||
| 231 | // Merge and filter out our domain | ||
| 232 | let all_urls: HashSet<String> = announcement_urls | ||
| 233 | .union(&pr_urls) | ||
| 213 | .filter(|url| our_domain.map_or(true, |d| !url.contains(d))) | 234 | .filter(|url| our_domain.map_or(true, |d| !url.contains(d))) |
| 235 | .cloned() | ||
| 214 | .collect(); | 236 | .collect(); |
| 215 | 237 | ||
| 216 | let urls_by_domain: HashMap<String, Vec<String>> = | 238 | let urls_by_domain: HashMap<String, Vec<String>> = |
| @@ -855,4 +877,182 @@ mod tests { | |||
| 855 | "Expected github.com to still be throttled" | 877 | "Expected github.com to still be throttled" |
| 856 | ); | 878 | ); |
| 857 | } | 879 | } |
| 880 | |||
| 881 | // ========================================================================= | ||
| 882 | // PR Clone URL Tests | ||
| 883 | // ========================================================================= | ||
| 884 | |||
| 885 | #[tokio::test] | ||
| 886 | async fn test_collect_pr_clone_urls_returns_configured_urls() { | ||
| 887 | // Test that MockSyncContext returns configured PR clone URLs | ||
| 888 | let mock = MockSyncContext::new() | ||
| 889 | .with_pr_clone_urls(&[ | ||
| 890 | "https://pr-server.com/fork.git", | ||
| 891 | "https://another-server.com/fork.git", | ||
| 892 | ]); | ||
| 893 | |||
| 894 | let pr_urls = mock.collect_pr_clone_urls("test-repo"); | ||
| 895 | |||
| 896 | assert_eq!(pr_urls.len(), 2); | ||
| 897 | assert!(pr_urls.contains("https://pr-server.com/fork.git")); | ||
| 898 | assert!(pr_urls.contains("https://another-server.com/fork.git")); | ||
| 899 | } | ||
| 900 | |||
| 901 | #[tokio::test] | ||
| 902 | async fn test_sync_identifier_next_url_includes_pr_clone_urls() { | ||
| 903 | // Set up mock with announcement URLs and PR clone URLs | ||
| 904 | let mock = MockSyncContext::new() | ||
| 905 | .with_urls(&["https://github.com/owner/repo.git"]) // From announcement | ||
| 906 | .with_pr_clone_urls(&["https://pr-author.com/fork.git"]) // From PR event | ||
| 907 | .with_needed_oids(&["abc123"]) | ||
| 908 | .with_pending_events(true); | ||
| 909 | |||
| 910 | let throttle_manager = ThrottleManager::new(5, 100); | ||
| 911 | let tried_urls = HashSet::new(); | ||
| 912 | |||
| 913 | // Get first URL | ||
| 914 | let first_url = | ||
| 915 | sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager) | ||
| 916 | .await | ||
| 917 | .expect("Should return a URL"); | ||
| 918 | |||
| 919 | // Try the first URL | ||
| 920 | let mut tried = HashSet::new(); | ||
| 921 | tried.insert(first_url.clone()); | ||
| 922 | |||
| 923 | // Get second URL | ||
| 924 | let second_url = | ||
| 925 | sync_identifier_next_url(&mock, "test-repo", None, &tried, &throttle_manager) | ||
| 926 | .await | ||
| 927 | .expect("Should return a second URL"); | ||
| 928 | |||
| 929 | // Both URLs should be available (one from announcement, one from PR) | ||
| 930 | let both_urls = vec![first_url, second_url]; | ||
| 931 | assert!( | ||
| 932 | both_urls.iter().any(|u| u.contains("github.com")), | ||
| 933 | "Should include announcement URL" | ||
| 934 | ); | ||
| 935 | assert!( | ||
| 936 | both_urls.iter().any(|u| u.contains("pr-author.com")), | ||
| 937 | "Should include PR clone URL" | ||
| 938 | ); | ||
| 939 | } | ||
| 940 | |||
| 941 | #[tokio::test] | ||
| 942 | async fn test_pr_clone_urls_filtered_by_our_domain() { | ||
| 943 | // Set up mock with PR clone URL pointing to our domain | ||
| 944 | let mock = MockSyncContext::new() | ||
| 945 | .with_urls(&["https://github.com/owner/repo.git"]) | ||
| 946 | .with_pr_clone_urls(&[ | ||
| 947 | "https://our-relay.com/fork.git", // Should be filtered | ||
| 948 | "https://external.com/fork.git", // Should be included | ||
| 949 | ]) | ||
| 950 | .with_our_domain("our-relay.com") | ||
| 951 | .with_needed_oids(&["abc123"]) | ||
| 952 | .with_pending_events(true); | ||
| 953 | |||
| 954 | let throttle_manager = ThrottleManager::new(5, 100); | ||
| 955 | let mut tried_urls = HashSet::new(); | ||
| 956 | |||
| 957 | // Collect all available URLs | ||
| 958 | let mut available_urls = Vec::new(); | ||
| 959 | while let Some(url) = | ||
| 960 | sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager) | ||
| 961 | .await | ||
| 962 | { | ||
| 963 | available_urls.push(url.clone()); | ||
| 964 | tried_urls.insert(url); | ||
| 965 | } | ||
| 966 | |||
| 967 | // Should have 2 URLs (github.com and external.com), not 3 | ||
| 968 | assert_eq!( | ||
| 969 | available_urls.len(), | ||
| 970 | 2, | ||
| 971 | "Expected 2 URLs after filtering our domain, got: {:?}", | ||
| 972 | available_urls | ||
| 973 | ); | ||
| 974 | |||
| 975 | // our-relay.com should be filtered out | ||
| 976 | assert!( | ||
| 977 | !available_urls.iter().any(|u| u.contains("our-relay.com")), | ||
| 978 | "Our domain should be filtered out" | ||
| 979 | ); | ||
| 980 | |||
| 981 | // github.com and external.com should be present | ||
| 982 | assert!( | ||
| 983 | available_urls.iter().any(|u| u.contains("github.com")), | ||
| 984 | "github.com should be present" | ||
| 985 | ); | ||
| 986 | assert!( | ||
| 987 | available_urls.iter().any(|u| u.contains("external.com")), | ||
| 988 | "external.com should be present" | ||
| 989 | ); | ||
| 990 | } | ||
| 991 | |||
| 992 | #[tokio::test] | ||
| 993 | async fn test_get_throttled_domains_includes_pr_clone_urls() { | ||
| 994 | // Set up mock with throttled PR clone URL domain | ||
| 995 | let mock = MockSyncContext::new() | ||
| 996 | .with_urls(&["https://github.com/owner/repo.git"]) | ||
| 997 | .with_pr_clone_urls(&["https://pr-server.com/fork.git"]) | ||
| 998 | .with_needed_oids(&["abc123"]) | ||
| 999 | .with_pending_events(true); | ||
| 1000 | |||
| 1001 | let throttle_manager = ThrottleManager::new(1, 100); | ||
| 1002 | |||
| 1003 | // Throttle both domains | ||
| 1004 | throttle_manager.start_request("github.com"); | ||
| 1005 | throttle_manager.start_request("pr-server.com"); | ||
| 1006 | |||
| 1007 | let tried_urls = HashSet::new(); | ||
| 1008 | |||
| 1009 | let throttled = | ||
| 1010 | get_throttled_domains_with_untried_urls(&mock, "test-repo", &tried_urls, &throttle_manager) | ||
| 1011 | .await; | ||
| 1012 | |||
| 1013 | // Should include both throttled domains | ||
| 1014 | let domains: Vec<&str> = throttled.iter().map(|t| t.domain.as_str()).collect(); | ||
| 1015 | assert!( | ||
| 1016 | domains.contains(&"github.com"), | ||
| 1017 | "Should include github.com" | ||
| 1018 | ); | ||
| 1019 | assert!( | ||
| 1020 | domains.contains(&"pr-server.com"), | ||
| 1021 | "Should include pr-server.com from PR clone URLs" | ||
| 1022 | ); | ||
| 1023 | } | ||
| 1024 | |||
| 1025 | #[tokio::test] | ||
| 1026 | async fn test_sync_identifier_uses_pr_clone_urls_when_announcement_urls_fail() { | ||
| 1027 | // Set up mock where only PR clone URL can provide the needed OID | ||
| 1028 | let mock = MockSyncContext::new() | ||
| 1029 | .with_urls(&["https://github.com/owner/repo.git"]) // Doesn't have the OID | ||
| 1030 | .with_pr_clone_urls(&["https://pr-author.com/fork.git"]) // Has the OID | ||
| 1031 | .with_needed_oids(&["pr-commit-123"]) | ||
| 1032 | .with_pending_events(true) | ||
| 1033 | .url_provides("https://pr-author.com/fork.git", &["pr-commit-123"]); | ||
| 1034 | // Note: github.com doesn't provide any OIDs | ||
| 1035 | |||
| 1036 | let throttle_manager = Arc::new(ThrottleManager::new(5, 100)); | ||
| 1037 | |||
| 1038 | // Run sync_identifier | ||
| 1039 | let complete = sync_identifier(&mock, "test-repo", &throttle_manager).await; | ||
| 1040 | |||
| 1041 | // Should complete successfully using PR clone URL | ||
| 1042 | assert!(complete, "Sync should complete using PR clone URL"); | ||
| 1043 | |||
| 1044 | // Verify PR clone URL was tried | ||
| 1045 | let fetch_log = mock.fetch_log(); | ||
| 1046 | assert!( | ||
| 1047 | fetch_log.iter().any(|u| u.contains("pr-author.com")), | ||
| 1048 | "PR clone URL should have been tried: {:?}", | ||
| 1049 | fetch_log | ||
| 1050 | ); | ||
| 1051 | |||
| 1052 | // OID should be fetched | ||
| 1053 | assert!( | ||
| 1054 | mock.current_needed_oids().is_empty(), | ||
| 1055 | "OID should be fetched from PR clone URL" | ||
| 1056 | ); | ||
| 1057 | } | ||
| 858 | } | 1058 | } |