diff options
Diffstat (limited to 'src/purgatory/sync/functions.rs')
| -rw-r--r-- | src/purgatory/sync/functions.rs | 69 |
1 files changed, 36 insertions, 33 deletions
diff --git a/src/purgatory/sync/functions.rs b/src/purgatory/sync/functions.rs index bb7c0b9..370990e 100644 --- a/src/purgatory/sync/functions.rs +++ b/src/purgatory/sync/functions.rs | |||
| @@ -32,15 +32,17 @@ use super::throttle::ThrottleManager; | |||
| 32 | fn extract_domain(url: &str) -> Option<String> { | 32 | fn extract_domain(url: &str) -> Option<String> { |
| 33 | // Simple URL parsing for HTTP(S) URLs | 33 | // Simple URL parsing for HTTP(S) URLs |
| 34 | // Format: scheme://[user@]host[:port]/path | 34 | // Format: scheme://[user@]host[:port]/path |
| 35 | let url = url.strip_prefix("https://").or_else(|| url.strip_prefix("http://"))?; | 35 | let url = url |
| 36 | 36 | .strip_prefix("https://") | |
| 37 | .or_else(|| url.strip_prefix("http://"))?; | ||
| 38 | |||
| 37 | // Remove user info if present (e.g., "user@host" -> "host") | 39 | // Remove user info if present (e.g., "user@host" -> "host") |
| 38 | let url = url.split('@').next_back()?; | 40 | let url = url.split('@').next_back()?; |
| 39 | 41 | ||
| 40 | // Extract host (before first '/' or ':') | 42 | // Extract host (before first '/' or ':') |
| 41 | let host = url.split('/').next()?; | 43 | let host = url.split('/').next()?; |
| 42 | let host = host.split(':').next()?; | 44 | let host = host.split(':').next()?; |
| 43 | 45 | ||
| 44 | if host.is_empty() { | 46 | if host.is_empty() { |
| 45 | None | 47 | None |
| 46 | } else { | 48 | } else { |
| @@ -112,17 +114,17 @@ pub async fn sync_identifier_next_url<C: SyncContext + ?Sized>( | |||
| 112 | 114 | ||
| 113 | // 4. Collect clone URLs from announcements AND PR events in purgatory | 115 | // 4. Collect clone URLs from announcements AND PR events in purgatory |
| 114 | let our_domain = ctx.our_domain(); | 116 | let our_domain = ctx.our_domain(); |
| 115 | 117 | ||
| 116 | // Get clone URLs from repository announcements | 118 | // Get clone URLs from repository announcements |
| 117 | let announcement_urls: HashSet<String> = repo_data | 119 | let announcement_urls: HashSet<String> = repo_data |
| 118 | .announcements | 120 | .announcements |
| 119 | .iter() | 121 | .iter() |
| 120 | .flat_map(|a| a.clone_urls.iter().cloned()) | 122 | .flat_map(|a| a.clone_urls.iter().cloned()) |
| 121 | .collect(); | 123 | .collect(); |
| 122 | 124 | ||
| 123 | // Get clone URLs from PR events in purgatory | 125 | // Get clone URLs from PR events in purgatory |
| 124 | let pr_urls = ctx.collect_pr_clone_urls(identifier); | 126 | let pr_urls = ctx.collect_pr_clone_urls(identifier); |
| 125 | 127 | ||
| 126 | // Merge and filter out our domain | 128 | // Merge and filter out our domain |
| 127 | let all_urls: HashSet<String> = announcement_urls | 129 | let all_urls: HashSet<String> = announcement_urls |
| 128 | .union(&pr_urls) | 130 | .union(&pr_urls) |
| @@ -151,11 +153,9 @@ pub async fn sync_identifier_next_url<C: SyncContext + ?Sized>( | |||
| 151 | match domain { | 153 | match domain { |
| 152 | Some(specific_domain) => { | 154 | Some(specific_domain) => { |
| 153 | // Only look at URLs from this specific domain | 155 | // Only look at URLs from this specific domain |
| 154 | urls_by_domain.get(specific_domain).and_then(|urls| { | 156 | urls_by_domain |
| 155 | urls.iter() | 157 | .get(specific_domain) |
| 156 | .find(|url| !tried_urls.contains(*url)) | 158 | .and_then(|urls| urls.iter().find(|url| !tried_urls.contains(*url)).cloned()) |
| 157 | .cloned() | ||
| 158 | }) | ||
| 159 | } | 159 | } |
| 160 | None => { | 160 | None => { |
| 161 | // Try any non-throttled domain | 161 | // Try any non-throttled domain |
| @@ -217,17 +217,17 @@ pub async fn get_throttled_domains_with_untried_urls<C: SyncContext + ?Sized>( | |||
| 217 | }; | 217 | }; |
| 218 | 218 | ||
| 219 | let our_domain = ctx.our_domain(); | 219 | let our_domain = ctx.our_domain(); |
| 220 | 220 | ||
| 221 | // Get clone URLs from repository announcements | 221 | // Get clone URLs from repository announcements |
| 222 | let announcement_urls: HashSet<String> = repo_data | 222 | let announcement_urls: HashSet<String> = repo_data |
| 223 | .announcements | 223 | .announcements |
| 224 | .iter() | 224 | .iter() |
| 225 | .flat_map(|a| a.clone_urls.iter().cloned()) | 225 | .flat_map(|a| a.clone_urls.iter().cloned()) |
| 226 | .collect(); | 226 | .collect(); |
| 227 | 227 | ||
| 228 | // Get clone URLs from PR events in purgatory | 228 | // Get clone URLs from PR events in purgatory |
| 229 | let pr_urls = ctx.collect_pr_clone_urls(identifier); | 229 | let pr_urls = ctx.collect_pr_clone_urls(identifier); |
| 230 | 230 | ||
| 231 | // Merge and filter out our domain | 231 | // Merge and filter out our domain |
| 232 | let all_urls: HashSet<String> = announcement_urls | 232 | let all_urls: HashSet<String> = announcement_urls |
| 233 | .union(&pr_urls) | 233 | .union(&pr_urls) |
| @@ -766,9 +766,13 @@ mod tests { | |||
| 766 | let mut tried_urls = HashSet::new(); | 766 | let mut tried_urls = HashSet::new(); |
| 767 | tried_urls.insert("https://github.com/foo/bar.git".to_string()); | 767 | tried_urls.insert("https://github.com/foo/bar.git".to_string()); |
| 768 | 768 | ||
| 769 | let throttled = | 769 | let throttled = get_throttled_domains_with_untried_urls( |
| 770 | get_throttled_domains_with_untried_urls(&mock, "test-repo", &tried_urls, &throttle_manager) | 770 | &mock, |
| 771 | .await; | 771 | "test-repo", |
| 772 | &tried_urls, | ||
| 773 | &throttle_manager, | ||
| 774 | ) | ||
| 775 | .await; | ||
| 772 | 776 | ||
| 773 | // Should only include gitlab.com (throttled with untried URLs) | 777 | // Should only include gitlab.com (throttled with untried URLs) |
| 774 | // github.com is throttled but URL was tried | 778 | // github.com is throttled but URL was tried |
| @@ -885,11 +889,10 @@ mod tests { | |||
| 885 | #[tokio::test] | 889 | #[tokio::test] |
| 886 | async fn test_collect_pr_clone_urls_returns_configured_urls() { | 890 | async fn test_collect_pr_clone_urls_returns_configured_urls() { |
| 887 | // Test that MockSyncContext returns configured PR clone URLs | 891 | // Test that MockSyncContext returns configured PR clone URLs |
| 888 | let mock = MockSyncContext::new() | 892 | let mock = MockSyncContext::new().with_pr_clone_urls(&[ |
| 889 | .with_pr_clone_urls(&[ | 893 | "https://pr-server.com/fork.git", |
| 890 | "https://pr-server.com/fork.git", | 894 | "https://another-server.com/fork.git", |
| 891 | "https://another-server.com/fork.git", | 895 | ]); |
| 892 | ]); | ||
| 893 | 896 | ||
| 894 | let pr_urls = mock.collect_pr_clone_urls("test-repo"); | 897 | let pr_urls = mock.collect_pr_clone_urls("test-repo"); |
| 895 | 898 | ||
| @@ -945,7 +948,7 @@ mod tests { | |||
| 945 | .with_urls(&["https://github.com/owner/repo.git"]) | 948 | .with_urls(&["https://github.com/owner/repo.git"]) |
| 946 | .with_pr_clone_urls(&[ | 949 | .with_pr_clone_urls(&[ |
| 947 | "https://our-relay.com/fork.git", // Should be filtered | 950 | "https://our-relay.com/fork.git", // Should be filtered |
| 948 | "https://external.com/fork.git", // Should be included | 951 | "https://external.com/fork.git", // Should be included |
| 949 | ]) | 952 | ]) |
| 950 | .with_our_domain("our-relay.com") | 953 | .with_our_domain("our-relay.com") |
| 951 | .with_needed_oids(&["abc123"]) | 954 | .with_needed_oids(&["abc123"]) |
| @@ -957,8 +960,7 @@ mod tests { | |||
| 957 | // Collect all available URLs | 960 | // Collect all available URLs |
| 958 | let mut available_urls = Vec::new(); | 961 | let mut available_urls = Vec::new(); |
| 959 | while let Some(url) = | 962 | while let Some(url) = |
| 960 | sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager) | 963 | sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager).await |
| 961 | .await | ||
| 962 | { | 964 | { |
| 963 | available_urls.push(url.clone()); | 965 | available_urls.push(url.clone()); |
| 964 | tried_urls.insert(url); | 966 | tried_urls.insert(url); |
| @@ -1006,16 +1008,17 @@ mod tests { | |||
| 1006 | 1008 | ||
| 1007 | let tried_urls = HashSet::new(); | 1009 | let tried_urls = HashSet::new(); |
| 1008 | 1010 | ||
| 1009 | let throttled = | 1011 | let throttled = get_throttled_domains_with_untried_urls( |
| 1010 | get_throttled_domains_with_untried_urls(&mock, "test-repo", &tried_urls, &throttle_manager) | 1012 | &mock, |
| 1011 | .await; | 1013 | "test-repo", |
| 1014 | &tried_urls, | ||
| 1015 | &throttle_manager, | ||
| 1016 | ) | ||
| 1017 | .await; | ||
| 1012 | 1018 | ||
| 1013 | // Should include both throttled domains | 1019 | // Should include both throttled domains |
| 1014 | let domains: Vec<&str> = throttled.iter().map(|t| t.domain.as_str()).collect(); | 1020 | let domains: Vec<&str> = throttled.iter().map(|t| t.domain.as_str()).collect(); |
| 1015 | assert!( | 1021 | assert!(domains.contains(&"github.com"), "Should include github.com"); |
| 1016 | domains.contains(&"github.com"), | ||
| 1017 | "Should include github.com" | ||
| 1018 | ); | ||
| 1019 | assert!( | 1022 | assert!( |
| 1020 | domains.contains(&"pr-server.com"), | 1023 | domains.contains(&"pr-server.com"), |
| 1021 | "Should include pr-server.com from PR clone URLs" | 1024 | "Should include pr-server.com from PR clone URLs" |