From b71111cc25b99acab786ece4607cb60e9cbebae4 Mon Sep 17 00:00:00 2001 From: DanConwayDev Date: Wed, 7 Jan 2026 17:13:17 +0000 Subject: feat(sync): extract clone URLs from PR events in purgatory Add support for extracting clone URLs from PR/PR-Update events (kind 1618/1619) during purgatory sync, per NIP-34 specification. This enables fetching PR commits from URLs specified in the PR event itself, not just from repository announcement clone URLs. Changes: - Add collect_pr_clone_urls() to SyncContext trait - Implement in RealSyncContext: extract clone tags from PR events in purgatory - Implement in MockSyncContext: configurable PR clone URLs for testing - Update sync_identifier_next_url to merge PR clone URLs with announcement URLs - Update get_throttled_domains_with_untried_urls with same merge logic - Add unit tests for PR clone URL extraction and filtering --- src/purgatory/sync/context.rs | 77 ++++++++++++++- src/purgatory/sync/functions.rs | 206 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 278 insertions(+), 5 deletions(-) (limited to 'src/purgatory/sync') diff --git a/src/purgatory/sync/context.rs b/src/purgatory/sync/context.rs index e97b708..2922f10 100644 --- a/src/purgatory/sync/context.rs +++ b/src/purgatory/sync/context.rs @@ -63,6 +63,18 @@ impl ProcessResult { /// with mocks. #[async_trait] pub trait SyncContext: Send + Sync { + /// Collect clone URLs from PR events in purgatory for a given identifier. + /// + /// PR events (kind 1618) and PR Update events (kind 1619) can include `clone` tags + /// specifying where the PR commits can be fetched from. This method extracts those + /// URLs to supplement the clone URLs from repository announcements. + /// + /// # Arguments + /// * `identifier` - The repository identifier + /// + /// # Returns + /// Set of clone URLs from PR events in purgatory for this identifier + fn collect_pr_clone_urls(&self, identifier: &str) -> HashSet; /// Get repository data (announcements, clone URLs, etc.) from the database. /// /// # Arguments @@ -232,6 +244,30 @@ impl RealSyncContext { #[async_trait] impl SyncContext for RealSyncContext { + fn collect_pr_clone_urls(&self, identifier: &str) -> HashSet { + let mut urls = HashSet::new(); + + for entry in self.purgatory.find_prs_for_identifier(identifier) { + if let Some(ref event) = entry.event { + for tag in event.tags.iter() { + let tag_vec = tag.clone().to_vec(); + if tag_vec.len() >= 2 && tag_vec[0] == "clone" { + // Clone tags can have multiple URLs: ["clone", "url1", "url2", ...] + urls.extend(tag_vec[1..].iter().cloned()); + } + } + } + } + + debug!( + identifier = %identifier, + pr_clone_urls_count = urls.len(), + "Collected clone URLs from PR events in purgatory" + ); + + urls + } + async fn fetch_repository_data(&self, identifier: &str) -> Result { crate::git::authorization::fetch_repository_data(&self.database, identifier).await } @@ -450,9 +486,12 @@ pub mod mock { /// Repository data to return from fetch_repository_data repo_data: RwLock>, - /// Clone URLs available for the repository + /// Clone URLs available for the repository (from announcements) clone_urls: Vec, + /// Clone URLs from PR events in purgatory + pr_clone_urls: HashSet, + /// OIDs still needed (decremented when "fetched") needed_oids: RwLock>, @@ -490,6 +529,7 @@ pub mod mock { Self { repo_data: RwLock::new(None), clone_urls: Vec::new(), + pr_clone_urls: HashSet::new(), needed_oids: RwLock::new(HashSet::new()), url_provides_oids: HashMap::new(), fetch_log: RwLock::new(Vec::new()), @@ -501,12 +541,18 @@ pub mod mock { } } - /// Configure clone URLs for the repository. + /// Configure clone URLs for the repository (from announcements). pub fn with_urls(mut self, urls: &[&str]) -> Self { self.clone_urls = urls.iter().map(|s| s.to_string()).collect(); self } + /// Configure clone URLs from PR events in purgatory. + pub fn with_pr_clone_urls(mut self, urls: &[&str]) -> Self { + self.pr_clone_urls = urls.iter().map(|s| s.to_string()).collect(); + self + } + /// Configure OIDs that are still needed. pub fn with_needed_oids(self, oids: &[&str]) -> Self { *self.needed_oids.write().unwrap() = oids.iter().map(|s| s.to_string()).collect(); @@ -580,6 +626,10 @@ pub mod mock { #[async_trait] impl SyncContext for MockSyncContext { + fn collect_pr_clone_urls(&self, _identifier: &str) -> HashSet { + self.pr_clone_urls.clone() + } + async fn fetch_repository_data(&self, _identifier: &str) -> Result { // Return stored repo_data or create a minimal one with clone URLs if let Some(data) = self.repo_data.read().unwrap().as_ref() { @@ -791,5 +841,28 @@ pub mod mock { mock.set_pending_events(false); assert!(!mock.has_pending_events("test-repo")); } + + #[test] + fn mock_collect_pr_clone_urls_returns_configured_urls() { + let mock = MockSyncContext::new().with_pr_clone_urls(&[ + "https://fork-server.com/repo.git", + "https://another-fork.com/repo.git", + ]); + + let urls = mock.collect_pr_clone_urls("any-identifier"); + + assert_eq!(urls.len(), 2); + assert!(urls.contains("https://fork-server.com/repo.git")); + assert!(urls.contains("https://another-fork.com/repo.git")); + } + + #[test] + fn mock_collect_pr_clone_urls_empty_by_default() { + let mock = MockSyncContext::new(); + + let urls = mock.collect_pr_clone_urls("any-identifier"); + + assert!(urls.is_empty()); + } } } diff --git a/src/purgatory/sync/functions.rs b/src/purgatory/sync/functions.rs index 13b2e47..1b9518d 100644 --- a/src/purgatory/sync/functions.rs +++ b/src/purgatory/sync/functions.rs @@ -110,13 +110,24 @@ pub async fn sync_identifier_next_url( } }; - // 4. Collect clone URLs, excluding our domain + // 4. Collect clone URLs from announcements AND PR events in purgatory let our_domain = ctx.our_domain(); - let all_urls: HashSet = repo_data + + // Get clone URLs from repository announcements + let announcement_urls: HashSet = repo_data .announcements .iter() .flat_map(|a| a.clone_urls.iter().cloned()) + .collect(); + + // Get clone URLs from PR events in purgatory + let pr_urls = ctx.collect_pr_clone_urls(identifier); + + // Merge and filter out our domain + let all_urls: HashSet = announcement_urls + .union(&pr_urls) .filter(|url| our_domain.map_or(true, |d| !url.contains(d))) + .cloned() .collect(); if all_urls.is_empty() { @@ -206,11 +217,22 @@ pub async fn get_throttled_domains_with_untried_urls( }; let our_domain = ctx.our_domain(); - let all_urls: HashSet = repo_data + + // Get clone URLs from repository announcements + let announcement_urls: HashSet = repo_data .announcements .iter() .flat_map(|a| a.clone_urls.iter().cloned()) + .collect(); + + // Get clone URLs from PR events in purgatory + let pr_urls = ctx.collect_pr_clone_urls(identifier); + + // Merge and filter out our domain + let all_urls: HashSet = announcement_urls + .union(&pr_urls) .filter(|url| our_domain.map_or(true, |d| !url.contains(d))) + .cloned() .collect(); let urls_by_domain: HashMap> = @@ -855,4 +877,182 @@ mod tests { "Expected github.com to still be throttled" ); } + + // ========================================================================= + // PR Clone URL Tests + // ========================================================================= + + #[tokio::test] + async fn test_collect_pr_clone_urls_returns_configured_urls() { + // Test that MockSyncContext returns configured PR clone URLs + let mock = MockSyncContext::new() + .with_pr_clone_urls(&[ + "https://pr-server.com/fork.git", + "https://another-server.com/fork.git", + ]); + + let pr_urls = mock.collect_pr_clone_urls("test-repo"); + + assert_eq!(pr_urls.len(), 2); + assert!(pr_urls.contains("https://pr-server.com/fork.git")); + assert!(pr_urls.contains("https://another-server.com/fork.git")); + } + + #[tokio::test] + async fn test_sync_identifier_next_url_includes_pr_clone_urls() { + // Set up mock with announcement URLs and PR clone URLs + let mock = MockSyncContext::new() + .with_urls(&["https://github.com/owner/repo.git"]) // From announcement + .with_pr_clone_urls(&["https://pr-author.com/fork.git"]) // From PR event + .with_needed_oids(&["abc123"]) + .with_pending_events(true); + + let throttle_manager = ThrottleManager::new(5, 100); + let tried_urls = HashSet::new(); + + // Get first URL + let first_url = + sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager) + .await + .expect("Should return a URL"); + + // Try the first URL + let mut tried = HashSet::new(); + tried.insert(first_url.clone()); + + // Get second URL + let second_url = + sync_identifier_next_url(&mock, "test-repo", None, &tried, &throttle_manager) + .await + .expect("Should return a second URL"); + + // Both URLs should be available (one from announcement, one from PR) + let both_urls = vec![first_url, second_url]; + assert!( + both_urls.iter().any(|u| u.contains("github.com")), + "Should include announcement URL" + ); + assert!( + both_urls.iter().any(|u| u.contains("pr-author.com")), + "Should include PR clone URL" + ); + } + + #[tokio::test] + async fn test_pr_clone_urls_filtered_by_our_domain() { + // Set up mock with PR clone URL pointing to our domain + let mock = MockSyncContext::new() + .with_urls(&["https://github.com/owner/repo.git"]) + .with_pr_clone_urls(&[ + "https://our-relay.com/fork.git", // Should be filtered + "https://external.com/fork.git", // Should be included + ]) + .with_our_domain("our-relay.com") + .with_needed_oids(&["abc123"]) + .with_pending_events(true); + + let throttle_manager = ThrottleManager::new(5, 100); + let mut tried_urls = HashSet::new(); + + // Collect all available URLs + let mut available_urls = Vec::new(); + while let Some(url) = + sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager) + .await + { + available_urls.push(url.clone()); + tried_urls.insert(url); + } + + // Should have 2 URLs (github.com and external.com), not 3 + assert_eq!( + available_urls.len(), + 2, + "Expected 2 URLs after filtering our domain, got: {:?}", + available_urls + ); + + // our-relay.com should be filtered out + assert!( + !available_urls.iter().any(|u| u.contains("our-relay.com")), + "Our domain should be filtered out" + ); + + // github.com and external.com should be present + assert!( + available_urls.iter().any(|u| u.contains("github.com")), + "github.com should be present" + ); + assert!( + available_urls.iter().any(|u| u.contains("external.com")), + "external.com should be present" + ); + } + + #[tokio::test] + async fn test_get_throttled_domains_includes_pr_clone_urls() { + // Set up mock with throttled PR clone URL domain + let mock = MockSyncContext::new() + .with_urls(&["https://github.com/owner/repo.git"]) + .with_pr_clone_urls(&["https://pr-server.com/fork.git"]) + .with_needed_oids(&["abc123"]) + .with_pending_events(true); + + let throttle_manager = ThrottleManager::new(1, 100); + + // Throttle both domains + throttle_manager.start_request("github.com"); + throttle_manager.start_request("pr-server.com"); + + let tried_urls = HashSet::new(); + + let throttled = + get_throttled_domains_with_untried_urls(&mock, "test-repo", &tried_urls, &throttle_manager) + .await; + + // Should include both throttled domains + let domains: Vec<&str> = throttled.iter().map(|t| t.domain.as_str()).collect(); + assert!( + domains.contains(&"github.com"), + "Should include github.com" + ); + assert!( + domains.contains(&"pr-server.com"), + "Should include pr-server.com from PR clone URLs" + ); + } + + #[tokio::test] + async fn test_sync_identifier_uses_pr_clone_urls_when_announcement_urls_fail() { + // Set up mock where only PR clone URL can provide the needed OID + let mock = MockSyncContext::new() + .with_urls(&["https://github.com/owner/repo.git"]) // Doesn't have the OID + .with_pr_clone_urls(&["https://pr-author.com/fork.git"]) // Has the OID + .with_needed_oids(&["pr-commit-123"]) + .with_pending_events(true) + .url_provides("https://pr-author.com/fork.git", &["pr-commit-123"]); + // Note: github.com doesn't provide any OIDs + + let throttle_manager = Arc::new(ThrottleManager::new(5, 100)); + + // Run sync_identifier + let complete = sync_identifier(&mock, "test-repo", &throttle_manager).await; + + // Should complete successfully using PR clone URL + assert!(complete, "Sync should complete using PR clone URL"); + + // Verify PR clone URL was tried + let fetch_log = mock.fetch_log(); + assert!( + fetch_log.iter().any(|u| u.contains("pr-author.com")), + "PR clone URL should have been tried: {:?}", + fetch_log + ); + + // OID should be fetched + assert!( + mock.current_needed_oids().is_empty(), + "OID should be fetched from PR clone URL" + ); + } } -- cgit v1.2.3