From b71111cc25b99acab786ece4607cb60e9cbebae4 Mon Sep 17 00:00:00 2001 From: DanConwayDev Date: Wed, 7 Jan 2026 17:13:17 +0000 Subject: feat(sync): extract clone URLs from PR events in purgatory Add support for extracting clone URLs from PR/PR-Update events (kind 1618/1619) during purgatory sync, per NIP-34 specification. This enables fetching PR commits from URLs specified in the PR event itself, not just from repository announcement clone URLs. Changes: - Add collect_pr_clone_urls() to SyncContext trait - Implement in RealSyncContext: extract clone tags from PR events in purgatory - Implement in MockSyncContext: configurable PR clone URLs for testing - Update sync_identifier_next_url to merge PR clone URLs with announcement URLs - Update get_throttled_domains_with_untried_urls with same merge logic - Add unit tests for PR clone URL extraction and filtering --- src/purgatory/sync/context.rs | 77 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 2 deletions(-) (limited to 'src/purgatory/sync/context.rs') diff --git a/src/purgatory/sync/context.rs b/src/purgatory/sync/context.rs index e97b708..2922f10 100644 --- a/src/purgatory/sync/context.rs +++ b/src/purgatory/sync/context.rs @@ -63,6 +63,18 @@ impl ProcessResult { /// with mocks. #[async_trait] pub trait SyncContext: Send + Sync { + /// Collect clone URLs from PR events in purgatory for a given identifier. + /// + /// PR events (kind 1618) and PR Update events (kind 1619) can include `clone` tags + /// specifying where the PR commits can be fetched from. This method extracts those + /// URLs to supplement the clone URLs from repository announcements. + /// + /// # Arguments + /// * `identifier` - The repository identifier + /// + /// # Returns + /// Set of clone URLs from PR events in purgatory for this identifier + fn collect_pr_clone_urls(&self, identifier: &str) -> HashSet<String>; /// Get repository data (announcements, clone URLs, etc.) from the database. 
/// /// # Arguments @@ -232,6 +244,30 @@ impl RealSyncContext { #[async_trait] impl SyncContext for RealSyncContext { + fn collect_pr_clone_urls(&self, identifier: &str) -> HashSet<String> { + let mut urls = HashSet::new(); + + for entry in self.purgatory.find_prs_for_identifier(identifier) { + if let Some(ref event) = entry.event { + for tag in event.tags.iter() { + let tag_vec = tag.clone().to_vec(); + if tag_vec.len() >= 2 && tag_vec[0] == "clone" { + // Clone tags can have multiple URLs: ["clone", "url1", "url2", ...] + urls.extend(tag_vec[1..].iter().cloned()); + } + } + } + } + + debug!( + identifier = %identifier, + pr_clone_urls_count = urls.len(), + "Collected clone URLs from PR events in purgatory" + ); + + urls + } + async fn fetch_repository_data(&self, identifier: &str) -> Result { crate::git::authorization::fetch_repository_data(&self.database, identifier).await } @@ -450,9 +486,12 @@ pub mod mock { /// Repository data to return from fetch_repository_data repo_data: RwLock>, - /// Clone URLs available for the repository + /// Clone URLs available for the repository (from announcements) clone_urls: Vec<String>, + /// Clone URLs from PR events in purgatory + pr_clone_urls: HashSet<String>, + /// OIDs still needed (decremented when "fetched") needed_oids: RwLock<HashSet<String>>, @@ -490,6 +529,7 @@ pub mod mock { Self { repo_data: RwLock::new(None), clone_urls: Vec::new(), + pr_clone_urls: HashSet::new(), needed_oids: RwLock::new(HashSet::new()), url_provides_oids: HashMap::new(), fetch_log: RwLock::new(Vec::new()), @@ -501,12 +541,18 @@ pub mod mock { } } - /// Configure clone URLs for the repository. + /// Configure clone URLs for the repository (from announcements). pub fn with_urls(mut self, urls: &[&str]) -> Self { self.clone_urls = urls.iter().map(|s| s.to_string()).collect(); self } + /// Configure clone URLs from PR events in purgatory. 
+ pub fn with_pr_clone_urls(mut self, urls: &[&str]) -> Self { + self.pr_clone_urls = urls.iter().map(|s| s.to_string()).collect(); + self + } + /// Configure OIDs that are still needed. pub fn with_needed_oids(self, oids: &[&str]) -> Self { *self.needed_oids.write().unwrap() = oids.iter().map(|s| s.to_string()).collect(); @@ -580,6 +626,10 @@ pub mod mock { #[async_trait] impl SyncContext for MockSyncContext { + fn collect_pr_clone_urls(&self, _identifier: &str) -> HashSet<String> { + self.pr_clone_urls.clone() + } + async fn fetch_repository_data(&self, _identifier: &str) -> Result { // Return stored repo_data or create a minimal one with clone URLs if let Some(data) = self.repo_data.read().unwrap().as_ref() { @@ -791,5 +841,28 @@ pub mod mock { mock.set_pending_events(false); assert!(!mock.has_pending_events("test-repo")); } + + #[test] + fn mock_collect_pr_clone_urls_returns_configured_urls() { + let mock = MockSyncContext::new().with_pr_clone_urls(&[ + "https://fork-server.com/repo.git", + "https://another-fork.com/repo.git", + ]); + + let urls = mock.collect_pr_clone_urls("any-identifier"); + + assert_eq!(urls.len(), 2); + assert!(urls.contains("https://fork-server.com/repo.git")); + assert!(urls.contains("https://another-fork.com/repo.git")); + } + + #[test] + fn mock_collect_pr_clone_urls_empty_by_default() { + let mock = MockSyncContext::new(); + + let urls = mock.collect_pr_clone_urls("any-identifier"); + + assert!(urls.is_empty()); + } } } -- cgit v1.2.3