diff options
| author | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-10 02:14:01 +0000 |
|---|---|---|
| committer | DanConwayDev <DanConwayDev@protonmail.com> | 2026-01-10 02:15:22 +0000 |
| commit | 1b6b669b9b82d1f81b887a32055f19c53d3bb8bf (patch) | |
| tree | 3ce1785757cb3f16dfa30d74557042973d3bf53f /src | |
| parent | 730f430c906c6c2d43ea8f2e5fc3b408a3de128b (diff) | |
Add naughty list for git remotes with persistent SSL/DNS errors
Implement domain-level naughty list tracking for git remotes, reusing the
existing NaughtyListTracker from relay sync. This prevents repeated attempts
to fetch from git domains with persistent infrastructure issues (SSL/TLS
certificate errors, DNS failures).
Changes:
- Updated NaughtyListTracker to track both relay URLs and git domains
- Added git_naughty_list field to RealSyncContext for error classification
- Modified fetch_oids() to classify git fetch errors and record naughty domains
- Updated sync_identifier_next_url() to filter out naughty domains during URL selection
- Added git_naughty_list parameter to ThrottleManager for domain queue processing
- Threaded naughty list through start_sync_loop and all sync functions
- Updated all tests to pass naughty list parameter
The naughty list uses 12-hour expiration (configurable) to allow domains to
recover from infrastructure issues. First occurrence logs WARN, repeats log DEBUG.
Diffstat (limited to 'src')
| -rw-r--r-- | src/main.rs | 14 | ||||
| -rw-r--r-- | src/purgatory/sync/context.rs | 38 | ||||
| -rw-r--r-- | src/purgatory/sync/functions.rs | 177 | ||||
| -rw-r--r-- | src/purgatory/sync/loop.rs | 15 | ||||
| -rw-r--r-- | src/purgatory/sync/throttle.rs | 35 | ||||
| -rw-r--r-- | src/sync/naughty_list.rs | 74 |
6 files changed, 275 insertions, 78 deletions
diff --git a/src/main.rs b/src/main.rs index 5e9e2d0..44545b5 100644 --- a/src/main.rs +++ b/src/main.rs | |||
| @@ -12,7 +12,7 @@ use ngit_grasp::{ | |||
| 12 | metrics::Metrics, | 12 | metrics::Metrics, |
| 13 | nostr, | 13 | nostr, |
| 14 | purgatory::{sync::RealSyncContext, sync::ThrottleManager, Purgatory}, | 14 | purgatory::{sync::RealSyncContext, sync::ThrottleManager, Purgatory}, |
| 15 | sync::SyncManager, | 15 | sync::{naughty_list::NaughtyListTracker, SyncManager}, |
| 16 | }; | 16 | }; |
| 17 | 17 | ||
| 18 | #[tokio::main] | 18 | #[tokio::main] |
| @@ -132,23 +132,29 @@ async fn main() -> Result<()> { | |||
| 132 | info!("Expired event cleanup task started (24h interval, keeps 7 days)"); | 132 | info!("Expired event cleanup task started (24h interval, keeps 7 days)"); |
| 133 | 133 | ||
| 134 | // Start purgatory sync loop for background git data fetching | 134 | // Start purgatory sync loop for background git data fetching |
| 135 | // Create naughty list tracker for git remote domains with persistent errors (12h expiration) | ||
| 136 | let git_naughty_list = Arc::new(NaughtyListTracker::with_defaults()); | ||
| 137 | |||
| 135 | let sync_ctx = Arc::new(RealSyncContext::new( | 138 | let sync_ctx = Arc::new(RealSyncContext::new( |
| 136 | purgatory.clone(), | 139 | purgatory.clone(), |
| 137 | relay_with_db.database.clone(), | 140 | relay_with_db.database.clone(), |
| 138 | PathBuf::from(config.effective_git_data_path()), | 141 | PathBuf::from(config.effective_git_data_path()), |
| 139 | Some(config.domain.clone()), | 142 | Some(config.domain.clone()), |
| 140 | Some(relay_with_db.relay.clone()), | 143 | Some(relay_with_db.relay.clone()), |
| 144 | git_naughty_list.clone(), | ||
| 141 | )); | 145 | )); |
| 142 | 146 | ||
| 143 | // Create throttle manager for rate limiting remote git servers | 147 | // Create throttle manager for rate limiting remote git servers |
| 144 | // Default: 5 concurrent requests per domain, 30 requests per minute per domain | 148 | // Default: 5 concurrent requests per domain, 30 requests per minute per domain |
| 145 | let throttle_manager = Arc::new(ThrottleManager::new(5, 30)); | 149 | let throttle_manager = Arc::new(ThrottleManager::new(5, 30)); |
| 146 | throttle_manager.set_context(sync_ctx.clone()); | 150 | throttle_manager.set_context(sync_ctx.clone()); |
| 151 | throttle_manager.set_git_naughty_list(git_naughty_list.clone()); | ||
| 147 | 152 | ||
| 148 | // Start the sync loop | 153 | // Start the sync loop |
| 149 | let _sync_loop_handle = purgatory | 154 | let _sync_loop_handle = |
| 150 | .clone() | 155 | purgatory |
| 151 | .start_sync_loop(sync_ctx, throttle_manager); | 156 | .clone() |
| 157 | .start_sync_loop(sync_ctx, throttle_manager, git_naughty_list.clone()); | ||
| 152 | info!("Purgatory sync loop started (1s interval)"); | 158 | info!("Purgatory sync loop started (1s interval)"); |
| 153 | 159 | ||
| 154 | // Setup shutdown handler for purgatory cleanup | 160 | // Setup shutdown handler for purgatory cleanup |
diff --git a/src/purgatory/sync/context.rs b/src/purgatory/sync/context.rs index e61de01..3c2c683 100644 --- a/src/purgatory/sync/context.rs +++ b/src/purgatory/sync/context.rs | |||
| @@ -187,6 +187,9 @@ use tracing::debug; | |||
| 187 | use crate::nostr::builder::SharedDatabase; | 187 | use crate::nostr::builder::SharedDatabase; |
| 188 | use crate::nostr::events::RepositoryState; | 188 | use crate::nostr::events::RepositoryState; |
| 189 | use crate::purgatory::Purgatory; | 189 | use crate::purgatory::Purgatory; |
| 190 | use crate::sync::naughty_list::NaughtyListTracker; | ||
| 191 | |||
| 192 | use super::functions::extract_domain; | ||
| 190 | 193 | ||
| 191 | /// Real implementation of `SyncContext` that connects to actual systems. | 194 | /// Real implementation of `SyncContext` that connects to actual systems. |
| 192 | /// | 195 | /// |
| @@ -210,6 +213,9 @@ pub struct RealSyncContext { | |||
| 210 | 213 | ||
| 211 | /// Local relay for notifying WebSocket subscribers | 214 | /// Local relay for notifying WebSocket subscribers |
| 212 | local_relay: Option<LocalRelay>, | 215 | local_relay: Option<LocalRelay>, |
| 216 | |||
| 217 | /// Naughty list tracker for git remote domains with persistent errors | ||
| 218 | git_naughty_list: Arc<NaughtyListTracker>, | ||
| 213 | } | 219 | } |
| 214 | 220 | ||
| 215 | impl RealSyncContext { | 221 | impl RealSyncContext { |
| @@ -221,12 +227,14 @@ impl RealSyncContext { | |||
| 221 | /// * `git_data_path` - Base path for git repositories | 227 | /// * `git_data_path` - Base path for git repositories |
| 222 | /// * `our_domain` - Our domain to exclude from clone URLs | 228 | /// * `our_domain` - Our domain to exclude from clone URLs |
| 223 | /// * `local_relay` - Local relay for WebSocket notifications | 229 | /// * `local_relay` - Local relay for WebSocket notifications |
| 230 | /// * `git_naughty_list` - Naughty list tracker for git remote domains | ||
| 224 | pub fn new( | 231 | pub fn new( |
| 225 | purgatory: Arc<Purgatory>, | 232 | purgatory: Arc<Purgatory>, |
| 226 | database: SharedDatabase, | 233 | database: SharedDatabase, |
| 227 | git_data_path: PathBuf, | 234 | git_data_path: PathBuf, |
| 228 | our_domain: Option<String>, | 235 | our_domain: Option<String>, |
| 229 | local_relay: Option<LocalRelay>, | 236 | local_relay: Option<LocalRelay>, |
| 237 | git_naughty_list: Arc<NaughtyListTracker>, | ||
| 230 | ) -> Self { | 238 | ) -> Self { |
| 231 | Self { | 239 | Self { |
| 232 | purgatory, | 240 | purgatory, |
| @@ -234,8 +242,14 @@ impl RealSyncContext { | |||
| 234 | git_data_path, | 242 | git_data_path, |
| 235 | our_domain_value: our_domain, | 243 | our_domain_value: our_domain, |
| 236 | local_relay, | 244 | local_relay, |
| 245 | git_naughty_list, | ||
| 237 | } | 246 | } |
| 238 | } | 247 | } |
| 248 | |||
| 249 | /// Get reference to the git naughty list tracker | ||
| 250 | pub fn git_naughty_list(&self) -> &Arc<NaughtyListTracker> { | ||
| 251 | &self.git_naughty_list | ||
| 252 | } | ||
| 239 | } | 253 | } |
| 240 | 254 | ||
| 241 | #[async_trait] | 255 | #[async_trait] |
| @@ -344,6 +358,7 @@ impl SyncContext for RealSyncContext { | |||
| 344 | let repo_path = repo_path.to_path_buf(); | 358 | let repo_path = repo_path.to_path_buf(); |
| 345 | let url = url.to_string(); | 359 | let url = url.to_string(); |
| 346 | let missing_oids: Vec<String> = missing.into_iter().cloned().collect(); | 360 | let missing_oids: Vec<String> = missing.into_iter().cloned().collect(); |
| 361 | let naughty_list = self.git_naughty_list.clone(); | ||
| 347 | 362 | ||
| 348 | tokio::task::spawn_blocking(move || -> Result<Vec<String>> { | 363 | tokio::task::spawn_blocking(move || -> Result<Vec<String>> { |
| 349 | // git fetch <remote> <sha1> <sha2> ... - fetch all OIDs in one command | 364 | // git fetch <remote> <sha1> <sha2> ... - fetch all OIDs in one command |
| @@ -370,6 +385,29 @@ impl SyncContext for RealSyncContext { | |||
| 370 | } | 385 | } |
| 371 | Ok(result) => { | 386 | Ok(result) => { |
| 372 | let stderr = String::from_utf8_lossy(&result.stderr); | 387 | let stderr = String::from_utf8_lossy(&result.stderr); |
| 388 | |||
| 389 | // Extract domain and classify error for naughty list | ||
| 390 | if let Some(domain) = extract_domain(&url) { | ||
| 391 | if let Some(category) = NaughtyListTracker::classify_error(&stderr) { | ||
| 392 | let is_new = naughty_list.record(&domain, category, stderr.to_string()); | ||
| 393 | |||
| 394 | if is_new { | ||
| 395 | tracing::warn!( | ||
| 396 | domain = %domain, | ||
| 397 | category = %category, | ||
| 398 | error = %stderr, | ||
| 399 | "Git remote domain added to naughty list" | ||
| 400 | ); | ||
| 401 | } else { | ||
| 402 | debug!( | ||
| 403 | domain = %domain, | ||
| 404 | category = %category, | ||
| 405 | "Git remote domain still on naughty list" | ||
| 406 | ); | ||
| 407 | } | ||
| 408 | } | ||
| 409 | } | ||
| 410 | |||
| 373 | Err(anyhow::anyhow!("git fetch failed: {}", stderr)) | 411 | Err(anyhow::anyhow!("git fetch failed: {}", stderr)) |
| 374 | } | 412 | } |
| 375 | Err(e) => Err(anyhow::anyhow!("git fetch command error: {}", e)), | 413 | Err(e) => Err(anyhow::anyhow!("git fetch command error: {}", e)), |
diff --git a/src/purgatory/sync/functions.rs b/src/purgatory/sync/functions.rs index 0139ac5..65d29af 100644 --- a/src/purgatory/sync/functions.rs +++ b/src/purgatory/sync/functions.rs | |||
| @@ -17,6 +17,7 @@ use tracing::debug; | |||
| 17 | 17 | ||
| 18 | use super::context::SyncContext; | 18 | use super::context::SyncContext; |
| 19 | use super::throttle::ThrottleManager; | 19 | use super::throttle::ThrottleManager; |
| 20 | use crate::sync::naughty_list::NaughtyListTracker; | ||
| 20 | 21 | ||
| 21 | /// Extract domain from a URL. | 22 | /// Extract domain from a URL. |
| 22 | /// | 23 | /// |
| @@ -29,7 +30,7 @@ use super::throttle::ThrottleManager; | |||
| 29 | /// assert_eq!(extract_domain("http://example.com:8080/repo.git"), Some("example.com".to_string())); | 30 | /// assert_eq!(extract_domain("http://example.com:8080/repo.git"), Some("example.com".to_string())); |
| 30 | /// assert_eq!(extract_domain("git@github.com:foo/bar.git"), None); // SSH URLs not supported | 31 | /// assert_eq!(extract_domain("git@github.com:foo/bar.git"), None); // SSH URLs not supported |
| 31 | /// ``` | 32 | /// ``` |
| 32 | fn extract_domain(url: &str) -> Option<String> { | 33 | pub(crate) fn extract_domain(url: &str) -> Option<String> { |
| 33 | // Simple URL parsing for HTTP(S) URLs | 34 | // Simple URL parsing for HTTP(S) URLs |
| 34 | // Format: scheme://[user@]host[:port]/path | 35 | // Format: scheme://[user@]host[:port]/path |
| 35 | let url = url | 36 | let url = url |
| @@ -57,7 +58,8 @@ fn extract_domain(url: &str) -> Option<String> { | |||
| 57 | /// 2. Checks if there are OIDs still needed | 58 | /// 2. Checks if there are OIDs still needed |
| 58 | /// 3. Gets repository data and extracts clone URLs | 59 | /// 3. Gets repository data and extracts clone URLs |
| 59 | /// 4. Filters out our own domain and already-tried URLs | 60 | /// 4. Filters out our own domain and already-tried URLs |
| 60 | /// 5. Returns the first non-throttled URL (when `domain` is None) | 61 | /// 5. Filters out naughty domains (with persistent SSL/DNS errors) |
| 62 | /// 6. Returns the first non-throttled URL (when `domain` is None) | ||
| 61 | /// or a URL from the specified domain (when `domain` is Some) | 63 | /// or a URL from the specified domain (when `domain` is Some) |
| 62 | /// | 64 | /// |
| 63 | /// # Arguments | 65 | /// # Arguments |
| @@ -68,6 +70,7 @@ fn extract_domain(url: &str) -> Option<String> { | |||
| 68 | /// If None, return any non-throttled URL. | 70 | /// If None, return any non-throttled URL. |
| 69 | /// * `tried_urls` - URLs that have already been tried (will be skipped) | 71 | /// * `tried_urls` - URLs that have already been tried (will be skipped) |
| 70 | /// * `throttle_manager` - Used to check if domains are throttled (when domain is None) | 72 | /// * `throttle_manager` - Used to check if domains are throttled (when domain is None) |
| 73 | /// * `git_naughty_list` - Used to filter out domains with persistent errors | ||
| 71 | /// | 74 | /// |
| 72 | /// # Returns | 75 | /// # Returns |
| 73 | /// | 76 | /// |
| @@ -79,6 +82,7 @@ pub async fn sync_identifier_next_url<C: SyncContext + ?Sized>( | |||
| 79 | domain: Option<&str>, | 82 | domain: Option<&str>, |
| 80 | tried_urls: &HashSet<String>, | 83 | tried_urls: &HashSet<String>, |
| 81 | throttle_manager: &ThrottleManager, | 84 | throttle_manager: &ThrottleManager, |
| 85 | git_naughty_list: &NaughtyListTracker, | ||
| 82 | ) -> Option<String> { | 86 | ) -> Option<String> { |
| 83 | // 1. Check if we still have pending events | 87 | // 1. Check if we still have pending events |
| 84 | if !ctx.has_pending_events(identifier) { | 88 | if !ctx.has_pending_events(identifier) { |
| @@ -158,7 +162,7 @@ pub async fn sync_identifier_next_url<C: SyncContext + ?Sized>( | |||
| 158 | .and_then(|urls| urls.iter().find(|url| !tried_urls.contains(*url)).cloned()) | 162 | .and_then(|urls| urls.iter().find(|url| !tried_urls.contains(*url)).cloned()) |
| 159 | } | 163 | } |
| 160 | None => { | 164 | None => { |
| 161 | // Try any non-throttled domain | 165 | // Try any non-throttled, non-naughty domain |
| 162 | for (d, domain_urls) in &urls_by_domain { | 166 | for (d, domain_urls) in &urls_by_domain { |
| 163 | if throttle_manager.is_throttled(d) { | 167 | if throttle_manager.is_throttled(d) { |
| 164 | debug!( | 168 | debug!( |
| @@ -168,6 +172,17 @@ pub async fn sync_identifier_next_url<C: SyncContext + ?Sized>( | |||
| 168 | ); | 172 | ); |
| 169 | continue; | 173 | continue; |
| 170 | } | 174 | } |
| 175 | |||
| 176 | // NEW: Skip naughty domains | ||
| 177 | if git_naughty_list.is_naughty(d) { | ||
| 178 | debug!( | ||
| 179 | identifier = %identifier, | ||
| 180 | domain = %d, | ||
| 181 | "Domain is on git naughty list - skipping" | ||
| 182 | ); | ||
| 183 | continue; | ||
| 184 | } | ||
| 185 | |||
| 171 | if let Some(url) = domain_urls.iter().find(|url| !tried_urls.contains(*url)) { | 186 | if let Some(url) = domain_urls.iter().find(|url| !tried_urls.contains(*url)) { |
| 172 | return Some(url.clone()); | 187 | return Some(url.clone()); |
| 173 | } | 188 | } |
| @@ -200,6 +215,7 @@ pub struct ThrottledDomainInfo { | |||
| 200 | /// * `identifier` - The repository identifier | 215 | /// * `identifier` - The repository identifier |
| 201 | /// * `tried_urls` - All URLs that have been tried (across all domains) | 216 | /// * `tried_urls` - All URLs that have been tried (across all domains) |
| 202 | /// * `throttle_manager` - Used to check which domains are throttled | 217 | /// * `throttle_manager` - Used to check which domains are throttled |
| 218 | /// * `git_naughty_list` - Used to filter out domains with persistent errors | ||
| 203 | /// | 219 | /// |
| 204 | /// # Returns | 220 | /// # Returns |
| 205 | /// | 221 | /// |
| @@ -210,6 +226,7 @@ pub async fn get_throttled_domains_with_untried_urls<C: SyncContext + ?Sized>( | |||
| 210 | identifier: &str, | 226 | identifier: &str, |
| 211 | tried_urls: &HashSet<String>, | 227 | tried_urls: &HashSet<String>, |
| 212 | throttle_manager: &ThrottleManager, | 228 | throttle_manager: &ThrottleManager, |
| 229 | git_naughty_list: &NaughtyListTracker, | ||
| 213 | ) -> Vec<ThrottledDomainInfo> { | 230 | ) -> Vec<ThrottledDomainInfo> { |
| 214 | let repo_data = match ctx.fetch_repository_data(identifier).await { | 231 | let repo_data = match ctx.fetch_repository_data(identifier).await { |
| 215 | Ok(data) => data, | 232 | Ok(data) => data, |
| @@ -250,6 +267,11 @@ pub async fn get_throttled_domains_with_untried_urls<C: SyncContext + ?Sized>( | |||
| 250 | return None; // Not throttled, skip | 267 | return None; // Not throttled, skip |
| 251 | } | 268 | } |
| 252 | 269 | ||
| 270 | // Skip naughty domains | ||
| 271 | if git_naughty_list.is_naughty(&domain) { | ||
| 272 | return None; // On naughty list, skip | ||
| 273 | } | ||
| 274 | |||
| 253 | let untried: Vec<_> = domain_urls | 275 | let untried: Vec<_> = domain_urls |
| 254 | .iter() | 276 | .iter() |
| 255 | .filter(|url| !tried_urls.contains(*url)) | 277 | .filter(|url| !tried_urls.contains(*url)) |
| @@ -388,7 +410,7 @@ pub async fn sync_identifier_from_url<C: SyncContext + ?Sized>( | |||
| 388 | /// Sync git data for an identifier. | 410 | /// Sync git data for an identifier. |
| 389 | /// | 411 | /// |
| 390 | /// This is the main orchestration function called by the sync loop. It: | 412 | /// This is the main orchestration function called by the sync loop. It: |
| 391 | /// 1. Tries all non-throttled URLs in sequence | 413 | /// 1. Tries all non-throttled, non-naughty URLs in sequence |
| 392 | /// 2. After each fetch, checks if sync is complete (no pending events or no needed OIDs) | 414 | /// 2. After each fetch, checks if sync is complete (no pending events or no needed OIDs) |
| 393 | /// 3. When no non-throttled URLs remain, enqueues with throttled domains for later processing | 415 | /// 3. When no non-throttled URLs remain, enqueues with throttled domains for later processing |
| 394 | /// 4. Returns without waiting for throttled domains to complete | 416 | /// 4. Returns without waiting for throttled domains to complete |
| @@ -398,6 +420,7 @@ pub async fn sync_identifier_from_url<C: SyncContext + ?Sized>( | |||
| 398 | /// * `ctx` - The sync context providing repository data and OID information | 420 | /// * `ctx` - The sync context providing repository data and OID information |
| 399 | /// * `identifier` - The repository identifier (d-tag value) | 421 | /// * `identifier` - The repository identifier (d-tag value) |
| 400 | /// * `throttle_manager` - Used for rate limiting and domain queue management | 422 | /// * `throttle_manager` - Used for rate limiting and domain queue management |
| 423 | /// * `git_naughty_list` - Used to filter out domains with persistent errors | ||
| 401 | /// | 424 | /// |
| 402 | /// # Returns | 425 | /// # Returns |
| 403 | /// | 426 | /// |
| @@ -408,6 +431,7 @@ pub async fn sync_identifier<C: SyncContext + ?Sized>( | |||
| 408 | ctx: &C, | 431 | ctx: &C, |
| 409 | identifier: &str, | 432 | identifier: &str, |
| 410 | throttle_manager: &Arc<ThrottleManager>, | 433 | throttle_manager: &Arc<ThrottleManager>, |
| 434 | git_naughty_list: &NaughtyListTracker, | ||
| 411 | ) -> bool { | 435 | ) -> bool { |
| 412 | let mut tried_urls: HashSet<String> = HashSet::new(); | 436 | let mut tried_urls: HashSet<String> = HashSet::new(); |
| 413 | 437 | ||
| @@ -416,9 +440,18 @@ pub async fn sync_identifier<C: SyncContext + ?Sized>( | |||
| 416 | "Starting sync for identifier" | 440 | "Starting sync for identifier" |
| 417 | ); | 441 | ); |
| 418 | 442 | ||
| 419 | // Try all non-throttled URLs | 443 | // Try all non-throttled, non-naughty URLs |
| 420 | loop { | 444 | loop { |
| 421 | match sync_identifier_next_url(ctx, identifier, None, &tried_urls, throttle_manager).await { | 445 | match sync_identifier_next_url( |
| 446 | ctx, | ||
| 447 | identifier, | ||
| 448 | None, | ||
| 449 | &tried_urls, | ||
| 450 | throttle_manager, | ||
| 451 | git_naughty_list, | ||
| 452 | ) | ||
| 453 | .await | ||
| 454 | { | ||
| 422 | Some(url) => { | 455 | Some(url) => { |
| 423 | debug!( | 456 | debug!( |
| 424 | identifier = %identifier, | 457 | identifier = %identifier, |
| @@ -481,9 +514,14 @@ pub async fn sync_identifier<C: SyncContext + ?Sized>( | |||
| 481 | } | 514 | } |
| 482 | 515 | ||
| 483 | // Enqueue with any throttled domains that have untried URLs | 516 | // Enqueue with any throttled domains that have untried URLs |
| 484 | let throttled_domains = | 517 | let throttled_domains = get_throttled_domains_with_untried_urls( |
| 485 | get_throttled_domains_with_untried_urls(ctx, identifier, &tried_urls, throttle_manager) | 518 | ctx, |
| 486 | .await; | 519 | identifier, |
| 520 | &tried_urls, | ||
| 521 | throttle_manager, | ||
| 522 | git_naughty_list, | ||
| 523 | ) | ||
| 524 | .await; | ||
| 487 | 525 | ||
| 488 | for info in throttled_domains { | 526 | for info in throttled_domains { |
| 489 | debug!( | 527 | debug!( |
| @@ -525,15 +563,22 @@ mod tests { | |||
| 525 | 563 | ||
| 526 | // Create throttle manager and throttle github.com | 564 | // Create throttle manager and throttle github.com |
| 527 | let throttle_manager = ThrottleManager::new(1, 100); | 565 | let throttle_manager = ThrottleManager::new(1, 100); |
| 566 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 528 | 567 | ||
| 529 | // Saturate github.com by starting a request | 568 | // Saturate github.com by starting a request |
| 530 | throttle_manager.start_request("github.com"); | 569 | throttle_manager.start_request("github.com"); |
| 531 | 570 | ||
| 532 | // Should return gitlab.com URL since github.com is throttled | 571 | // Should return gitlab.com URL since github.com is throttled |
| 533 | let tried_urls = HashSet::new(); | 572 | let tried_urls = HashSet::new(); |
| 534 | let result = | 573 | let result = sync_identifier_next_url( |
| 535 | sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager) | 574 | &mock, |
| 536 | .await; | 575 | "test-repo", |
| 576 | None, | ||
| 577 | &tried_urls, | ||
| 578 | &throttle_manager, | ||
| 579 | &naughty_list, | ||
| 580 | ) | ||
| 581 | .await; | ||
| 537 | 582 | ||
| 538 | assert!(result.is_some()); | 583 | assert!(result.is_some()); |
| 539 | let url = result.unwrap(); | 584 | let url = result.unwrap(); |
| @@ -556,15 +601,22 @@ mod tests { | |||
| 556 | .with_pending_events(true); | 601 | .with_pending_events(true); |
| 557 | 602 | ||
| 558 | let throttle_manager = ThrottleManager::new(5, 100); | 603 | let throttle_manager = ThrottleManager::new(5, 100); |
| 604 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 559 | 605 | ||
| 560 | // Mark first URL as tried | 606 | // Mark first URL as tried |
| 561 | let mut tried_urls = HashSet::new(); | 607 | let mut tried_urls = HashSet::new(); |
| 562 | tried_urls.insert("https://github.com/foo/bar.git".to_string()); | 608 | tried_urls.insert("https://github.com/foo/bar.git".to_string()); |
| 563 | 609 | ||
| 564 | // Should return the second URL | 610 | // Should return the second URL |
| 565 | let result = | 611 | let result = sync_identifier_next_url( |
| 566 | sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager) | 612 | &mock, |
| 567 | .await; | 613 | "test-repo", |
| 614 | None, | ||
| 615 | &tried_urls, | ||
| 616 | &throttle_manager, | ||
| 617 | &naughty_list, | ||
| 618 | ) | ||
| 619 | .await; | ||
| 568 | 620 | ||
| 569 | assert!(result.is_some()); | 621 | assert!(result.is_some()); |
| 570 | let url = result.unwrap(); | 622 | let url = result.unwrap(); |
| @@ -579,11 +631,18 @@ mod tests { | |||
| 579 | .with_pending_events(false); // No pending events | 631 | .with_pending_events(false); // No pending events |
| 580 | 632 | ||
| 581 | let throttle_manager = ThrottleManager::new(5, 100); | 633 | let throttle_manager = ThrottleManager::new(5, 100); |
| 634 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 582 | let tried_urls = HashSet::new(); | 635 | let tried_urls = HashSet::new(); |
| 583 | 636 | ||
| 584 | let result = | 637 | let result = sync_identifier_next_url( |
| 585 | sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager) | 638 | &mock, |
| 586 | .await; | 639 | "test-repo", |
| 640 | None, | ||
| 641 | &tried_urls, | ||
| 642 | &throttle_manager, | ||
| 643 | &naughty_list, | ||
| 644 | ) | ||
| 645 | .await; | ||
| 587 | 646 | ||
| 588 | assert!(result.is_none()); | 647 | assert!(result.is_none()); |
| 589 | } | 648 | } |
| @@ -596,11 +655,18 @@ mod tests { | |||
| 596 | .with_pending_events(true); | 655 | .with_pending_events(true); |
| 597 | 656 | ||
| 598 | let throttle_manager = ThrottleManager::new(5, 100); | 657 | let throttle_manager = ThrottleManager::new(5, 100); |
| 658 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 599 | let tried_urls = HashSet::new(); | 659 | let tried_urls = HashSet::new(); |
| 600 | 660 | ||
| 601 | let result = | 661 | let result = sync_identifier_next_url( |
| 602 | sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager) | 662 | &mock, |
| 603 | .await; | 663 | "test-repo", |
| 664 | None, | ||
| 665 | &tried_urls, | ||
| 666 | &throttle_manager, | ||
| 667 | &naughty_list, | ||
| 668 | ) | ||
| 669 | .await; | ||
| 604 | 670 | ||
| 605 | assert!(result.is_none()); | 671 | assert!(result.is_none()); |
| 606 | } | 672 | } |
| @@ -617,11 +683,18 @@ mod tests { | |||
| 617 | .with_our_domain("our-relay.com"); | 683 | .with_our_domain("our-relay.com"); |
| 618 | 684 | ||
| 619 | let throttle_manager = ThrottleManager::new(5, 100); | 685 | let throttle_manager = ThrottleManager::new(5, 100); |
| 686 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 620 | let tried_urls = HashSet::new(); | 687 | let tried_urls = HashSet::new(); |
| 621 | 688 | ||
| 622 | let result = | 689 | let result = sync_identifier_next_url( |
| 623 | sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager) | 690 | &mock, |
| 624 | .await; | 691 | "test-repo", |
| 692 | None, | ||
| 693 | &tried_urls, | ||
| 694 | &throttle_manager, | ||
| 695 | &naughty_list, | ||
| 696 | ) | ||
| 697 | .await; | ||
| 625 | 698 | ||
| 626 | assert!(result.is_some()); | 699 | assert!(result.is_some()); |
| 627 | let url = result.unwrap(); | 700 | let url = result.unwrap(); |
| @@ -643,6 +716,7 @@ mod tests { | |||
| 643 | .with_pending_events(true); | 716 | .with_pending_events(true); |
| 644 | 717 | ||
| 645 | let throttle_manager = ThrottleManager::new(5, 100); | 718 | let throttle_manager = ThrottleManager::new(5, 100); |
| 719 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 646 | let tried_urls = HashSet::new(); | 720 | let tried_urls = HashSet::new(); |
| 647 | 721 | ||
| 648 | // Request specific domain | 722 | // Request specific domain |
| @@ -652,6 +726,7 @@ mod tests { | |||
| 652 | Some("gitlab.com"), | 726 | Some("gitlab.com"), |
| 653 | &tried_urls, | 727 | &tried_urls, |
| 654 | &throttle_manager, | 728 | &throttle_manager, |
| 729 | &naughty_list, | ||
| 655 | ) | 730 | ) |
| 656 | .await; | 731 | .await; |
| 657 | 732 | ||
| @@ -757,6 +832,7 @@ mod tests { | |||
| 757 | .with_pending_events(true); | 832 | .with_pending_events(true); |
| 758 | 833 | ||
| 759 | let throttle_manager = ThrottleManager::new(1, 100); | 834 | let throttle_manager = ThrottleManager::new(1, 100); |
| 835 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 760 | 836 | ||
| 761 | // Throttle github.com and gitlab.com | 837 | // Throttle github.com and gitlab.com |
| 762 | throttle_manager.start_request("github.com"); | 838 | throttle_manager.start_request("github.com"); |
| @@ -771,6 +847,7 @@ mod tests { | |||
| 771 | "test-repo", | 847 | "test-repo", |
| 772 | &tried_urls, | 848 | &tried_urls, |
| 773 | &throttle_manager, | 849 | &throttle_manager, |
| 850 | &naughty_list, | ||
| 774 | ) | 851 | ) |
| 775 | .await; | 852 | .await; |
| 776 | 853 | ||
| @@ -803,9 +880,10 @@ mod tests { | |||
| 803 | .url_provides("https://server3.com/repo.git", &["ghi789"]); | 880 | .url_provides("https://server3.com/repo.git", &["ghi789"]); |
| 804 | 881 | ||
| 805 | let throttle_manager = Arc::new(ThrottleManager::new(5, 100)); | 882 | let throttle_manager = Arc::new(ThrottleManager::new(5, 100)); |
| 883 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 806 | 884 | ||
| 807 | // Run sync_identifier | 885 | // Run sync_identifier |
| 808 | let complete = sync_identifier(&mock, "test-repo", &throttle_manager).await; | 886 | let complete = sync_identifier(&mock, "test-repo", &throttle_manager, &naughty_list).await; |
| 809 | 887 | ||
| 810 | // Should return true (sync complete) | 888 | // Should return true (sync complete) |
| 811 | assert!(complete, "Expected sync to complete after trying all URLs"); | 889 | assert!(complete, "Expected sync to complete after trying all URLs"); |
| @@ -841,12 +919,13 @@ mod tests { | |||
| 841 | // Note: gitlab.com doesn't provide any OIDs | 919 | // Note: gitlab.com doesn't provide any OIDs |
| 842 | 920 | ||
| 843 | let throttle_manager = Arc::new(ThrottleManager::new(1, 100)); | 921 | let throttle_manager = Arc::new(ThrottleManager::new(1, 100)); |
| 922 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 844 | 923 | ||
| 845 | // Throttle github.com by starting a request | 924 | // Throttle github.com by starting a request |
| 846 | throttle_manager.start_request("github.com"); | 925 | throttle_manager.start_request("github.com"); |
| 847 | 926 | ||
| 848 | // Run sync_identifier | 927 | // Run sync_identifier |
| 849 | let complete = sync_identifier(&mock, "test-repo", &throttle_manager).await; | 928 | let complete = sync_identifier(&mock, "test-repo", &throttle_manager, &naughty_list).await; |
| 850 | 929 | ||
| 851 | // Should return false (sync incomplete - github.com is throttled) | 930 | // Should return false (sync incomplete - github.com is throttled) |
| 852 | assert!( | 931 | assert!( |
| @@ -911,23 +990,36 @@ mod tests { | |||
| 911 | .with_pending_events(true); | 990 | .with_pending_events(true); |
| 912 | 991 | ||
| 913 | let throttle_manager = ThrottleManager::new(5, 100); | 992 | let throttle_manager = ThrottleManager::new(5, 100); |
| 993 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 914 | let tried_urls = HashSet::new(); | 994 | let tried_urls = HashSet::new(); |
| 915 | 995 | ||
| 916 | // Get first URL | 996 | // Get first URL |
| 917 | let first_url = | 997 | let first_url = sync_identifier_next_url( |
| 918 | sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager) | 998 | &mock, |
| 919 | .await | 999 | "test-repo", |
| 920 | .expect("Should return a URL"); | 1000 | None, |
| 1001 | &tried_urls, | ||
| 1002 | &throttle_manager, | ||
| 1003 | &naughty_list, | ||
| 1004 | ) | ||
| 1005 | .await | ||
| 1006 | .expect("Should return a URL"); | ||
| 921 | 1007 | ||
| 922 | // Try the first URL | 1008 | // Try the first URL |
| 923 | let mut tried = HashSet::new(); | 1009 | let mut tried = HashSet::new(); |
| 924 | tried.insert(first_url.clone()); | 1010 | tried.insert(first_url.clone()); |
| 925 | 1011 | ||
| 926 | // Get second URL | 1012 | // Get second URL |
| 927 | let second_url = | 1013 | let second_url = sync_identifier_next_url( |
| 928 | sync_identifier_next_url(&mock, "test-repo", None, &tried, &throttle_manager) | 1014 | &mock, |
| 929 | .await | 1015 | "test-repo", |
| 930 | .expect("Should return a second URL"); | 1016 | None, |
| 1017 | &tried, | ||
| 1018 | &throttle_manager, | ||
| 1019 | &naughty_list, | ||
| 1020 | ) | ||
| 1021 | .await | ||
| 1022 | .expect("Should return a second URL"); | ||
| 931 | 1023 | ||
| 932 | // Both URLs should be available (one from announcement, one from PR) | 1024 | // Both URLs should be available (one from announcement, one from PR) |
| 933 | let both_urls = [first_url, second_url]; | 1025 | let both_urls = [first_url, second_url]; |
| @@ -955,12 +1047,20 @@ mod tests { | |||
| 955 | .with_pending_events(true); | 1047 | .with_pending_events(true); |
| 956 | 1048 | ||
| 957 | let throttle_manager = ThrottleManager::new(5, 100); | 1049 | let throttle_manager = ThrottleManager::new(5, 100); |
| 1050 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 958 | let mut tried_urls = HashSet::new(); | 1051 | let mut tried_urls = HashSet::new(); |
| 959 | 1052 | ||
| 960 | // Collect all available URLs | 1053 | // Collect all available URLs |
| 961 | let mut available_urls = Vec::new(); | 1054 | let mut available_urls = Vec::new(); |
| 962 | while let Some(url) = | 1055 | while let Some(url) = sync_identifier_next_url( |
| 963 | sync_identifier_next_url(&mock, "test-repo", None, &tried_urls, &throttle_manager).await | 1056 | &mock, |
| 1057 | "test-repo", | ||
| 1058 | None, | ||
| 1059 | &tried_urls, | ||
| 1060 | &throttle_manager, | ||
| 1061 | &naughty_list, | ||
| 1062 | ) | ||
| 1063 | .await | ||
| 964 | { | 1064 | { |
| 965 | available_urls.push(url.clone()); | 1065 | available_urls.push(url.clone()); |
| 966 | tried_urls.insert(url); | 1066 | tried_urls.insert(url); |
| @@ -1001,6 +1101,7 @@ mod tests { | |||
| 1001 | .with_pending_events(true); | 1101 | .with_pending_events(true); |
| 1002 | 1102 | ||
| 1003 | let throttle_manager = ThrottleManager::new(1, 100); | 1103 | let throttle_manager = ThrottleManager::new(1, 100); |
| 1104 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 1004 | 1105 | ||
| 1005 | // Throttle both domains | 1106 | // Throttle both domains |
| 1006 | throttle_manager.start_request("github.com"); | 1107 | throttle_manager.start_request("github.com"); |
| @@ -1013,6 +1114,7 @@ mod tests { | |||
| 1013 | "test-repo", | 1114 | "test-repo", |
| 1014 | &tried_urls, | 1115 | &tried_urls, |
| 1015 | &throttle_manager, | 1116 | &throttle_manager, |
| 1117 | &naughty_list, | ||
| 1016 | ) | 1118 | ) |
| 1017 | .await; | 1119 | .await; |
| 1018 | 1120 | ||
| @@ -1037,9 +1139,10 @@ mod tests { | |||
| 1037 | // Note: github.com doesn't provide any OIDs | 1139 | // Note: github.com doesn't provide any OIDs |
| 1038 | 1140 | ||
| 1039 | let throttle_manager = Arc::new(ThrottleManager::new(5, 100)); | 1141 | let throttle_manager = Arc::new(ThrottleManager::new(5, 100)); |
| 1142 | let naughty_list = NaughtyListTracker::with_defaults(); | ||
| 1040 | 1143 | ||
| 1041 | // Run sync_identifier | 1144 | // Run sync_identifier |
| 1042 | let complete = sync_identifier(&mock, "test-repo", &throttle_manager).await; | 1145 | let complete = sync_identifier(&mock, "test-repo", &throttle_manager, &naughty_list).await; |
| 1043 | 1146 | ||
| 1044 | // Should complete successfully using PR clone URL | 1147 | // Should complete successfully using PR clone URL |
| 1045 | assert!(complete, "Sync should complete using PR clone URL"); | 1148 | assert!(complete, "Sync should complete using PR clone URL"); |
diff --git a/src/purgatory/sync/loop.rs b/src/purgatory/sync/loop.rs index 92e0594..1ab229d 100644 --- a/src/purgatory/sync/loop.rs +++ b/src/purgatory/sync/loop.rs | |||
| @@ -15,6 +15,7 @@ use tokio::task::JoinHandle; | |||
| 15 | use tracing::{debug, info}; | 15 | use tracing::{debug, info}; |
| 16 | 16 | ||
| 17 | use crate::purgatory::Purgatory; | 17 | use crate::purgatory::Purgatory; |
| 18 | use crate::sync::naughty_list::NaughtyListTracker; | ||
| 18 | 19 | ||
| 19 | use super::context::SyncContext; | 20 | use super::context::SyncContext; |
| 20 | use super::functions::sync_identifier; | 21 | use super::functions::sync_identifier; |
| @@ -37,6 +38,7 @@ impl Purgatory { | |||
| 37 | /// # Arguments | 38 | /// # Arguments |
| 38 | /// * `ctx` - The sync context providing repository data and fetch capabilities | 39 | /// * `ctx` - The sync context providing repository data and fetch capabilities |
| 39 | /// * `throttle_manager` - Used for rate limiting and domain queue management | 40 | /// * `throttle_manager` - Used for rate limiting and domain queue management |
| 41 | /// * `git_naughty_list` - Tracker for git remote domains with persistent errors | ||
| 40 | /// | 42 | /// |
| 41 | /// # Returns | 43 | /// # Returns |
| 42 | /// A `JoinHandle` for the background task (can be used to cancel the loop) | 44 | /// A `JoinHandle` for the background task (can be used to cancel the loop) |
| @@ -47,12 +49,13 @@ impl Purgatory { | |||
| 47 | /// let purgatory = Arc::new(Purgatory::new("/data/git")); | 49 | /// let purgatory = Arc::new(Purgatory::new("/data/git")); |
| 48 | /// let ctx = Arc::new(RealSyncContext::new(...)); | 50 | /// let ctx = Arc::new(RealSyncContext::new(...)); |
| 49 | /// let throttle_manager = Arc::new(ThrottleManager::new(5, 30)); | 51 | /// let throttle_manager = Arc::new(ThrottleManager::new(5, 30)); |
| 52 | /// let git_naughty_list = Arc::new(NaughtyListTracker::with_defaults()); | ||
| 50 | /// | 53 | /// |
| 51 | /// // Set context on throttle manager for queue processing | 54 | /// // Set context on throttle manager for queue processing |
| 52 | /// throttle_manager.set_context(ctx.clone()); | 55 | /// throttle_manager.set_context(ctx.clone()); |
| 53 | /// | 56 | /// |
| 54 | /// // Start the sync loop | 57 | /// // Start the sync loop |
| 55 | /// let handle = purgatory.start_sync_loop(ctx, throttle_manager); | 58 | /// let handle = purgatory.start_sync_loop(ctx, throttle_manager, git_naughty_list); |
| 56 | /// | 59 | /// |
| 57 | /// // Later, to stop the loop: | 60 | /// // Later, to stop the loop: |
| 58 | /// handle.abort(); | 61 | /// handle.abort(); |
| @@ -61,6 +64,7 @@ impl Purgatory { | |||
| 61 | self: Arc<Self>, | 64 | self: Arc<Self>, |
| 62 | ctx: Arc<dyn SyncContext>, | 65 | ctx: Arc<dyn SyncContext>, |
| 63 | throttle_manager: Arc<ThrottleManager>, | 66 | throttle_manager: Arc<ThrottleManager>, |
| 67 | git_naughty_list: Arc<NaughtyListTracker>, | ||
| 64 | ) -> JoinHandle<()> { | 68 | ) -> JoinHandle<()> { |
| 65 | info!( | 69 | info!( |
| 66 | "Starting purgatory sync loop (interval: {:?})", | 70 | "Starting purgatory sync loop (interval: {:?})", |
| @@ -121,6 +125,7 @@ impl Purgatory { | |||
| 121 | let purgatory = self.clone(); | 125 | let purgatory = self.clone(); |
| 122 | let ctx = ctx.clone(); | 126 | let ctx = ctx.clone(); |
| 123 | let throttle_manager = throttle_manager.clone(); | 127 | let throttle_manager = throttle_manager.clone(); |
| 128 | let git_naughty_list = git_naughty_list.clone(); | ||
| 124 | let id = identifier.clone(); | 129 | let id = identifier.clone(); |
| 125 | 130 | ||
| 126 | tokio::spawn(async move { | 131 | tokio::spawn(async move { |
| @@ -129,7 +134,13 @@ impl Purgatory { | |||
| 129 | "Starting sync task for identifier" | 134 | "Starting sync task for identifier" |
| 130 | ); | 135 | ); |
| 131 | 136 | ||
| 132 | let complete = sync_identifier(ctx.as_ref(), &id, &throttle_manager).await; | 137 | let complete = sync_identifier( |
| 138 | ctx.as_ref(), | ||
| 139 | &id, | ||
| 140 | &throttle_manager, | ||
| 141 | git_naughty_list.as_ref(), | ||
| 142 | ) | ||
| 143 | .await; | ||
| 133 | 144 | ||
| 134 | // Check final state and update queue | 145 | // Check final state and update queue |
| 135 | if complete || !purgatory.has_pending_events(&id) { | 146 | if complete || !purgatory.has_pending_events(&id) { |
diff --git a/src/purgatory/sync/throttle.rs b/src/purgatory/sync/throttle.rs index ad6e8ea..7f8f636 100644 --- a/src/purgatory/sync/throttle.rs +++ b/src/purgatory/sync/throttle.rs | |||
| @@ -24,6 +24,7 @@ use tracing::debug; | |||
| 24 | 24 | ||
| 25 | use super::context::SyncContext; | 25 | use super::context::SyncContext; |
| 26 | use super::functions::{sync_identifier_from_url, sync_identifier_next_url}; | 26 | use super::functions::{sync_identifier_from_url, sync_identifier_next_url}; |
| 27 | use crate::sync::naughty_list::NaughtyListTracker; | ||
| 27 | 28 | ||
| 28 | /// State for an identifier waiting in a domain's queue. | 29 | /// State for an identifier waiting in a domain's queue. |
| 29 | /// | 30 | /// |
| @@ -265,6 +266,10 @@ pub struct ThrottleManager { | |||
| 265 | /// Sync context for processing queued identifiers. | 266 | /// Sync context for processing queued identifiers. |
| 266 | /// Set once at startup via `set_context()`. | 267 | /// Set once at startup via `set_context()`. |
| 267 | ctx: OnceLock<Arc<dyn SyncContext>>, | 268 | ctx: OnceLock<Arc<dyn SyncContext>>, |
| 269 | |||
| 270 | /// Naughty list tracker for git remote domains with persistent errors. | ||
| 271 | /// Set once at startup via `set_git_naughty_list()`. | ||
| 272 | git_naughty_list: OnceLock<Arc<NaughtyListTracker>>, | ||
| 268 | } | 273 | } |
| 269 | 274 | ||
| 270 | impl ThrottleManager { | 275 | impl ThrottleManager { |
| @@ -279,6 +284,7 @@ impl ThrottleManager { | |||
| 279 | max_concurrent_per_domain: max_concurrent, | 284 | max_concurrent_per_domain: max_concurrent, |
| 280 | max_per_minute_per_domain: max_per_minute, | 285 | max_per_minute_per_domain: max_per_minute, |
| 281 | ctx: OnceLock::new(), | 286 | ctx: OnceLock::new(), |
| 287 | git_naughty_list: OnceLock::new(), | ||
| 282 | } | 288 | } |
| 283 | } | 289 | } |
| 284 | 290 | ||
| @@ -294,6 +300,17 @@ impl ThrottleManager { | |||
| 294 | let _ = self.ctx.set(ctx); | 300 | let _ = self.ctx.set(ctx); |
| 295 | } | 301 | } |
| 296 | 302 | ||
| 303 | /// Set the git naughty list tracker (called once at startup). | ||
| 304 | /// | ||
| 305 | /// The naughty list is used to filter out domains with persistent errors | ||
| 306 | /// during URL selection. | ||
| 307 | /// | ||
| 308 | /// # Arguments | ||
| 309 | /// * `git_naughty_list` - The naughty list tracker | ||
| 310 | pub fn set_git_naughty_list(&self, git_naughty_list: Arc<NaughtyListTracker>) { | ||
| 311 | let _ = self.git_naughty_list.set(git_naughty_list); | ||
| 312 | } | ||
| 313 | |||
| 297 | /// Check if a domain is currently throttled (at capacity). | 314 | /// Check if a domain is currently throttled (at capacity). |
| 298 | /// | 315 | /// |
| 299 | /// Returns true if the domain has no capacity for another request, | 316 | /// Returns true if the domain has no capacity for another request, |
| @@ -479,10 +496,22 @@ impl ThrottleManager { | |||
| 479 | .unwrap_or_default() | 496 | .unwrap_or_default() |
| 480 | }; | 497 | }; |
| 481 | 498 | ||
| 499 | // Get naughty list (should be set at startup) | ||
| 500 | let naughty_list = self | ||
| 501 | .git_naughty_list | ||
| 502 | .get() | ||
| 503 | .expect("git_naughty_list not set"); | ||
| 504 | |||
| 482 | // Get next URL for this identifier on this specific domain | 505 | // Get next URL for this identifier on this specific domain |
| 483 | let url = | 506 | let url = sync_identifier_next_url( |
| 484 | sync_identifier_next_url(ctx.as_ref(), identifier, Some(domain), &tried_urls, self) | 507 | ctx.as_ref(), |
| 485 | .await; | 508 | identifier, |
| 509 | Some(domain), | ||
| 510 | &tried_urls, | ||
| 511 | self, | ||
| 512 | naughty_list.as_ref(), | ||
| 513 | ) | ||
| 514 | .await; | ||
| 486 | 515 | ||
| 487 | match url { | 516 | match url { |
| 488 | Some(url) => { | 517 | Some(url) => { |
diff --git a/src/sync/naughty_list.rs b/src/sync/naughty_list.rs index 311b9bb..35fcc0f 100644 --- a/src/sync/naughty_list.rs +++ b/src/sync/naughty_list.rs | |||
| @@ -1,8 +1,9 @@ | |||
| 1 | //! Naughty List Tracker for Relays with Persistent Infrastructure Issues | 1 | //! Naughty List Tracker for Remote Servers with Persistent Infrastructure Issues |
| 2 | //! | 2 | //! |
| 3 | //! This module tracks relays with persistent configuration/infrastructure problems | 3 | //! This module tracks remote servers (Nostr relays and git remote domains) with |
| 4 | //! (DNS failures, TLS certificate errors, protocol violations) separately from | 4 | //! persistent configuration/infrastructure problems (DNS failures, TLS certificate |
| 5 | //! transient network issues (timeouts, connection refused). | 5 | //! errors, protocol violations) separately from transient network issues (timeouts, |
| 6 | //! connection refused). | ||
| 6 | //! | 7 | //! |
| 7 | //! ## Failure Classification | 8 | //! ## Failure Classification |
| 8 | //! | 9 | //! |
| @@ -23,14 +24,14 @@ | |||
| 23 | use dashmap::DashMap; | 24 | use dashmap::DashMap; |
| 24 | use std::time::Instant; | 25 | use std::time::Instant; |
| 25 | 26 | ||
| 26 | /// Category of persistent relay failure that qualifies for the naughty list | 27 | /// Category of persistent remote server failure that qualifies for the naughty list |
| 27 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] | 28 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] |
| 28 | pub enum NaughtyCategory { | 29 | pub enum NaughtyCategory { |
| 29 | /// DNS lookup failures (domain doesn't resolve) | 30 | /// DNS lookup failures (domain doesn't resolve) |
| 30 | DnsLookupFailed, | 31 | DnsLookupFailed, |
| 31 | /// TLS certificate errors (expired, invalid, mismatch) | 32 | /// TLS certificate errors (expired, invalid, mismatch) |
| 32 | TlsCertificateInvalid, | 33 | TlsCertificateInvalid, |
| 33 | /// WebSocket or Nostr protocol violations | 34 | /// WebSocket or Nostr protocol violations (relay-specific, won't trigger for git) |
| 34 | ProtocolError, | 35 | ProtocolError, |
| 35 | } | 36 | } |
| 36 | 37 | ||
| @@ -51,7 +52,7 @@ impl std::fmt::Display for NaughtyCategory { | |||
| 51 | } | 52 | } |
| 52 | } | 53 | } |
| 53 | 54 | ||
| 54 | /// Naughty list entry for a relay with persistent issues | 55 | /// Naughty list entry for a remote server (relay URL or git domain) with persistent issues |
| 55 | #[derive(Debug, Clone)] | 56 | #[derive(Debug, Clone)] |
| 56 | pub struct NaughtyEntry { | 57 | pub struct NaughtyEntry { |
| 57 | /// Category of the persistent failure | 58 | /// Category of the persistent failure |
| @@ -66,15 +67,19 @@ pub struct NaughtyEntry { | |||
| 66 | pub occurrence_count: u32, | 67 | pub occurrence_count: u32, |
| 67 | } | 68 | } |
| 68 | 69 | ||
| 69 | /// Tracks relays with persistent infrastructure/configuration issues | 70 | /// Tracks remote servers with persistent infrastructure/configuration issues |
| 71 | /// | ||
| 72 | /// Used for both: | ||
| 73 | /// - Nostr relay URLs (e.g., "wss://relay.example.com") | ||
| 74 | /// - Git remote domains (e.g., "git.example.com") | ||
| 70 | /// | 75 | /// |
| 71 | /// Separate from HealthTracker's backoff logic - this is specifically for | 76 | /// Separate from HealthTracker's backoff logic - this is specifically for |
| 72 | /// relays with configuration problems that are unlikely to be fixed quickly. | 77 | /// servers with configuration problems that are unlikely to be fixed quickly. |
| 73 | #[derive(Debug)] | 78 | #[derive(Debug)] |
| 74 | pub struct NaughtyListTracker { | 79 | pub struct NaughtyListTracker { |
| 75 | /// Map of relay URL to naughty entry | 80 | /// Map of relay URL or git domain to naughty entry |
| 76 | entries: DashMap<String, NaughtyEntry>, | 81 | entries: DashMap<String, NaughtyEntry>, |
| 77 | /// How many hours before removing a relay from the naughty list | 82 | /// How many hours before removing a server from the naughty list |
| 78 | expiration_hours: u64, | 83 | expiration_hours: u64, |
| 79 | } | 84 | } |
| 80 | 85 | ||
| @@ -147,21 +152,26 @@ impl NaughtyListTracker { | |||
| 147 | None | 152 | None |
| 148 | } | 153 | } |
| 149 | 154 | ||
| 150 | /// Record a naughty relay (adds new entry or updates existing) | 155 | /// Record a naughty server (adds new entry or updates existing) |
| 151 | /// | 156 | /// |
| 152 | /// # Arguments | 157 | /// # Arguments |
| 153 | /// | 158 | /// |
| 154 | /// * `relay_url` - The relay URL | 159 | /// * `server_url_or_domain` - The relay URL or git domain |
| 155 | /// * `category` - The naughty category | 160 | /// * `category` - The naughty category |
| 156 | /// * `reason` - The full error message | 161 | /// * `reason` - The full error message |
| 157 | /// | 162 | /// |
| 158 | /// # Returns | 163 | /// # Returns |
| 159 | /// | 164 | /// |
| 160 | /// `true` if this is a new naughty entry (first occurrence), `false` if updating existing | 165 | /// `true` if this is a new naughty entry (first occurrence), `false` if updating existing |
| 161 | pub fn record(&self, relay_url: &str, category: NaughtyCategory, reason: String) -> bool { | 166 | pub fn record( |
| 167 | &self, | ||
| 168 | server_url_or_domain: &str, | ||
| 169 | category: NaughtyCategory, | ||
| 170 | reason: String, | ||
| 171 | ) -> bool { | ||
| 162 | let now = Instant::now(); | 172 | let now = Instant::now(); |
| 163 | 173 | ||
| 164 | if let Some(mut entry) = self.entries.get_mut(relay_url) { | 174 | if let Some(mut entry) = self.entries.get_mut(server_url_or_domain) { |
| 165 | // Update existing entry | 175 | // Update existing entry |
| 166 | entry.last_seen = now; | 176 | entry.last_seen = now; |
| 167 | entry.occurrence_count = entry.occurrence_count.saturating_add(1); | 177 | entry.occurrence_count = entry.occurrence_count.saturating_add(1); |
| @@ -170,7 +180,7 @@ impl NaughtyListTracker { | |||
| 170 | } else { | 180 | } else { |
| 171 | // Create new entry | 181 | // Create new entry |
| 172 | self.entries.insert( | 182 | self.entries.insert( |
| 173 | relay_url.to_string(), | 183 | server_url_or_domain.to_string(), |
| 174 | NaughtyEntry { | 184 | NaughtyEntry { |
| 175 | category, | 185 | category, |
| 176 | reason, | 186 | reason, |
| @@ -183,17 +193,17 @@ impl NaughtyListTracker { | |||
| 183 | } | 193 | } |
| 184 | } | 194 | } |
| 185 | 195 | ||
| 186 | /// Check if a relay is on the naughty list (not expired) | 196 | /// Check if a server is on the naughty list (not expired) |
| 187 | /// | 197 | /// |
| 188 | /// # Arguments | 198 | /// # Arguments |
| 189 | /// | 199 | /// |
| 190 | /// * `relay_url` - The relay URL to check | 200 | /// * `server_url_or_domain` - The relay URL or git domain to check |
| 191 | /// | 201 | /// |
| 192 | /// # Returns | 202 | /// # Returns |
| 193 | /// | 203 | /// |
| 194 | /// `true` if the relay is currently on the naughty list | 204 | /// `true` if the server is currently on the naughty list |
| 195 | pub fn is_naughty(&self, relay_url: &str) -> bool { | 205 | pub fn is_naughty(&self, server_url_or_domain: &str) -> bool { |
| 196 | if let Some(entry) = self.entries.get(relay_url) { | 206 | if let Some(entry) = self.entries.get(server_url_or_domain) { |
| 197 | let age = Instant::now().duration_since(entry.first_seen); | 207 | let age = Instant::now().duration_since(entry.first_seen); |
| 198 | let expiration = std::time::Duration::from_secs(self.expiration_hours * 3600); | 208 | let expiration = std::time::Duration::from_secs(self.expiration_hours * 3600); |
| 199 | age < expiration | 209 | age < expiration |
| @@ -206,23 +216,23 @@ impl NaughtyListTracker { | |||
| 206 | /// | 216 | /// |
| 207 | /// # Arguments | 217 | /// # Arguments |
| 208 | /// | 218 | /// |
| 209 | /// * `relay_url` - The relay URL to look up | 219 | /// * `server_url_or_domain` - The relay URL or git domain to look up |
| 210 | /// | 220 | /// |
| 211 | /// # Returns | 221 | /// # Returns |
| 212 | /// | 222 | /// |
| 213 | /// A cloned `NaughtyEntry` if the relay is on the naughty list and not expired | 223 | /// A cloned `NaughtyEntry` if the server is on the naughty list and not expired |
| 214 | pub fn get_entry(&self, relay_url: &str) -> Option<NaughtyEntry> { | 224 | pub fn get_entry(&self, server_url_or_domain: &str) -> Option<NaughtyEntry> { |
| 215 | self.entries.get(relay_url).map(|e| e.clone()) | 225 | self.entries.get(server_url_or_domain).map(|e| e.clone()) |
| 216 | } | 226 | } |
| 217 | 227 | ||
| 218 | /// Remove expired entries from the naughty list | 228 | /// Remove expired entries from the naughty list |
| 219 | /// | 229 | /// |
| 220 | /// Entries older than `expiration_hours` are removed to allow relays | 230 | /// Entries older than `expiration_hours` are removed to allow servers |
| 221 | /// to be retried after infrastructure issues are potentially fixed. | 231 | /// to be retried after infrastructure issues are potentially fixed. |
| 222 | /// | 232 | /// |
| 223 | /// # Returns | 233 | /// # Returns |
| 224 | /// | 234 | /// |
| 225 | /// Vector of relay URLs that were removed from the naughty list | 235 | /// Vector of server URLs/domains that were removed from the naughty list |
| 226 | pub fn expire_old_entries(&self) -> Vec<String> { | 236 | pub fn expire_old_entries(&self) -> Vec<String> { |
| 227 | let now = Instant::now(); | 237 | let now = Instant::now(); |
| 228 | let expiration = std::time::Duration::from_secs(self.expiration_hours * 3600); | 238 | let expiration = std::time::Duration::from_secs(self.expiration_hours * 3600); |
| @@ -242,11 +252,11 @@ impl NaughtyListTracker { | |||
| 242 | expired | 252 | expired |
| 243 | } | 253 | } |
| 244 | 254 | ||
| 245 | /// Get all naughty relays (for metrics and monitoring) | 255 | /// Get all naughty servers (for metrics and monitoring) |
| 246 | /// | 256 | /// |
| 247 | /// # Returns | 257 | /// # Returns |
| 248 | /// | 258 | /// |
| 249 | /// Vector of (relay_url, entry) tuples for all relays currently on the naughty list | 259 | /// Vector of (server_url_or_domain, entry) tuples for all servers currently on the naughty list |
| 250 | pub fn get_all(&self) -> Vec<(String, NaughtyEntry)> { | 260 | pub fn get_all(&self) -> Vec<(String, NaughtyEntry)> { |
| 251 | self.entries | 261 | self.entries |
| 252 | .iter() | 262 | .iter() |
| @@ -254,7 +264,7 @@ impl NaughtyListTracker { | |||
| 254 | .collect() | 264 | .collect() |
| 255 | } | 265 | } |
| 256 | 266 | ||
| 257 | /// Get the count of relays in a specific category | 267 | /// Get the count of servers in a specific category |
| 258 | /// | 268 | /// |
| 259 | /// # Arguments | 269 | /// # Arguments |
| 260 | /// | 270 | /// |
| @@ -262,7 +272,7 @@ impl NaughtyListTracker { | |||
| 262 | /// | 272 | /// |
| 263 | /// # Returns | 273 | /// # Returns |
| 264 | /// | 274 | /// |
| 265 | /// Number of relays in the specified category | 275 | /// Number of servers in the specified category |
| 266 | pub fn count_by_category(&self, category: NaughtyCategory) -> usize { | 276 | pub fn count_by_category(&self, category: NaughtyCategory) -> usize { |
| 267 | self.entries | 277 | self.entries |
| 268 | .iter() | 278 | .iter() |
| @@ -270,7 +280,7 @@ impl NaughtyListTracker { | |||
| 270 | .count() | 280 | .count() |
| 271 | } | 281 | } |
| 272 | 282 | ||
| 273 | /// Get total number of relays on the naughty list | 283 | /// Get total number of servers on the naughty list |
| 274 | pub fn total_count(&self) -> usize { | 284 | pub fn total_count(&self) -> usize { |
| 275 | self.entries.len() | 285 | self.entries.len() |
| 276 | } | 286 | } |