pacsea/sources/news/
fetch.rs

1//! News fetching functionality with HTTP client and error handling.
2
3use crate::sources::news::cache::{ARTICLE_CACHE, ARTICLE_CACHE_TTL_SECONDS, ArticleCacheEntry};
4use crate::sources::news::parse::parse_arch_news_html;
5use crate::sources::news::utils::is_archlinux_url;
6use crate::sources::news::{
7    aur::extract_aur_pkg_from_url,
8    cache::{load_article_entry_from_disk_cache, save_article_to_disk_cache},
9    utils::is_arch_package_url,
10};
11use crate::state::NewsItem;
12use reqwest;
13use std::sync::LazyLock;
14use std::time::{Duration, Instant};
15use tracing::{info, warn};
16
/// Result type alias for Arch Linux news fetching operations.
///
/// Re-exports the parent module's `Result`; the error side accepts boxed
/// dynamic errors (see the `Box::<dyn std::error::Error + Send + Sync>`
/// and `"…".into()` call sites below).
type Result<T> = super::Result<T>;
19
20/// What: Extract cache path from an official package URL.
21///
22/// Inputs:
23/// - `url`: The official package URL.
24///
25/// Output:
26/// - `Some(PathBuf)` if URL is valid; `None` otherwise.
27///
28/// Details:
29/// - Parses URL format: `https://archlinux.org/packages/{repo}/{arch}/{name}/`
30/// - Handles query parameters and fragments in the name.
31fn extract_official_package_cache_path(url: &str) -> Option<std::path::PathBuf> {
32    let lower = url.to_ascii_lowercase();
33    let pos = lower.find("archlinux.org/packages/")?;
34    let after = &url[pos + "archlinux.org/packages/".len()..];
35    let parts: Vec<&str> = after.split('/').filter(|s| !s.is_empty()).collect();
36    if parts.len() >= 3 {
37        let repo = parts[0];
38        let arch = parts[1];
39        let name = parts[2]
40            .split('?')
41            .next()
42            .unwrap_or(parts[2])
43            .split('#')
44            .next()
45            .unwrap_or(parts[2]);
46        Some(crate::sources::official_json_cache_path(repo, arch, name))
47    } else {
48        None
49    }
50}
51
52/// What: Prepend official package JSON changes to content if available.
53///
54/// Inputs:
55/// - `url`: The official package URL.
56/// - `content`: The content to prepend changes to.
57///
58/// Output:
59/// - Content with changes prepended if available; original content otherwise.
60///
61/// Details:
62/// - Only modifies content if changes are detected and not already present.
63fn prepend_official_package_changes(url: &str, content: &str) -> String {
64    let Some(cache_path) = extract_official_package_cache_path(url) else {
65        return content.to_string();
66    };
67
68    let Some(cached_json) = crate::sources::load_official_json_cache(&cache_path) else {
69        return content.to_string();
70    };
71
72    let pkg_obj = cached_json.get("pkg").unwrap_or(&cached_json);
73
74    let Some(pkg_name) = pkg_obj.get("pkgname").and_then(serde_json::Value::as_str) else {
75        return content.to_string();
76    };
77
78    let Some(changes) = crate::sources::get_official_json_changes(pkg_name) else {
79        return content.to_string();
80    };
81
82    if content.starts_with("Changes detected") {
83        content.to_string()
84    } else {
85        format!("{changes}\n\n─── Package Info ───\n\n{content}")
86    }
87}
88
89/// Shared HTTP client with connection pooling for news content fetching.
90/// Connection pooling is enabled by default in `reqwest::Client`.
91/// Uses browser-like headers to work with archlinux.org's `DDoS` protection.
92static HTTP_CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
93    use reqwest::header::{ACCEPT, ACCEPT_LANGUAGE, HeaderMap, HeaderValue};
94    let mut headers = HeaderMap::new();
95    // Browser-like Accept header
96    headers.insert(
97        ACCEPT,
98        HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
99    );
100    // Accept-Language header for completeness
101    headers.insert(ACCEPT_LANGUAGE, HeaderValue::from_static("en-US,en;q=0.5"));
102    reqwest::Client::builder()
103        .connect_timeout(Duration::from_secs(15))
104        .timeout(Duration::from_secs(30))
105        // Firefox-like User-Agent with Pacsea identifier for transparency
106        .user_agent(format!(
107            "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0 Pacsea/{}",
108            env!("CARGO_PKG_VERSION")
109        ))
110        .default_headers(headers)
111        .build()
112        .expect("Failed to create HTTP client")
113});
114
115/// What: Fetch recent Arch Linux news items with optional early date filtering.
116///
117/// Input:
118/// - `limit`: Maximum number of items to return (best-effort)
119/// - `cutoff_date`: Optional date string (YYYY-MM-DD) for early filtering
120///
121/// Output: `Ok(Vec<NewsItem>)` with date/title/url; `Err` on network or parse failures
122///
123/// # Errors
124/// - Returns `Err` when network request fails (curl execution error)
125/// - Returns `Err` when RSS feed cannot be fetched from Arch Linux website
126/// - Returns `Err` when response body cannot be decoded as UTF-8
127///
128/// Details: Downloads the Arch Linux news RSS feed and iteratively parses `<item>` blocks,
129/// extracting `<title>`, `<link>`, and `<pubDate>`. The `pubDate` value is normalized to a
130/// date-only form via `strip_time_and_tz`. If `cutoff_date` is provided, stops fetching when
131/// items exceed the date limit.
132pub async fn fetch_arch_news(limit: usize, cutoff_date: Option<&str>) -> Result<Vec<NewsItem>> {
133    use crate::sources::news::utils::{extract_between, strip_time_and_tz};
134
135    let url = "https://archlinux.org/feeds/news/";
136    // Use shorter timeout (10s connect, 15s max) to avoid blocking on slow/unreachable servers
137    let body = tokio::task::spawn_blocking(move || {
138        crate::util::curl::curl_text_with_args(
139            url,
140            &["--connect-timeout", "10", "--max-time", "15"],
141        )
142    })
143    .await?
144    .map_err(|e| {
145        warn!(error = %e, "failed to fetch arch news feed");
146        e
147    })?;
148    info!(bytes = body.len(), "fetched arch news feed");
149    let mut items: Vec<NewsItem> = Vec::new();
150    let mut pos = 0;
151    while items.len() < limit {
152        if let Some(start) = body[pos..].find("<item>") {
153            let s = pos + start;
154            let end = body[s..].find("</item>").map_or(body.len(), |e| s + e + 7);
155            let chunk = &body[s..end];
156            let title = extract_between(chunk, "<title>", "</title>").unwrap_or_default();
157            let link = extract_between(chunk, "<link>", "</link>").unwrap_or_default();
158            let raw_date = extract_between(chunk, "<pubDate>", "</pubDate>")
159                .map(|d| d.trim().to_string())
160                .unwrap_or_default();
161            let date = strip_time_and_tz(&raw_date);
162            // Early date filtering: stop if item is older than cutoff_date
163            if let Some(cutoff) = cutoff_date
164                && date.as_str() < cutoff
165            {
166                break;
167            }
168            items.push(NewsItem {
169                date,
170                title,
171                url: link,
172            });
173            pos = end;
174        } else {
175            break;
176        }
177    }
178    info!(count = items.len(), "parsed arch news feed");
179    Ok(items)
180}
181
182/// What: Fetch the full article content from an Arch news URL.
183///
184/// Inputs:
185/// - `url`: The news article URL (e.g., `https://archlinux.org/news/...`)
186///
187/// Output:
188/// - `Ok(String)` with the article text content; `Err` on network/parse failure.
189///
190/// # Errors
191/// - Network fetch failures
192/// - HTML parsing failures
193///
194/// Details:
195/// - For AUR package URLs, fetches and renders AUR comments instead.
196/// - For Arch news URLs, checks cache first (15-minute in-memory, 14-day disk TTL).
197/// - Applies rate limiting for archlinux.org URLs to prevent aggressive fetching.
198/// - Fetches the HTML page and extracts content from the article body.
199/// - Strips HTML tags and normalizes whitespace.
200/// - Caches successful fetches in both in-memory and disk caches.
201pub async fn fetch_news_content(url: &str) -> Result<String> {
202    use crate::sources::news::aur::render_aur_comments;
203
204    if let Some(pkg) = extract_aur_pkg_from_url(url) {
205        // Check for JSON changes first
206        let changes = crate::sources::get_aur_json_changes(&pkg);
207        let comments = crate::sources::fetch_aur_comments(pkg.clone()).await?;
208        let mut rendered = render_aur_comments(&pkg, &comments);
209
210        // Prepend JSON changes if available
211        if let Some(changes_text) = changes {
212            rendered = format!("{changes_text}\n\n─── AUR Comments ───\n\n{rendered}");
213        }
214
215        return Ok(rendered);
216    }
217
218    // Check for official package URL and load cached JSON to get package name and changes
219    if is_arch_package_url(url)
220        && let Ok(cache) = ARTICLE_CACHE.lock()
221        && let Some(entry) = cache.get(url)
222        && entry.timestamp.elapsed().as_secs() < ARTICLE_CACHE_TTL_SECONDS
223    {
224        let content = prepend_official_package_changes(url, &entry.content);
225        return Ok(content);
226    }
227
228    // 1. Check in-memory cache first (fastest, 15-minute TTL)
229    let cached_entry: Option<ArticleCacheEntry> = if let Ok(cache) = ARTICLE_CACHE.lock()
230        && let Some(entry) = cache.get(url)
231        && entry.timestamp.elapsed().as_secs() < ARTICLE_CACHE_TTL_SECONDS
232    {
233        info!(url, "using in-memory cached article content");
234        return Ok(entry.content.clone());
235    } else {
236        None
237    };
238
239    // 2. Check disk cache (14-day TTL) - useful after app restart
240    let disk_entry = load_article_entry_from_disk_cache(url);
241    if let Some(ref entry) = disk_entry {
242        // Populate in-memory cache from disk
243        if let Ok(mut cache) = ARTICLE_CACHE.lock() {
244            cache.insert(
245                url.to_string(),
246                ArticleCacheEntry {
247                    content: entry.content.clone(),
248                    timestamp: Instant::now(),
249                    etag: entry.etag.clone(),
250                    last_modified: entry.last_modified.clone(),
251                },
252            );
253        }
254        // Check for official package changes and prepend if available
255        if is_arch_package_url(url) {
256            let content = prepend_official_package_changes(url, &entry.content);
257            return Ok(content);
258        }
259        return Ok(entry.content.clone());
260    }
261
262    // 3. Check circuit breaker before making request (no network call)
263    let endpoint_pattern = crate::sources::feeds::extract_endpoint_pattern(url);
264    if let Err(e) = crate::sources::feeds::check_circuit_breaker(&endpoint_pattern) {
265        warn!(url, endpoint_pattern, error = %e, "circuit breaker blocking request");
266        // Try to return cached content if available
267        if let Some(cached) = cached_entry {
268            return Ok(cached.content);
269        }
270        if let Some(disk) = disk_entry {
271            return Ok(disk.content);
272        }
273        return Err(e);
274    }
275
276    // 4. Fetch from network with conditional requests
277    // Get cached ETag/Last-Modified for conditional request
278    let cached_etag = cached_entry
279        .as_ref()
280        .and_then(|e: &ArticleCacheEntry| e.etag.as_ref())
281        .or_else(|| disk_entry.as_ref().and_then(|e| e.etag.as_ref()))
282        .cloned();
283    let cached_last_modified = cached_entry
284        .as_ref()
285        .and_then(|e: &ArticleCacheEntry| e.last_modified.as_ref())
286        .or_else(|| disk_entry.as_ref().and_then(|e| e.last_modified.as_ref()))
287        .cloned();
288
289    // Fetch from network
290    let (body, etag, last_modified) =
291        match fetch_from_network(url, cached_etag, cached_last_modified, &endpoint_pattern).await {
292            Ok(result) => result,
293            Err(e) if e.to_string() == "304 Not Modified" => {
294                // Return cached content on 304
295                if let Some(cached) = cached_entry {
296                    return Ok(cached.content);
297                }
298                if let Some(disk) = disk_entry {
299                    return Ok(disk.content);
300                }
301                warn!(url, "304 response but no cached content available");
302                return Err("304 Not Modified but no cache available".into());
303            }
304            Err(e) => return Err(e),
305        };
306
307    // Extract article content from HTML
308    let content = parse_arch_news_html(&body, Some(url));
309
310    // Prepend official package JSON changes if available
311    let content = if is_arch_package_url(url) {
312        prepend_official_package_changes(url, &content)
313    } else {
314        content
315    };
316
317    let parsed_len = content.len();
318    if parsed_len == 0 {
319        warn!(url, "parsed news content is empty");
320    } else {
321        info!(url, parsed_len, "parsed news content");
322    }
323
324    // 5. Cache the result with ETag/Last-Modified
325    // Save to in-memory cache
326    if let Ok(mut cache) = ARTICLE_CACHE.lock() {
327        cache.insert(
328            url.to_string(),
329            ArticleCacheEntry {
330                content: content.clone(),
331                timestamp: Instant::now(),
332                etag: etag.clone(),
333                last_modified: last_modified.clone(),
334            },
335        );
336    }
337    // Save to disk cache for persistence across restarts
338    save_article_to_disk_cache(url, &content, etag, last_modified);
339
340    Ok(content)
341}
342
343/// What: Fetch content from network with conditional requests.
344///
345/// Inputs:
346/// - `url`: The URL to fetch.
347/// - `cached_etag`: Optional `ETag` from cache.
348/// - `cached_last_modified`: Optional `Last-Modified` from cache.
349/// - `endpoint_pattern`: Endpoint pattern for circuit breaker.
350///
351/// Output:
352/// - `Ok((body, etag, last_modified))` on success.
353/// - `Err` on network or HTTP errors.
354///
355/// Details:
356/// - Applies rate limiting for archlinux.org URLs.
357/// - Uses conditional requests if `ETag`/`Last-Modified` available.
358/// - Handles 304 Not Modified responses.
359async fn fetch_from_network(
360    url: &str,
361    cached_etag: Option<String>,
362    cached_last_modified: Option<String>,
363    endpoint_pattern: &str,
364) -> Result<(String, Option<String>, Option<String>)> {
365    // Apply rate limiting and acquire semaphore for archlinux.org URLs
366    let _permit = if is_archlinux_url(url) {
367        Some(crate::sources::feeds::rate_limit_archlinux().await)
368    } else {
369        None
370    };
371
372    // Fetch from network with conditional requests using reqwest (connection pooling)
373    let client = HTTP_CLIENT.clone();
374    let mut request = client.get(url);
375
376    // Add conditional request headers if we have cached ETag/Last-Modified
377    if let Some(ref etag) = cached_etag {
378        request = request.header("If-None-Match", etag);
379    }
380    if let Some(ref last_mod) = cached_last_modified {
381        request = request.header("If-Modified-Since", last_mod);
382    }
383
384    let http_response = request.send().await.map_err(|e| {
385        warn!(error = %e, url, "failed to fetch news content");
386        crate::sources::feeds::record_circuit_breaker_outcome(endpoint_pattern, false);
387        Box::<dyn std::error::Error + Send + Sync>::from(format!("Network error: {e}"))
388    })?;
389
390    let status = http_response.status();
391    let status_code = status.as_u16();
392
393    // Handle 304 Not Modified
394    if status_code == 304 {
395        info!(
396            url,
397            "server returned 304 Not Modified, using cached content"
398        );
399        return Err("304 Not Modified".into());
400    }
401
402    // Extract ETag and Last-Modified from response headers before consuming body
403    let etag = http_response
404        .headers()
405        .get("etag")
406        .and_then(|h| h.to_str().ok())
407        .map(ToString::to_string);
408    let last_modified = http_response
409        .headers()
410        .get("last-modified")
411        .and_then(|h| h.to_str().ok())
412        .map(ToString::to_string);
413
414    // Check for HTTP errors
415    if status.is_client_error() || status.is_server_error() {
416        crate::sources::feeds::record_circuit_breaker_outcome(endpoint_pattern, false);
417        return Err(handle_http_error(status, status_code, &http_response).into());
418    }
419
420    let body = http_response.text().await.map_err(|e| {
421        warn!(error = %e, url, "failed to read response body");
422        Box::<dyn std::error::Error + Send + Sync>::from(format!("Failed to read response: {e}"))
423    })?;
424
425    info!(url, bytes = body.len(), "fetched news page");
426    crate::sources::feeds::record_circuit_breaker_outcome(endpoint_pattern, true);
427
428    Ok((body, etag, last_modified))
429}
430
431/// What: Handle HTTP error responses and format error messages.
432///
433/// Inputs:
434/// - `status`: HTTP status code object.
435/// - `status_code`: HTTP status code as u16.
436/// - `http_response`: HTTP response object to extract headers.
437///
438/// Output:
439/// - Formatted error message string.
440///
441/// Details:
442/// - Handles 429 (Too Many Requests) and 503 (Service Unavailable) with Retry-After headers.
443/// - Formats generic error messages for other HTTP errors.
444fn handle_http_error(
445    status: reqwest::StatusCode,
446    status_code: u16,
447    http_response: &reqwest::Response,
448) -> String {
449    if status_code == 429 {
450        let mut msg = "HTTP 429 Too Many Requests - rate limited by server".to_string();
451        if let Some(retry_after) = http_response.headers().get("retry-after")
452            && let Ok(retry_str) = retry_after.to_str()
453        {
454            msg.push_str(" (Retry-After: ");
455            msg.push_str(retry_str);
456            msg.push(')');
457        }
458        msg
459    } else if status_code == 503 {
460        let mut msg = "HTTP 503 Service Unavailable".to_string();
461        if let Some(retry_after) = http_response.headers().get("retry-after")
462            && let Ok(retry_str) = retry_after.to_str()
463        {
464            msg.push_str(" (Retry-After: ");
465            msg.push_str(retry_str);
466            msg.push(')');
467        }
468        msg
469    } else {
470        format!("HTTP error: {status}")
471    }
472}