pacsea/sources/
comments.rs

1//! AUR package comments fetching via web scraping.
2
3use scraper::{ElementRef, Html, Selector};
4use std::time::Duration;
5use tracing::debug;
6
7use crate::state::types::AurComment;
8
9/// Result type alias for AUR comments fetching operations.
10type Result<T> = super::Result<T>;
11
12/// Context for extracting comment data from HTML elements.
13struct CommentExtractionContext<'a> {
14    /// Parsed HTML document
15    document: &'a Html,
16    /// Selector for date elements
17    date_selector: &'a Selector,
18    /// Package name for URL construction
19    pkgname: &'a str,
20    /// Full HTML text for pinned detection
21    html_text: &'a str,
22    /// Whether pinned section exists
23    has_pinned_section: bool,
24    /// Position of "Latest Comments" heading
25    latest_comments_pos: Option<usize>,
26}
27
28/// What: Fetch AUR package comments by scraping the AUR package page.
29///
30/// Inputs:
31/// - `pkgname`: Package name to fetch comments for.
32///
33/// Output:
34/// - `Ok(Vec<AurComment>)` with parsed comments sorted by date (latest first); `Err` on failure.
35///
36/// # Errors
37/// - Returns `Err` when network request fails
38/// - Returns `Err` when HTML parsing fails
39/// - Returns `Err` when comment extraction fails
40///
41/// # Panics
42/// - Panics if selector parsing fails in fallback path (should not occur with valid selectors)
43///
44/// Details:
45/// - Fetches HTML from `https://aur.archlinux.org/packages/<pkgname>`
46/// - Uses `scraper` to parse HTML and extract comment elements
47/// - Parses dates to Unix timestamps for sorting
48/// - Sorts comments by date descending (latest first)
49/// - Only works for AUR packages
50pub async fn fetch_aur_comments(pkgname: String) -> Result<Vec<AurComment>> {
51    use reqwest::header::{ACCEPT, ACCEPT_LANGUAGE, HeaderMap, HeaderValue};
52
53    let url = format!("https://aur.archlinux.org/packages/{pkgname}");
54
55    // Create HTTP client with browser-like headers and reasonable timeout.
56    // Increased from 500ms to 5s to handle archlinux.org's DDoS protection delays.
57    let mut headers = HeaderMap::new();
58    headers.insert(
59        ACCEPT,
60        HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
61    );
62    headers.insert(ACCEPT_LANGUAGE, HeaderValue::from_static("en-US,en;q=0.5"));
63    let client = reqwest::Client::builder()
64        .timeout(Duration::from_secs(5))
65        .user_agent(format!(
66            "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0 Pacsea/{}",
67            env!("CARGO_PKG_VERSION")
68        ))
69        .default_headers(headers)
70        .build()
71        .map_err(|e| format!("Failed to create HTTP client: {e}"))?;
72
73    // Fetch HTML
74    let html_text = client
75        .get(&url)
76        .send()
77        .await
78        .map_err(|e| format!("Network error: {e}"))?
79        .text()
80        .await
81        .map_err(|e| format!("Failed to read response: {e}"))?;
82
83    // Parse HTML
84    let document = Html::parse_document(&html_text);
85
86    // AUR comments structure:
87    // - Each comment has an <h4 class="comment-header"> with author and date
88    // - The content is in a following <div class="article-content"> with id "comment-{id}-content"
89    // - Pinned comments appear before "Latest Comments" heading
90    let comment_header_selector = Selector::parse("h4.comment-header")
91        .map_err(|e| format!("Failed to parse comment header selector: {e}"))?;
92
93    let date_selector =
94        Selector::parse("a.date").map_err(|e| format!("Failed to parse date selector: {e}"))?;
95
96    // Find the "Latest Comments" heading to separate pinned from regular comments
97    // Pinned comments appear before this heading
98    let heading_selector = Selector::parse("h3, h2, h4")
99        .map_err(|e| format!("Failed to parse heading selector: {e}"))?;
100
101    // Check if there's a "Pinned Comments" section
102    let has_pinned_section = document.select(&heading_selector).any(|h| {
103        let text: String = h.text().collect();
104        text.contains("Pinned Comments")
105    });
106
107    // Find the "Latest Comments" heading position in the HTML text
108    // Comments that appear before this in the HTML are pinned
109    let html_text_lower = html_text.to_lowercase();
110    let latest_comments_pos = html_text_lower.find("latest comments");
111
112    // Collect all headers
113    let all_headers: Vec<_> = document.select(&comment_header_selector).collect();
114
115    // Use a HashSet to track seen comment IDs to avoid duplicates
116    let mut seen_comment_ids = std::collections::HashSet::new();
117    let mut comments = Vec::new();
118
119    // Process each header and find its corresponding content by ID
120    for (index, header) in all_headers.iter().enumerate() {
121        // Extract comment ID from header
122        let comment_id = header.value().attr("id");
123
124        // Skip if we've already seen this comment ID (deduplication)
125        if let Some(id) = comment_id
126            && !seen_comment_ids.insert(id)
127        {
128            continue; // Skip duplicate
129        }
130
131        // Extract comment data from header
132        let context = CommentExtractionContext {
133            document: &document,
134            date_selector: &date_selector,
135            pkgname: &pkgname,
136            html_text: &html_text,
137            has_pinned_section,
138            latest_comments_pos,
139        };
140        if let Some(comment) = extract_comment_from_header(header, comment_id, index, &context) {
141            comments.push(comment);
142        }
143    }
144
145    // Separate, sort, and combine comments
146    Ok(separate_and_sort_comments(comments))
147}
148
149/// What: Extract comment data from a header element.
150///
151/// Inputs:
152/// - `header`: Header element containing comment metadata
153/// - `comment_id`: Optional comment ID from header attribute
154/// - `index`: Index of header in collection
155/// - `context`: Extraction context with document, selectors, and metadata
156///
157/// Output:
158/// - `Some(AurComment)` if comment is valid; `None` if empty/invalid
159///
160/// Details:
161/// - Extracts author, date, URL, content, and pinned status
162/// - Skips empty comments with unknown authors
163fn extract_comment_from_header(
164    header: &ElementRef,
165    comment_id: Option<&str>,
166    index: usize,
167    context: &CommentExtractionContext,
168) -> Option<AurComment> {
169    // Extract the full header text to parse author
170    let header_text = header.text().collect::<String>();
171
172    // Extract author: text before " commented on"
173    let author = header_text.find(" commented on ").map_or_else(
174        || {
175            // Fallback: try to find author in links or text nodes
176            header_text
177                .split_whitespace()
178                .next()
179                .unwrap_or("Unknown")
180                .to_string()
181        },
182        |pos| header_text[..pos].trim().to_string(),
183    );
184
185    // Extract date and URL from <a class="date"> inside the header
186    let base_url = format!("https://aur.archlinux.org/packages/{}", context.pkgname);
187    let (date_text, date_url) = header.select(context.date_selector).next().map_or_else(
188        || (String::new(), None),
189        |e| {
190            let text = e.text().collect::<String>().trim().to_string();
191            let url = e.value().attr("href").map(|href| {
192                // Convert relative URLs to absolute
193                if href.starts_with("http://") || href.starts_with("https://") {
194                    href.to_string()
195                } else if href.starts_with('#') {
196                    // Fragment-only URL: combine with package page URL
197                    format!("{base_url}{href}")
198                } else {
199                    // Relative path: prepend AUR domain
200                    format!("https://aur.archlinux.org{href}")
201                }
202            });
203            (text, url)
204        },
205    );
206
207    // Get content by finding the corresponding content div by ID
208    // We extract formatted text to preserve markdown-like structures
209    let comment_content = comment_id
210        .and_then(|id| id.strip_prefix("comment-"))
211        .and_then(|comment_id_str| {
212            Selector::parse(&format!("div#comment-{comment_id_str}-content")).ok()
213        })
214        .and_then(|content_id_selector| context.document.select(&content_id_selector).next())
215        .map_or_else(String::new, |div| {
216            // Parse HTML and extract formatted text
217            // This preserves markdown-like structures (bold, italic, code, links, etc.)
218            html_to_formatted_text(div)
219        });
220
221    // Skip empty comments
222    if comment_content.is_empty() && author == "Unknown" {
223        return None;
224    }
225
226    // Parse date to timestamp
227    let date_timestamp = parse_date_to_timestamp(&date_text);
228    if date_timestamp.is_none() && !date_text.is_empty() {
229        debug!(
230            pkgname = %context.pkgname,
231            author = %author,
232            date_text = %date_text,
233            "Failed to parse comment date to timestamp"
234        );
235    }
236
237    // Convert UTC date to local timezone for display
238    let local_date = convert_utc_to_local_date(&date_text);
239
240    // Determine if this comment is pinned
241    let is_pinned = determine_pinned_status(comment_id, index, context);
242
243    let stable_id = comment_id.map(str::to_string).or_else(|| date_url.clone());
244    Some(AurComment {
245        id: stable_id,
246        author,
247        date: local_date,
248        date_timestamp,
249        date_url,
250        content: comment_content,
251        pinned: is_pinned,
252    })
253}
254
255/// What: Determine if a comment is pinned based on its position in the HTML.
256///
257/// Inputs:
258/// - `comment_id`: Optional comment ID
259/// - `index`: Index of comment in collection
260/// - `context`: Extraction context with HTML text and pinned section info
261///
262/// Output:
263/// - `true` if comment is pinned; `false` otherwise
264///
265/// Details:
266/// - Pinned comments appear before the "Latest Comments" heading
267/// - Uses comment position in HTML relative to "Latest Comments" heading
268fn determine_pinned_status(
269    comment_id: Option<&str>,
270    index: usize,
271    context: &CommentExtractionContext,
272) -> bool {
273    if !context.has_pinned_section {
274        return false;
275    }
276
277    let Some(latest_pos) = context.latest_comments_pos else {
278        return false;
279    };
280
281    comment_id.map_or(index < 10, |id| {
282        context
283            .html_text
284            .find(id)
285            .map_or(index < 10, |comment_pos| comment_pos < latest_pos)
286    })
287}
288
289/// What: Separate pinned and regular comments, sort them, and combine.
290///
291/// Inputs:
292/// - `comments`: Vector of all comments
293///
294/// Output:
295/// - Vector with pinned comments first, then regular, both sorted by date descending
296///
297/// Details:
298/// - Separates comments into pinned and regular
299/// - Sorts each group by date descending (latest first)
300/// - Combines with pinned first
301fn separate_and_sort_comments(comments: Vec<AurComment>) -> Vec<AurComment> {
302    // Separate pinned and regular comments
303    let mut pinned_comments: Vec<AurComment> =
304        comments.iter().filter(|c| c.pinned).cloned().collect();
305    let mut regular_comments: Vec<AurComment> =
306        comments.into_iter().filter(|c| !c.pinned).collect();
307
308    // Sort both groups by date descending
309    sort_comments_by_date(&mut pinned_comments);
310    sort_comments_by_date(&mut regular_comments);
311
312    // Combine: pinned first, then regular
313    pinned_comments.extend(regular_comments);
314    pinned_comments
315}
316
317/// What: Sort comments by date descending (latest first).
318///
319/// Inputs:
320/// - `comments`: Mutable reference to comments vector to sort
321///
322/// Output:
323/// - Comments are sorted in-place by date descending
324///
325/// Details:
326/// - Uses timestamp for sorting if available
327/// - Falls back to string comparison if timestamp is missing
328fn sort_comments_by_date(comments: &mut [AurComment]) {
329    comments.sort_by(|a, b| {
330        match (a.date_timestamp, b.date_timestamp) {
331            (Some(ts_a), Some(ts_b)) => ts_b.cmp(&ts_a), // Descending order
332            (Some(_), None) => std::cmp::Ordering::Less,
333            (None, Some(_)) => std::cmp::Ordering::Greater,
334            (None, None) => b.date.cmp(&a.date), // Fallback to string comparison
335        }
336    });
337}
338
339/// What: Convert UTC date string from AUR to local timezone string.
340///
341/// Inputs:
342/// - `utc_date_str`: UTC date string from AUR page (e.g., "2025-05-15 03:55 (UTC)").
343///
344/// Output:
345/// - Local timezone date string formatted as "YYYY-MM-DD HH:MM (TZ)" where TZ is local timezone abbreviation.
346/// - Returns original string if parsing fails.
347///
348/// Details:
349/// - Parses UTC date from AUR format
350/// - Converts to local timezone using system timezone
351/// - Formats with local timezone abbreviation
352fn convert_utc_to_local_date(utc_date_str: &str) -> String {
353    let utc_date_str = utc_date_str.trim();
354
355    // AUR format: "YYYY-MM-DD HH:MM (UTC)" or "YYYY-MM-DD HH:MM (CEST)" etc.
356    // Try to parse the date/time part before the timezone
357    if let Some(tz_start) = utc_date_str.rfind('(') {
358        let date_time_part = utc_date_str[..tz_start].trim();
359
360        // Try parsing "YYYY-MM-DD HH:MM" format as UTC
361        if let Ok(naive_dt) =
362            chrono::NaiveDateTime::parse_from_str(date_time_part, "%Y-%m-%d %H:%M")
363        {
364            // Treat as UTC and convert to local timezone
365            let utc_dt = naive_dt.and_utc();
366            let local_dt = utc_dt.with_timezone(&chrono::Local);
367
368            // Format with local timezone
369            // Format: "YYYY-MM-DD HH:MM (TZ)"
370            let formatted = local_dt.format("%Y-%m-%d %H:%M");
371
372            // Get timezone abbreviation
373            // Try multiple methods to get the actual timezone name (CEST, CET, etc.)
374            let tz_abbr = get_timezone_abbreviation(&local_dt);
375
376            return format!("{formatted} ({tz_abbr})");
377        }
378    }
379
380    // If parsing fails, return original string
381    utc_date_str.to_string()
382}
383
384/// What: Get timezone abbreviation (CEST, CET, PST, etc.) for a local datetime.
385///
386/// Inputs:
387/// - `local_dt`: Local datetime to get timezone for.
388///
389/// Output:
390/// - Timezone abbreviation string (e.g., "CEST", "CET", "UTC+2").
391///
392/// Details:
393/// - First tries chrono's %Z format specifier
394/// - Falls back to TZ environment variable parsing
395/// - Finally falls back to UTC offset format
396fn get_timezone_abbreviation(local_dt: &chrono::DateTime<chrono::Local>) -> String {
397    // Try chrono's %Z format specifier first
398    let tz_from_format = local_dt.format("%Z").to_string();
399
400    // Check if %Z gave us a valid abbreviation (3-6 chars, alphabetic)
401    if !tz_from_format.is_empty()
402        && tz_from_format.len() >= 3
403        && tz_from_format.len() <= 6
404        && tz_from_format.chars().all(char::is_alphabetic)
405        && !tz_from_format.starts_with("UTC")
406    {
407        return tz_from_format;
408    }
409
410    // Try to get timezone from TZ environment variable
411    if let Ok(tz_env) = std::env::var("TZ") {
412        // Extract timezone abbreviation from TZ variable
413        // TZ can be like "Europe/Berlin" or "CEST-2" or just "CEST"
414        if let Some(tz_name) = tz_env.rsplit('/').next() {
415            // Check if it looks like a timezone abbreviation (3-6 uppercase letters)
416            if tz_name.len() >= 3
417                && tz_name.len() <= 6
418                && tz_name.chars().all(|c| c.is_uppercase() || c == '-')
419            {
420                // Extract just the abbreviation part (before any offset)
421                let abbr = tz_name.split('-').next().unwrap_or(tz_name);
422                if abbr.len() >= 3 && abbr.chars().all(char::is_alphabetic) {
423                    return abbr.to_string();
424                }
425            }
426        }
427    }
428
429    // Fallback: Try to determine timezone abbreviation from offset and date
430    let offset_secs = local_dt.offset().local_minus_utc();
431    let hours = offset_secs / 3600;
432    let minutes = (offset_secs.abs() % 3600) / 60;
433
434    // Try to get timezone abbreviation from common mappings based on offset
435    if let Some(tz_abbr) = get_tz_abbr_from_offset(hours, local_dt.date_naive()) {
436        return tz_abbr;
437    }
438
439    // Final fallback: Use UTC offset format
440    if offset_secs == 0 {
441        "UTC".to_string()
442    } else if minutes == 0 {
443        format!("UTC{hours:+}")
444    } else {
445        format!("UTC{hours:+}:{minutes:02}")
446    }
447}
448
449/// What: Get timezone abbreviation from UTC offset and date.
450///
451/// Inputs:
452/// - `offset_hours`: UTC offset in hours (e.g., 1, 2, -5).
453/// - `date`: Date (unused, kept for API compatibility).
454///
455/// Output:
456/// - `Some(String)` with timezone abbreviation if unambiguous; `None` otherwise.
457///
458/// Details:
459/// - Returns `None` for DST-affected timezones to avoid incorrect abbreviations
460/// - DST transition dates vary by year and region (e.g., US: second Sunday in March, first Sunday in November)
461/// - Month-based DST detection is inaccurate and can show wrong abbreviations near transitions
462/// - When `None` is returned, the caller falls back to UTC offset format (e.g., "UTC-5")
463/// - Only returns `Some` for unambiguous timezones like UTC
464fn get_tz_abbr_from_offset(offset_hours: i32, _date: chrono::NaiveDate) -> Option<String> {
465    // Only return abbreviations for unambiguous timezones
466    // For DST-affected timezones, return None to use UTC offset format instead
467    // This avoids incorrect abbreviations near DST transition dates
468    match offset_hours {
469        0 => Some("UTC".to_string()),
470        _ => None, // Return None for all other offsets to use UTC offset format
471    }
472}
473
474/// What: Parse a date string to Unix timestamp.
475///
476/// Inputs:
477/// - `date_str`: Date string from AUR page (e.g., "2025-05-15 03:55 (UTC)").
478///
479/// Output:
480/// - `Some(i64)` with Unix timestamp if parsing succeeds; `None` otherwise.
481///
482/// Details:
483/// - Attempts to parse common AUR date formats and many other common formats
484/// - AUR uses format: "YYYY-MM-DD HH:MM (TZ)" where TZ is timezone abbreviation
485/// - Supports ISO 8601, RFC 2822, RFC 3339, and various date separator formats
486/// - Returns None if parsing fails (will use string comparison for sorting)
487/// - Logs debug information when parsing fails to help diagnose issues
488fn parse_date_to_timestamp(date_str: &str) -> Option<i64> {
489    let date_str = date_str.trim();
490
491    // Skip empty strings early
492    if date_str.is_empty() {
493        debug!("Failed to parse empty date string");
494        return None;
495    }
496
497    // AUR format: "YYYY-MM-DD HH:MM (UTC)" or "YYYY-MM-DD HH:MM (CEST)" etc.
498    // Try to parse the date/time part before the timezone
499    if let Some(tz_start) = date_str.rfind('(') {
500        let date_time_part = date_str[..tz_start].trim();
501
502        // Try parsing "YYYY-MM-DD HH:MM" format
503        if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_time_part, "%Y-%m-%d %H:%M") {
504            // AUR dates are in UTC, so we can treat them as UTC
505            return dt.and_utc().timestamp().into();
506        }
507
508        // Try with seconds: "YYYY-MM-DD HH:MM:SS"
509        if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_time_part, "%Y-%m-%d %H:%M:%S") {
510            return dt.and_utc().timestamp().into();
511        }
512    }
513
514    // Try ISO 8601-like format: "YYYY-MM-DD HH:MM:SS"
515    if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_str, "%Y-%m-%d %H:%M:%S") {
516        return dt.and_utc().timestamp().into();
517    }
518
519    // Try ISO 8601 format: "YYYY-MM-DDTHH:MM:SS" (with T separator)
520    if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_str, "%Y-%m-%dT%H:%M:%S") {
521        return dt.and_utc().timestamp().into();
522    }
523
524    // Try ISO 8601 with timezone: "YYYY-MM-DDTHH:MM:SSZ" or "YYYY-MM-DDTHH:MM:SS+HH:MM"
525    if let Ok(dt) = chrono::DateTime::parse_from_str(date_str, "%Y-%m-%dT%H:%M:%S%z") {
526        return Some(dt.timestamp());
527    }
528
529    // Try date-only format: "YYYY-MM-DD"
530    if let Ok(d) = chrono::NaiveDate::parse_from_str(date_str, "%Y-%m-%d")
531        && let Some(dt) = d.and_hms_opt(0, 0, 0)
532    {
533        return dt.and_utc().timestamp().into();
534    }
535
536    // Try RFC 2822 format (e.g., "Mon, 15 May 2025 03:55:00 +0000")
537    if let Ok(dt) = chrono::DateTime::parse_from_rfc2822(date_str) {
538        return Some(dt.timestamp());
539    }
540
541    // Try RFC 3339 format (e.g., "2025-05-15T03:55:00Z")
542    if let Ok(dt) = chrono::DateTime::parse_from_rfc3339(date_str) {
543        return Some(dt.timestamp());
544    }
545
546    // Try formats with different separators: "YYYY/MM/DD HH:MM"
547    if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_str, "%Y/%m/%d %H:%M") {
548        return dt.and_utc().timestamp().into();
549    }
550
551    // Try formats with different separators: "DD.MM.YYYY HH:MM"
552    if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_str, "%d.%m.%Y %H:%M") {
553        return dt.and_utc().timestamp().into();
554    }
555
556    // Try formats with different separators: "MM/DD/YYYY HH:MM"
557    if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_str, "%m/%d/%Y %H:%M") {
558        return dt.and_utc().timestamp().into();
559    }
560
561    // Try Unix timestamp as string
562    if let Ok(ts) = date_str.parse::<i64>() {
563        // Validate it's a reasonable timestamp (between 2000 and 2100)
564        if ts > 946_684_800 && ts < 4_102_444_800 {
565            return Some(ts);
566        }
567    }
568
569    // All parsing attempts failed - log for debugging
570    debug!(
571        date_str = %date_str,
572        date_str_len = date_str.len(),
573        "Failed to parse date string to timestamp"
574    );
575    None
576}
577
578/// What: Convert HTML content to formatted text preserving markdown-like structures.
579///
580/// Inputs:
581/// - `element`: HTML element to parse
582///
583/// Output:
584/// - Formatted text string with markdown-like syntax for bold, italic, code, etc.
585///
586/// Details:
587/// - Converts HTML tags to markdown-like syntax:
588///   - `<strong>`, `<b>` → `**text**`
589///   - `<em>`, `<i>` → `*text*`
590///   - `<code>` → `` `text` ``
591///   - `<pre>` → preserves code blocks with triple backticks
592///   - `<a>` → preserves links as `[text](url)`
593///   - `<p>` → newlines between paragraphs
594fn html_to_formatted_text(element: ElementRef) -> String {
595    let mut result = String::new();
596
597    // Process paragraphs to preserve structure
598    let p_selector = Selector::parse("p").ok();
599    if let Some(ref p_sel) = p_selector {
600        let paragraphs: Vec<_> = element.select(p_sel).collect();
601        if !paragraphs.is_empty() {
602            for (i, para) in paragraphs.iter().enumerate() {
603                if i > 0 {
604                    result.push('\n');
605                }
606                result.push_str(&convert_element_to_markdown(para));
607            }
608            return result.trim().to_string();
609        }
610    }
611
612    // If no paragraphs, process the whole element
613    result = convert_element_to_markdown(&element);
614    result.trim().to_string()
615}
616
617/// Convert an HTML element to markdown-like syntax by processing nested elements.
618fn convert_element_to_markdown(element: &ElementRef) -> String {
619    let html = element.html();
620    let mut working_html = html;
621
622    // Process <pre> blocks first (code blocks)
623    let pre_selector = Selector::parse("pre").ok();
624    if let Some(ref pre_sel) = pre_selector {
625        for pre in element.select(pre_sel) {
626            let text = pre.text().collect::<String>();
627            let pre_html = pre.html();
628            let replacement = format!("```\n{}\n```", text.trim());
629            working_html = working_html.replace(&pre_html, &replacement);
630        }
631    }
632
633    // Process <a> tags (links)
634    let a_selector = Selector::parse("a").ok();
635    if let Some(ref a_sel) = a_selector {
636        for link in element.select(a_sel) {
637            let text = link.text().collect::<String>().trim().to_string();
638            if let Some(href) = link.value().attr("href") {
639                let link_html = link.html();
640                let replacement = format!("[{text}]({href})");
641                working_html = working_html.replace(&link_html, &replacement);
642            }
643        }
644    }
645
646    // Process <strong> and <b> tags (bold)
647    let strong_selector = Selector::parse("strong, b").ok();
648    if let Some(ref strong_sel) = strong_selector {
649        for bold in element.select(strong_sel) {
650            let text = bold.text().collect::<String>().trim().to_string();
651            if !text.is_empty() {
652                let bold_html = bold.html();
653                let replacement = format!("**{text}**");
654                working_html = working_html.replace(&bold_html, &replacement);
655            }
656        }
657    }
658
659    // Process <em> and <i> tags (italic)
660    let em_selector = Selector::parse("em, i").ok();
661    if let Some(ref em_sel) = em_selector {
662        for italic in element.select(em_sel) {
663            let text = italic.text().collect::<String>().trim().to_string();
664            if !text.is_empty() {
665                let italic_html = italic.html();
666                let replacement = format!("*{text}*");
667                working_html = working_html.replace(&italic_html, &replacement);
668            }
669        }
670    }
671
672    // Process <code> tags
673    let code_selector = Selector::parse("code").ok();
674    if let Some(ref code_sel) = code_selector {
675        for code in element.select(code_sel) {
676            let text = code.text().collect::<String>().trim().to_string();
677            if !text.is_empty() {
678                let code_html = code.html();
679                let replacement = format!("`{text}`");
680                working_html = working_html.replace(&code_html, &replacement);
681            }
682        }
683    }
684
685    // Parse the modified HTML and extract text (this removes remaining HTML tags)
686    let temp_doc = Html::parse_fragment(&working_html);
687    let mut result = temp_doc.root_element().text().collect::<String>();
688
689    // Decode HTML entities
690    result = result
691        .replace("&lt;", "<")
692        .replace("&gt;", ">")
693        .replace("&amp;", "&")
694        .replace("&quot;", "\"")
695        .replace("&#39;", "'")
696        .replace("&nbsp;", " ");
697
698    result
699}
700
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a calendar date for the test tables below.
    fn ymd(year: i32, month: u32, day: u32) -> chrono::NaiveDate {
        chrono::NaiveDate::from_ymd_opt(year, month, day).expect("valid test date")
    }

    /// What: Non-zero offsets must not be mapped to an abbreviation.
    ///
    /// Inputs:
    /// - Dates around US DST transitions combined with US and European offsets
    ///
    /// Output:
    /// - `get_tz_abbr_from_offset` returns `None` for every case
    ///
    /// Details:
    /// - DST transition dates vary by year and region, so the function is expected
    ///   to defer to the UTC offset format instead of guessing
    #[test]
    fn test_dst_affected_timezones_return_none() {
        let cases = [
            (ymd(2024, 3, 1), -5),   // Early March (before DST starts)
            (ymd(2024, 3, 15), -5),  // Mid March (after DST starts)
            (ymd(2024, 10, 31), -5), // Late October (DST still active)
            (ymd(2024, 11, 4), -5),  // Early November (after DST ends)
            (ymd(2024, 11, 15), -5), // Mid November (after DST ends)
            (ymd(2024, 3, 1), -6),   // Central Time
            (ymd(2024, 3, 1), -7),   // Mountain Time
            (ymd(2024, 3, 1), -8),   // Pacific Time
            (ymd(2024, 3, 1), 1),    // CET/CEST
            (ymd(2024, 3, 1), 2),    // CEST/EET
        ];

        for (date, offset) in cases {
            let result = get_tz_abbr_from_offset(offset, date);
            assert!(
                result.is_none(),
                "Should return None for DST-affected timezones to use UTC offset format. Date: {date:?}, Offset: {offset}, Got: {result:?}"
            );
        }
    }

    /// What: Offset 0 must always be reported as "UTC".
    ///
    /// Inputs:
    /// - Offset 0 with dates spread over the year
    ///
    /// Output:
    /// - `Some("UTC")` for every date
    ///
    /// Details:
    /// - UTC has no DST, so the abbreviation is unambiguous
    #[test]
    fn test_utc_returns_abbreviation() {
        for date in [ymd(2024, 1, 1), ymd(2024, 6, 15), ymd(2024, 12, 31)] {
            let result = get_tz_abbr_from_offset(0, date);
            assert_eq!(
                result,
                Some("UTC".to_string()),
                "UTC should always return 'UTC' abbreviation. Date: {date:?}, Got: {result:?}"
            );
        }
    }

    /// What: Exercise `parse_date_to_timestamp` over the supported formats.
    ///
    /// Inputs:
    /// - AUR-style strings, ISO-like strings, separator variants, a Unix timestamp,
    ///   plus empty and invalid input
    ///
    /// Output:
    /// - `Some` for every supported format, `None` for empty/invalid input
    ///
    /// Details:
    /// - Also checks that the AUR form and the equivalent form with seconds
    ///   yield the same timestamp
    #[test]
    fn test_parse_date_to_timestamp() {
        // (input, whether parsing must succeed, failure message)
        let expectations = [
            ("2025-04-14 11:52 (UTC)", true, "Should parse UTC format"),
            ("2025-04-14 11:52 (CEST)", true, "Should parse CEST format"),
            ("2025-04-14 11:52 (UTC+2)", true, "Should parse UTC+2 format"),
            ("2024-12-01 10:00 (UTC)", true, "Should parse December date"),
            ("", false, "Empty string should return None"),
            ("invalid date", false, "Invalid date should return None"),
            (
                "2025-04-14 11:52:30",
                true,
                "Should parse ISO 8601-like format with seconds",
            ),
            (
                "2025-04-14T11:52:30",
                true,
                "Should parse ISO 8601 format with T separator",
            ),
            ("2025-04-14", true, "Should parse date-only format"),
            (
                "2025/04/14 11:52",
                true,
                "Should parse format with / separators",
            ),
            ("14.04.2025 11:52", true, "Should parse DD.MM.YYYY format"),
            ("04/14/2025 11:52", true, "Should parse MM/DD/YYYY format"),
            ("1735689600", true, "Should parse Unix timestamp string"),
        ];

        for (input, should_parse, message) in expectations {
            assert!(
                parse_date_to_timestamp(input).is_some() == should_parse,
                "{message}"
            );
        }

        // The parsed timestamp must be a sensible positive value
        if let Some(ts) = parse_date_to_timestamp("2025-04-14 11:52 (UTC)") {
            assert!(ts > 0, "Timestamp should be positive");
        }

        // Same instant, two notations
        let aur_form = parse_date_to_timestamp("2025-04-14 11:52 (UTC)");
        let with_seconds = parse_date_to_timestamp("2025-04-14 11:52:00");
        assert_eq!(
            aur_form, with_seconds,
            "Same date/time should produce same timestamp regardless of format"
        );
    }
}
pacsea/sources/comments.rs

pacsea/sources/
comments.rs