pacsea/sources/comments.rs
1//! AUR package comments fetching via web scraping.
2
3use scraper::{ElementRef, Html, Selector};
4use std::time::Duration;
5use tracing::debug;
6
7use crate::state::types::AurComment;
8
9/// Result type alias for AUR comments fetching operations.
10type Result<T> = super::Result<T>;
11
12/// Context for extracting comment data from HTML elements.
13struct CommentExtractionContext<'a> {
14 /// Parsed HTML document
15 document: &'a Html,
16 /// Selector for date elements
17 date_selector: &'a Selector,
18 /// Package name for URL construction
19 pkgname: &'a str,
20 /// Full HTML text for pinned detection
21 html_text: &'a str,
22 /// Whether pinned section exists
23 has_pinned_section: bool,
24 /// Position of "Latest Comments" heading
25 latest_comments_pos: Option<usize>,
26}
27
28/// What: Fetch AUR package comments by scraping the AUR package page.
29///
30/// Inputs:
31/// - `pkgname`: Package name to fetch comments for.
32///
33/// Output:
34/// - `Ok(Vec<AurComment>)` with parsed comments sorted by date (latest first); `Err` on failure.
35///
36/// # Errors
37/// - Returns `Err` when network request fails
38/// - Returns `Err` when HTML parsing fails
39/// - Returns `Err` when comment extraction fails
40///
41/// # Panics
42/// - Panics if selector parsing fails in fallback path (should not occur with valid selectors)
43///
44/// Details:
45/// - Fetches HTML from `https://aur.archlinux.org/packages/<pkgname>`
46/// - Uses `scraper` to parse HTML and extract comment elements
47/// - Parses dates to Unix timestamps for sorting
48/// - Sorts comments by date descending (latest first)
49/// - Only works for AUR packages
50pub async fn fetch_aur_comments(pkgname: String) -> Result<Vec<AurComment>> {
51 use reqwest::header::{ACCEPT, ACCEPT_LANGUAGE, HeaderMap, HeaderValue};
52
53 let url = format!("https://aur.archlinux.org/packages/{pkgname}");
54
55 // Create HTTP client with browser-like headers and reasonable timeout.
56 // Increased from 500ms to 5s to handle archlinux.org's DDoS protection delays.
57 let mut headers = HeaderMap::new();
58 headers.insert(
59 ACCEPT,
60 HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
61 );
62 headers.insert(ACCEPT_LANGUAGE, HeaderValue::from_static("en-US,en;q=0.5"));
63 let client = reqwest::Client::builder()
64 .timeout(Duration::from_secs(5))
65 .user_agent(format!(
66 "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0 Pacsea/{}",
67 env!("CARGO_PKG_VERSION")
68 ))
69 .default_headers(headers)
70 .build()
71 .map_err(|e| format!("Failed to create HTTP client: {e}"))?;
72
73 // Fetch HTML
74 let html_text = client
75 .get(&url)
76 .send()
77 .await
78 .map_err(|e| format!("Network error: {e}"))?
79 .text()
80 .await
81 .map_err(|e| format!("Failed to read response: {e}"))?;
82
83 // Parse HTML
84 let document = Html::parse_document(&html_text);
85
86 // AUR comments structure:
87 // - Each comment has an <h4 class="comment-header"> with author and date
88 // - The content is in a following <div class="article-content"> with id "comment-{id}-content"
89 // - Pinned comments appear before "Latest Comments" heading
90 let comment_header_selector = Selector::parse("h4.comment-header")
91 .map_err(|e| format!("Failed to parse comment header selector: {e}"))?;
92
93 let date_selector =
94 Selector::parse("a.date").map_err(|e| format!("Failed to parse date selector: {e}"))?;
95
96 // Find the "Latest Comments" heading to separate pinned from regular comments
97 // Pinned comments appear before this heading
98 let heading_selector = Selector::parse("h3, h2, h4")
99 .map_err(|e| format!("Failed to parse heading selector: {e}"))?;
100
101 // Check if there's a "Pinned Comments" section
102 let has_pinned_section = document.select(&heading_selector).any(|h| {
103 let text: String = h.text().collect();
104 text.contains("Pinned Comments")
105 });
106
107 // Find the "Latest Comments" heading position in the HTML text
108 // Comments that appear before this in the HTML are pinned
109 let html_text_lower = html_text.to_lowercase();
110 let latest_comments_pos = html_text_lower.find("latest comments");
111
112 // Collect all headers
113 let all_headers: Vec<_> = document.select(&comment_header_selector).collect();
114
115 // Use a HashSet to track seen comment IDs to avoid duplicates
116 let mut seen_comment_ids = std::collections::HashSet::new();
117 let mut comments = Vec::new();
118
119 // Process each header and find its corresponding content by ID
120 for (index, header) in all_headers.iter().enumerate() {
121 // Extract comment ID from header
122 let comment_id = header.value().attr("id");
123
124 // Skip if we've already seen this comment ID (deduplication)
125 if let Some(id) = comment_id
126 && !seen_comment_ids.insert(id)
127 {
128 continue; // Skip duplicate
129 }
130
131 // Extract comment data from header
132 let context = CommentExtractionContext {
133 document: &document,
134 date_selector: &date_selector,
135 pkgname: &pkgname,
136 html_text: &html_text,
137 has_pinned_section,
138 latest_comments_pos,
139 };
140 if let Some(comment) = extract_comment_from_header(header, comment_id, index, &context) {
141 comments.push(comment);
142 }
143 }
144
145 // Separate, sort, and combine comments
146 Ok(separate_and_sort_comments(comments))
147}
148
149/// What: Extract comment data from a header element.
150///
151/// Inputs:
152/// - `header`: Header element containing comment metadata
153/// - `comment_id`: Optional comment ID from header attribute
154/// - `index`: Index of header in collection
155/// - `context`: Extraction context with document, selectors, and metadata
156///
157/// Output:
158/// - `Some(AurComment)` if comment is valid; `None` if empty/invalid
159///
160/// Details:
161/// - Extracts author, date, URL, content, and pinned status
162/// - Skips empty comments with unknown authors
163fn extract_comment_from_header(
164 header: &ElementRef,
165 comment_id: Option<&str>,
166 index: usize,
167 context: &CommentExtractionContext,
168) -> Option<AurComment> {
169 // Extract the full header text to parse author
170 let header_text = header.text().collect::<String>();
171
172 // Extract author: text before " commented on"
173 let author = header_text.find(" commented on ").map_or_else(
174 || {
175 // Fallback: try to find author in links or text nodes
176 header_text
177 .split_whitespace()
178 .next()
179 .unwrap_or("Unknown")
180 .to_string()
181 },
182 |pos| header_text[..pos].trim().to_string(),
183 );
184
185 // Extract date and URL from <a class="date"> inside the header
186 let base_url = format!("https://aur.archlinux.org/packages/{}", context.pkgname);
187 let (date_text, date_url) = header.select(context.date_selector).next().map_or_else(
188 || (String::new(), None),
189 |e| {
190 let text = e.text().collect::<String>().trim().to_string();
191 let url = e.value().attr("href").map(|href| {
192 // Convert relative URLs to absolute
193 if href.starts_with("http://") || href.starts_with("https://") {
194 href.to_string()
195 } else if href.starts_with('#') {
196 // Fragment-only URL: combine with package page URL
197 format!("{base_url}{href}")
198 } else {
199 // Relative path: prepend AUR domain
200 format!("https://aur.archlinux.org{href}")
201 }
202 });
203 (text, url)
204 },
205 );
206
207 // Get content by finding the corresponding content div by ID
208 // We extract formatted text to preserve markdown-like structures
209 let comment_content = comment_id
210 .and_then(|id| id.strip_prefix("comment-"))
211 .and_then(|comment_id_str| {
212 Selector::parse(&format!("div#comment-{comment_id_str}-content")).ok()
213 })
214 .and_then(|content_id_selector| context.document.select(&content_id_selector).next())
215 .map_or_else(String::new, |div| {
216 // Parse HTML and extract formatted text
217 // This preserves markdown-like structures (bold, italic, code, links, etc.)
218 html_to_formatted_text(div)
219 });
220
221 // Skip empty comments
222 if comment_content.is_empty() && author == "Unknown" {
223 return None;
224 }
225
226 // Parse date to timestamp
227 let date_timestamp = parse_date_to_timestamp(&date_text);
228 if date_timestamp.is_none() && !date_text.is_empty() {
229 debug!(
230 pkgname = %context.pkgname,
231 author = %author,
232 date_text = %date_text,
233 "Failed to parse comment date to timestamp"
234 );
235 }
236
237 // Convert UTC date to local timezone for display
238 let local_date = convert_utc_to_local_date(&date_text);
239
240 // Determine if this comment is pinned
241 let is_pinned = determine_pinned_status(comment_id, index, context);
242
243 let stable_id = comment_id.map(str::to_string).or_else(|| date_url.clone());
244 Some(AurComment {
245 id: stable_id,
246 author,
247 date: local_date,
248 date_timestamp,
249 date_url,
250 content: comment_content,
251 pinned: is_pinned,
252 })
253}
254
255/// What: Determine if a comment is pinned based on its position in the HTML.
256///
257/// Inputs:
258/// - `comment_id`: Optional comment ID
259/// - `index`: Index of comment in collection
260/// - `context`: Extraction context with HTML text and pinned section info
261///
262/// Output:
263/// - `true` if comment is pinned; `false` otherwise
264///
265/// Details:
266/// - Pinned comments appear before the "Latest Comments" heading
267/// - Uses comment position in HTML relative to "Latest Comments" heading
268fn determine_pinned_status(
269 comment_id: Option<&str>,
270 index: usize,
271 context: &CommentExtractionContext,
272) -> bool {
273 if !context.has_pinned_section {
274 return false;
275 }
276
277 let Some(latest_pos) = context.latest_comments_pos else {
278 return false;
279 };
280
281 comment_id.map_or(index < 10, |id| {
282 context
283 .html_text
284 .find(id)
285 .map_or(index < 10, |comment_pos| comment_pos < latest_pos)
286 })
287}
288
289/// What: Separate pinned and regular comments, sort them, and combine.
290///
291/// Inputs:
292/// - `comments`: Vector of all comments
293///
294/// Output:
295/// - Vector with pinned comments first, then regular, both sorted by date descending
296///
297/// Details:
298/// - Separates comments into pinned and regular
299/// - Sorts each group by date descending (latest first)
300/// - Combines with pinned first
301fn separate_and_sort_comments(comments: Vec<AurComment>) -> Vec<AurComment> {
302 // Separate pinned and regular comments
303 let mut pinned_comments: Vec<AurComment> =
304 comments.iter().filter(|c| c.pinned).cloned().collect();
305 let mut regular_comments: Vec<AurComment> =
306 comments.into_iter().filter(|c| !c.pinned).collect();
307
308 // Sort both groups by date descending
309 sort_comments_by_date(&mut pinned_comments);
310 sort_comments_by_date(&mut regular_comments);
311
312 // Combine: pinned first, then regular
313 pinned_comments.extend(regular_comments);
314 pinned_comments
315}
316
317/// What: Sort comments by date descending (latest first).
318///
319/// Inputs:
320/// - `comments`: Mutable reference to comments vector to sort
321///
322/// Output:
323/// - Comments are sorted in-place by date descending
324///
325/// Details:
326/// - Uses timestamp for sorting if available
327/// - Falls back to string comparison if timestamp is missing
328fn sort_comments_by_date(comments: &mut [AurComment]) {
329 comments.sort_by(|a, b| {
330 match (a.date_timestamp, b.date_timestamp) {
331 (Some(ts_a), Some(ts_b)) => ts_b.cmp(&ts_a), // Descending order
332 (Some(_), None) => std::cmp::Ordering::Less,
333 (None, Some(_)) => std::cmp::Ordering::Greater,
334 (None, None) => b.date.cmp(&a.date), // Fallback to string comparison
335 }
336 });
337}
338
339/// What: Convert UTC date string from AUR to local timezone string.
340///
341/// Inputs:
342/// - `utc_date_str`: UTC date string from AUR page (e.g., "2025-05-15 03:55 (UTC)").
343///
344/// Output:
345/// - Local timezone date string formatted as "YYYY-MM-DD HH:MM (TZ)" where TZ is local timezone abbreviation.
346/// - Returns original string if parsing fails.
347///
348/// Details:
349/// - Parses UTC date from AUR format
350/// - Converts to local timezone using system timezone
351/// - Formats with local timezone abbreviation
352fn convert_utc_to_local_date(utc_date_str: &str) -> String {
353 let utc_date_str = utc_date_str.trim();
354
355 // AUR format: "YYYY-MM-DD HH:MM (UTC)" or "YYYY-MM-DD HH:MM (CEST)" etc.
356 // Try to parse the date/time part before the timezone
357 if let Some(tz_start) = utc_date_str.rfind('(') {
358 let date_time_part = utc_date_str[..tz_start].trim();
359
360 // Try parsing "YYYY-MM-DD HH:MM" format as UTC
361 if let Ok(naive_dt) =
362 chrono::NaiveDateTime::parse_from_str(date_time_part, "%Y-%m-%d %H:%M")
363 {
364 // Treat as UTC and convert to local timezone
365 let utc_dt = naive_dt.and_utc();
366 let local_dt = utc_dt.with_timezone(&chrono::Local);
367
368 // Format with local timezone
369 // Format: "YYYY-MM-DD HH:MM (TZ)"
370 let formatted = local_dt.format("%Y-%m-%d %H:%M");
371
372 // Get timezone abbreviation
373 // Try multiple methods to get the actual timezone name (CEST, CET, etc.)
374 let tz_abbr = get_timezone_abbreviation(&local_dt);
375
376 return format!("{formatted} ({tz_abbr})");
377 }
378 }
379
380 // If parsing fails, return original string
381 utc_date_str.to_string()
382}
383
384/// What: Get timezone abbreviation (CEST, CET, PST, etc.) for a local datetime.
385///
386/// Inputs:
387/// - `local_dt`: Local datetime to get timezone for.
388///
389/// Output:
390/// - Timezone abbreviation string (e.g., "CEST", "CET", "UTC+2").
391///
392/// Details:
393/// - First tries chrono's %Z format specifier
394/// - Falls back to TZ environment variable parsing
395/// - Finally falls back to UTC offset format
396fn get_timezone_abbreviation(local_dt: &chrono::DateTime<chrono::Local>) -> String {
397 // Try chrono's %Z format specifier first
398 let tz_from_format = local_dt.format("%Z").to_string();
399
400 // Check if %Z gave us a valid abbreviation (3-6 chars, alphabetic)
401 if !tz_from_format.is_empty()
402 && tz_from_format.len() >= 3
403 && tz_from_format.len() <= 6
404 && tz_from_format.chars().all(char::is_alphabetic)
405 && !tz_from_format.starts_with("UTC")
406 {
407 return tz_from_format;
408 }
409
410 // Try to get timezone from TZ environment variable
411 if let Ok(tz_env) = std::env::var("TZ") {
412 // Extract timezone abbreviation from TZ variable
413 // TZ can be like "Europe/Berlin" or "CEST-2" or just "CEST"
414 if let Some(tz_name) = tz_env.rsplit('/').next() {
415 // Check if it looks like a timezone abbreviation (3-6 uppercase letters)
416 if tz_name.len() >= 3
417 && tz_name.len() <= 6
418 && tz_name.chars().all(|c| c.is_uppercase() || c == '-')
419 {
420 // Extract just the abbreviation part (before any offset)
421 let abbr = tz_name.split('-').next().unwrap_or(tz_name);
422 if abbr.len() >= 3 && abbr.chars().all(char::is_alphabetic) {
423 return abbr.to_string();
424 }
425 }
426 }
427 }
428
429 // Fallback: Try to determine timezone abbreviation from offset and date
430 let offset_secs = local_dt.offset().local_minus_utc();
431 let hours = offset_secs / 3600;
432 let minutes = (offset_secs.abs() % 3600) / 60;
433
434 // Try to get timezone abbreviation from common mappings based on offset
435 if let Some(tz_abbr) = get_tz_abbr_from_offset(hours, local_dt.date_naive()) {
436 return tz_abbr;
437 }
438
439 // Final fallback: Use UTC offset format
440 if offset_secs == 0 {
441 "UTC".to_string()
442 } else if minutes == 0 {
443 format!("UTC{hours:+}")
444 } else {
445 format!("UTC{hours:+}:{minutes:02}")
446 }
447}
448
449/// What: Get timezone abbreviation from UTC offset and date.
450///
451/// Inputs:
452/// - `offset_hours`: UTC offset in hours (e.g., 1, 2, -5).
453/// - `date`: Date (unused, kept for API compatibility).
454///
455/// Output:
456/// - `Some(String)` with timezone abbreviation if unambiguous; `None` otherwise.
457///
458/// Details:
459/// - Returns `None` for DST-affected timezones to avoid incorrect abbreviations
460/// - DST transition dates vary by year and region (e.g., US: second Sunday in March, first Sunday in November)
461/// - Month-based DST detection is inaccurate and can show wrong abbreviations near transitions
462/// - When `None` is returned, the caller falls back to UTC offset format (e.g., "UTC-5")
463/// - Only returns `Some` for unambiguous timezones like UTC
464fn get_tz_abbr_from_offset(offset_hours: i32, _date: chrono::NaiveDate) -> Option<String> {
465 // Only return abbreviations for unambiguous timezones
466 // For DST-affected timezones, return None to use UTC offset format instead
467 // This avoids incorrect abbreviations near DST transition dates
468 match offset_hours {
469 0 => Some("UTC".to_string()),
470 _ => None, // Return None for all other offsets to use UTC offset format
471 }
472}
473
474/// What: Parse a date string to Unix timestamp.
475///
476/// Inputs:
477/// - `date_str`: Date string from AUR page (e.g., "2025-05-15 03:55 (UTC)").
478///
479/// Output:
480/// - `Some(i64)` with Unix timestamp if parsing succeeds; `None` otherwise.
481///
482/// Details:
483/// - Attempts to parse common AUR date formats and many other common formats
484/// - AUR uses format: "YYYY-MM-DD HH:MM (TZ)" where TZ is timezone abbreviation
485/// - Supports ISO 8601, RFC 2822, RFC 3339, and various date separator formats
486/// - Returns None if parsing fails (will use string comparison for sorting)
487/// - Logs debug information when parsing fails to help diagnose issues
488fn parse_date_to_timestamp(date_str: &str) -> Option<i64> {
489 let date_str = date_str.trim();
490
491 // Skip empty strings early
492 if date_str.is_empty() {
493 debug!("Failed to parse empty date string");
494 return None;
495 }
496
497 // AUR format: "YYYY-MM-DD HH:MM (UTC)" or "YYYY-MM-DD HH:MM (CEST)" etc.
498 // Try to parse the date/time part before the timezone
499 if let Some(tz_start) = date_str.rfind('(') {
500 let date_time_part = date_str[..tz_start].trim();
501
502 // Try parsing "YYYY-MM-DD HH:MM" format
503 if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_time_part, "%Y-%m-%d %H:%M") {
504 // AUR dates are in UTC, so we can treat them as UTC
505 return dt.and_utc().timestamp().into();
506 }
507
508 // Try with seconds: "YYYY-MM-DD HH:MM:SS"
509 if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_time_part, "%Y-%m-%d %H:%M:%S") {
510 return dt.and_utc().timestamp().into();
511 }
512 }
513
514 // Try ISO 8601-like format: "YYYY-MM-DD HH:MM:SS"
515 if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_str, "%Y-%m-%d %H:%M:%S") {
516 return dt.and_utc().timestamp().into();
517 }
518
519 // Try ISO 8601 format: "YYYY-MM-DDTHH:MM:SS" (with T separator)
520 if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_str, "%Y-%m-%dT%H:%M:%S") {
521 return dt.and_utc().timestamp().into();
522 }
523
524 // Try ISO 8601 with timezone: "YYYY-MM-DDTHH:MM:SSZ" or "YYYY-MM-DDTHH:MM:SS+HH:MM"
525 if let Ok(dt) = chrono::DateTime::parse_from_str(date_str, "%Y-%m-%dT%H:%M:%S%z") {
526 return Some(dt.timestamp());
527 }
528
529 // Try date-only format: "YYYY-MM-DD"
530 if let Ok(d) = chrono::NaiveDate::parse_from_str(date_str, "%Y-%m-%d")
531 && let Some(dt) = d.and_hms_opt(0, 0, 0)
532 {
533 return dt.and_utc().timestamp().into();
534 }
535
536 // Try RFC 2822 format (e.g., "Mon, 15 May 2025 03:55:00 +0000")
537 if let Ok(dt) = chrono::DateTime::parse_from_rfc2822(date_str) {
538 return Some(dt.timestamp());
539 }
540
541 // Try RFC 3339 format (e.g., "2025-05-15T03:55:00Z")
542 if let Ok(dt) = chrono::DateTime::parse_from_rfc3339(date_str) {
543 return Some(dt.timestamp());
544 }
545
546 // Try formats with different separators: "YYYY/MM/DD HH:MM"
547 if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_str, "%Y/%m/%d %H:%M") {
548 return dt.and_utc().timestamp().into();
549 }
550
551 // Try formats with different separators: "DD.MM.YYYY HH:MM"
552 if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_str, "%d.%m.%Y %H:%M") {
553 return dt.and_utc().timestamp().into();
554 }
555
556 // Try formats with different separators: "MM/DD/YYYY HH:MM"
557 if let Ok(dt) = chrono::NaiveDateTime::parse_from_str(date_str, "%m/%d/%Y %H:%M") {
558 return dt.and_utc().timestamp().into();
559 }
560
561 // Try Unix timestamp as string
562 if let Ok(ts) = date_str.parse::<i64>() {
563 // Validate it's a reasonable timestamp (between 2000 and 2100)
564 if ts > 946_684_800 && ts < 4_102_444_800 {
565 return Some(ts);
566 }
567 }
568
569 // All parsing attempts failed - log for debugging
570 debug!(
571 date_str = %date_str,
572 date_str_len = date_str.len(),
573 "Failed to parse date string to timestamp"
574 );
575 None
576}
577
578/// What: Convert HTML content to formatted text preserving markdown-like structures.
579///
580/// Inputs:
581/// - `element`: HTML element to parse
582///
583/// Output:
584/// - Formatted text string with markdown-like syntax for bold, italic, code, etc.
585///
586/// Details:
587/// - Converts HTML tags to markdown-like syntax:
588/// - `<strong>`, `<b>` → `**text**`
589/// - `<em>`, `<i>` → `*text*`
590/// - `<code>` → `` `text` ``
591/// - `<pre>` → preserves code blocks with triple backticks
592/// - `<a>` → preserves links as `[text](url)`
593/// - `<p>` → newlines between paragraphs
594fn html_to_formatted_text(element: ElementRef) -> String {
595 let mut result = String::new();
596
597 // Process paragraphs to preserve structure
598 let p_selector = Selector::parse("p").ok();
599 if let Some(ref p_sel) = p_selector {
600 let paragraphs: Vec<_> = element.select(p_sel).collect();
601 if !paragraphs.is_empty() {
602 for (i, para) in paragraphs.iter().enumerate() {
603 if i > 0 {
604 result.push('\n');
605 }
606 result.push_str(&convert_element_to_markdown(para));
607 }
608 return result.trim().to_string();
609 }
610 }
611
612 // If no paragraphs, process the whole element
613 result = convert_element_to_markdown(&element);
614 result.trim().to_string()
615}
616
617/// Convert an HTML element to markdown-like syntax by processing nested elements.
618fn convert_element_to_markdown(element: &ElementRef) -> String {
619 let html = element.html();
620 let mut working_html = html;
621
622 // Process <pre> blocks first (code blocks)
623 let pre_selector = Selector::parse("pre").ok();
624 if let Some(ref pre_sel) = pre_selector {
625 for pre in element.select(pre_sel) {
626 let text = pre.text().collect::<String>();
627 let pre_html = pre.html();
628 let replacement = format!("```\n{}\n```", text.trim());
629 working_html = working_html.replace(&pre_html, &replacement);
630 }
631 }
632
633 // Process <a> tags (links)
634 let a_selector = Selector::parse("a").ok();
635 if let Some(ref a_sel) = a_selector {
636 for link in element.select(a_sel) {
637 let text = link.text().collect::<String>().trim().to_string();
638 if let Some(href) = link.value().attr("href") {
639 let link_html = link.html();
640 let replacement = format!("[{text}]({href})");
641 working_html = working_html.replace(&link_html, &replacement);
642 }
643 }
644 }
645
646 // Process <strong> and <b> tags (bold)
647 let strong_selector = Selector::parse("strong, b").ok();
648 if let Some(ref strong_sel) = strong_selector {
649 for bold in element.select(strong_sel) {
650 let text = bold.text().collect::<String>().trim().to_string();
651 if !text.is_empty() {
652 let bold_html = bold.html();
653 let replacement = format!("**{text}**");
654 working_html = working_html.replace(&bold_html, &replacement);
655 }
656 }
657 }
658
659 // Process <em> and <i> tags (italic)
660 let em_selector = Selector::parse("em, i").ok();
661 if let Some(ref em_sel) = em_selector {
662 for italic in element.select(em_sel) {
663 let text = italic.text().collect::<String>().trim().to_string();
664 if !text.is_empty() {
665 let italic_html = italic.html();
666 let replacement = format!("*{text}*");
667 working_html = working_html.replace(&italic_html, &replacement);
668 }
669 }
670 }
671
672 // Process <code> tags
673 let code_selector = Selector::parse("code").ok();
674 if let Some(ref code_sel) = code_selector {
675 for code in element.select(code_sel) {
676 let text = code.text().collect::<String>().trim().to_string();
677 if !text.is_empty() {
678 let code_html = code.html();
679 let replacement = format!("`{text}`");
680 working_html = working_html.replace(&code_html, &replacement);
681 }
682 }
683 }
684
685 // Parse the modified HTML and extract text (this removes remaining HTML tags)
686 let temp_doc = Html::parse_fragment(&working_html);
687 let mut result = temp_doc.root_element().text().collect::<String>();
688
689 // Decode HTML entities
690 result = result
691 .replace("<", "<")
692 .replace(">", ">")
693 .replace("&", "&")
694 .replace(""", "\"")
695 .replace("'", "'")
696 .replace(" ", " ");
697
698 result
699}
700
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a calendar date for a test case; panics if the date is invalid.
    fn ymd(year: i32, month: u32, day: u32) -> chrono::NaiveDate {
        chrono::NaiveDate::from_ymd_opt(year, month, day).expect("valid test date")
    }

    /// What: Non-zero offsets must not be mapped to a timezone abbreviation.
    ///
    /// Inputs:
    /// - Dates around the US DST transitions combined with US and European offsets
    ///
    /// Output:
    /// - `get_tz_abbr_from_offset` returns `None` for every case
    ///
    /// Details:
    /// - DST transition dates vary by year and region, so guessing from the month is unreliable
    /// - Returning `None` makes the caller display a UTC offset label instead
    #[test]
    fn test_dst_affected_timezones_return_none() {
        let test_cases = [
            (ymd(2024, 3, 1), -5),   // Early March (before DST starts)
            (ymd(2024, 3, 15), -5),  // Mid March (after DST starts)
            (ymd(2024, 10, 31), -5), // Late October (DST still active)
            (ymd(2024, 11, 4), -5),  // Early November (after DST ends)
            (ymd(2024, 11, 15), -5), // Mid November (after DST ends)
            // Other US timezones
            (ymd(2024, 3, 1), -6), // Central Time
            (ymd(2024, 3, 1), -7), // Mountain Time
            (ymd(2024, 3, 1), -8), // Pacific Time
            // European timezones
            (ymd(2024, 3, 1), 1), // CET/CEST
            (ymd(2024, 3, 1), 2), // CEST/EET
        ];

        for (date, offset) in test_cases {
            let result = get_tz_abbr_from_offset(offset, date);
            assert!(
                result.is_none(),
                "Should return None for DST-affected timezones to use UTC offset format. Date: {date:?}, Offset: {offset}, Got: {result:?}"
            );
        }
    }

    /// What: Offset 0 must be labelled "UTC".
    ///
    /// Inputs:
    /// - Offset 0 with dates from the start, middle and end of a year
    ///
    /// Output:
    /// - `get_tz_abbr_from_offset` returns `Some("UTC")` for every date
    ///
    /// Details:
    /// - UTC has no DST, so the abbreviation does not depend on the date
    #[test]
    fn test_utc_returns_abbreviation() {
        for date in [ymd(2024, 1, 1), ymd(2024, 6, 15), ymd(2024, 12, 31)] {
            let result = get_tz_abbr_from_offset(0, date);
            assert_eq!(
                result,
                Some("UTC".to_string()),
                "UTC should always return 'UTC' abbreviation. Date: {date:?}, Got: {result:?}"
            );
        }
    }

    /// What: Exercise `parse_date_to_timestamp` with accepted and rejected inputs.
    ///
    /// Inputs:
    /// - AUR-style dates, ISO-like dates, alternative separators, a raw timestamp,
    ///   an empty string and an invalid string
    ///
    /// Output:
    /// - Each input parses or fails as listed in the table
    ///
    /// Details:
    /// - Also checks that the AUR format and the seconds format agree on the timestamp
    #[test]
    fn test_parse_date_to_timestamp() {
        // (input, should parse, failure message)
        let expectations = [
            // Standard AUR formats
            ("2025-04-14 11:52 (UTC)", true, "Should parse UTC format"),
            ("2025-04-14 11:52 (CEST)", true, "Should parse CEST format"),
            ("2025-04-14 11:52 (UTC+2)", true, "Should parse UTC+2 format"),
            ("2024-12-01 10:00 (UTC)", true, "Should parse December date"),
            // Edge cases
            ("", false, "Empty string should return None"),
            ("invalid date", false, "Invalid date should return None"),
            // ISO 8601 formats
            (
                "2025-04-14 11:52:30",
                true,
                "Should parse ISO 8601-like format with seconds",
            ),
            (
                "2025-04-14T11:52:30",
                true,
                "Should parse ISO 8601 format with T separator",
            ),
            // Date-only format
            ("2025-04-14", true, "Should parse date-only format"),
            // Alternative separator formats
            (
                "2025/04/14 11:52",
                true,
                "Should parse format with / separators",
            ),
            ("14.04.2025 11:52", true, "Should parse DD.MM.YYYY format"),
            ("04/14/2025 11:52", true, "Should parse MM/DD/YYYY format"),
            // Unix timestamp as string
            ("1735689600", true, "Should parse Unix timestamp string"),
        ];

        for (input, should_parse, message) in expectations {
            assert!(
                parse_date_to_timestamp(input).is_some() == should_parse,
                "{message}"
            );
        }

        // The parsed timestamp must be a plausible (positive) value
        if let Some(ts) = parse_date_to_timestamp("2025-04-14 11:52 (UTC)") {
            assert!(ts > 0, "Timestamp should be positive");
        }

        // The same instant written in two formats must give the same timestamp
        let from_aur_format = parse_date_to_timestamp("2025-04-14 11:52 (UTC)");
        let from_seconds_format = parse_date_to_timestamp("2025-04-14 11:52:00");
        assert_eq!(
            from_aur_format, from_seconds_format,
            "Same date/time should produce same timestamp regardless of format"
        );
    }
}
895}