pacsea/sources/news/
parse.rs

1//! HTML parsing and rendering for news content.
2
3use crate::sources::news::utils::{extract_origin, is_arch_package_url, resolve_href};
4use ego_tree::NodeRef;
5use scraper::{ElementRef, Html, Node, Selector};
6
7/// What: Parse Arch Linux news HTML and extract article text using `scraper`.
8///
9/// Inputs:
10/// - `html`: Raw HTML content of the news page.
11///
12/// Output:
13/// - Extracted article text with formatting preserved (paragraphs, bullets, code markers).
14pub fn parse_arch_news_html(html: &str, base_url: Option<&str>) -> String {
15    let document = Html::parse_document(html);
16    let base_origin = base_url.and_then(extract_origin);
17    let is_pkg_page = base_url.is_some_and(is_arch_package_url);
18    let selectors = [
19        Selector::parse("div.advisory").ok(),
20        Selector::parse("div.article-content").ok(),
21        Selector::parse("article").ok(),
22    ];
23
24    let mut buf = String::new();
25    let mut found = false;
26    for sel in selectors.iter().flatten() {
27        if let Some(element) = document.select(sel).next()
28            && let Some(node) = document.tree.get(element.id())
29        {
30            let preserve_ws = element
31                .value()
32                .attr("class")
33                .is_some_and(|c| c.contains("advisory"));
34            render_node(&mut buf, node, false, preserve_ws, base_origin.as_deref());
35            found = true;
36            break;
37        }
38    }
39    if !found && let Some(root) = document.tree.get(document.root_element().id()) {
40        render_node(&mut buf, root, false, false, base_origin.as_deref());
41    }
42
43    let main = prune_news_boilerplate(&buf);
44    if !is_pkg_page {
45        return main;
46    }
47
48    let meta_block = extract_package_metadata(&document, base_origin.as_deref());
49    if meta_block.is_empty() {
50        return main;
51    }
52
53    let mut combined = String::new();
54    combined.push_str("Package Info:\n");
55    for line in meta_block {
56        combined.push_str(&line);
57        combined.push('\n');
58    }
59    combined.push('\n');
60    combined.push_str(&main);
61    combined
62}
63
64/// What: Render a node (and children) into text while preserving basic formatting.
65///
66/// Inputs:
67/// - `buf`: Output buffer to append text into
68/// - `node`: Node to render
69/// - `in_pre`: Whether we are inside a <pre> block (preserve whitespace)
70/// - `preserve_ws`: Whether to avoid collapsing whitespace (advisory pages).
71fn render_node(
72    buf: &mut String,
73    node: NodeRef<Node>,
74    in_pre: bool,
75    preserve_ws: bool,
76    base_origin: Option<&str>,
77) {
78    match node.value() {
79        Node::Text(t) => push_text(buf, t.as_ref(), in_pre, preserve_ws),
80        Node::Element(el) => {
81            let name = el.name();
82            let is_block = matches!(
83                name,
84                "p" | "div"
85                    | "section"
86                    | "article"
87                    | "header"
88                    | "footer"
89                    | "main"
90                    | "table"
91                    | "tr"
92                    | "td"
93            );
94            let is_list = matches!(name, "ul" | "ol");
95            let is_li = name == "li";
96            let is_br = name == "br";
97            let is_pre_tag = name == "pre";
98            let is_code = name == "code";
99            let is_anchor = name == "a";
100
101            if is_block && !buf.ends_with('\n') {
102                buf.push('\n');
103            }
104            if is_li {
105                if !buf.ends_with('\n') {
106                    buf.push('\n');
107                }
108                buf.push_str("• ");
109            }
110            if is_br {
111                buf.push('\n');
112            }
113
114            if is_anchor {
115                let mut tmp = String::new();
116                for child in node.children() {
117                    render_node(&mut tmp, child, in_pre, preserve_ws, base_origin);
118                }
119                let label = tmp.trim();
120                let href = el
121                    .attr("href")
122                    .map(str::trim)
123                    .filter(|h| !h.is_empty())
124                    .unwrap_or_default();
125                if !href.is_empty() {
126                    if !buf.ends_with('\n') && !buf.ends_with(' ') {
127                        buf.push(' ');
128                    }
129                    if label.is_empty() {
130                        buf.push_str(&resolve_href(href, base_origin));
131                    } else {
132                        buf.push_str(label);
133                        buf.push(' ');
134                        buf.push('(');
135                        buf.push_str(&resolve_href(href, base_origin));
136                        buf.push(')');
137                    }
138                } else if !label.is_empty() {
139                    buf.push_str(label);
140                }
141                return;
142            }
143
144            if is_code {
145                let mut tmp = String::new();
146                for child in node.children() {
147                    render_node(&mut tmp, child, in_pre, preserve_ws, base_origin);
148                }
149                if !tmp.is_empty() {
150                    if !buf.ends_with('`') {
151                        buf.push('`');
152                    }
153                    buf.push_str(tmp.trim());
154                    buf.push('`');
155                }
156                return;
157            }
158
159            if is_pre_tag {
160                if !buf.ends_with('\n') {
161                    buf.push('\n');
162                }
163                let mut tmp = String::new();
164                for child in node.children() {
165                    render_node(&mut tmp, child, true, preserve_ws, base_origin);
166                }
167                buf.push_str(tmp.trim_end());
168                buf.push('\n');
169                return;
170            }
171
172            let next_pre = in_pre;
173            for child in node.children() {
174                render_node(buf, child, next_pre, preserve_ws, base_origin);
175            }
176
177            if is_block || is_list || is_li {
178                if !buf.ends_with('\n') {
179                    buf.push('\n');
180                }
181                if !buf.ends_with("\n\n") {
182                    buf.push('\n');
183                }
184            }
185        }
186        _ => {}
187    }
188}
189
/// What: Append text content to buffer, preserving whitespace when in <pre>, otherwise collapsing runs.
///
/// Inputs:
/// - `buf`: Output buffer to append into.
/// - `text`: Text content from the node.
/// - `in_pre`: Whether whitespace should be preserved (inside `<pre>`).
/// - `preserve_ws`: Whether to avoid collapsing whitespace for advisory pages.
///
/// Output:
/// - Mutates `buf` with appended text respecting whitespace rules.
fn push_text(buf: &mut String, text: &str, in_pre: bool, preserve_ws: bool) {
    // Either flag means the text is copied verbatim.
    if in_pre || preserve_ws {
        buf.push_str(text);
        return;
    }

    // Every whitespace run becomes one space; a run that continues a space already at
    // the end of `buf` adds nothing. Newlines already in `buf` are left alone.
    let mut prev_space = buf.ends_with(' ');
    for ch in text.chars() {
        let is_ws = ch.is_whitespace();
        if !(is_ws && prev_space) {
            buf.push(if is_ws { ' ' } else { ch });
        }
        prev_space = is_ws;
    }
}
224
225/// What: Remove Arch news boilerplate (nav/header) from extracted text.
226///
227/// Inputs:
228/// - `text`: Plain text extracted from the news HTML.
229///
230/// Output:
231/// - Text with leading navigation/header lines removed, starting after the date line when found.
232pub fn prune_news_boilerplate(text: &str) -> String {
233    let lines: Vec<&str> = text.lines().collect();
234    // Find a date line like YYYY-MM-DD ...
235    let date_idx = lines.iter().position(|l| {
236        let t = l.trim();
237        t.len() >= 10
238            && t.as_bytes().get(4) == Some(&b'-')
239            && t.as_bytes().get(7) == Some(&b'-')
240            && t[..4].chars().all(|c| c.is_ascii_digit())
241            && t[5..7].chars().all(|c| c.is_ascii_digit())
242            && t[8..10].chars().all(|c| c.is_ascii_digit())
243    });
244
245    if let Some(idx) = date_idx {
246        // Take everything after the date line
247        let mut out: Vec<&str> = lines.iter().skip(idx + 1).map(|s| s.trim_end()).collect();
248        // Drop leading empty lines
249        while matches!(out.first(), Some(l) if l.trim().is_empty()) {
250            out.remove(0);
251        }
252        // Drop footer/copyright block if present
253        if let Some(c_idx) = out.iter().position(|l| l.contains("Copyright \u{00a9}")) {
254            out.truncate(c_idx);
255        }
256        // Also drop known footer lines
257        out.retain(|l| {
258            let t = l.trim();
259            !(t.starts_with("The Arch Linux name and logo")
260                || t.starts_with("trademarks.")
261                || t.starts_with("The registered trademark")
262                || t.starts_with("Linux\u{00ae} is used")
263                || t.starts_with("the exclusive licensee"))
264        });
265        return collapse_blank_lines(&out);
266    }
267
268    // Advisory pages don't match the date format; drop leading navigation until the first meaningful header
269    let mut start = lines
270        .iter()
271        .position(|l| {
272            let t = l.trim();
273            t.starts_with("Arch Linux Security Advisory")
274                || t.starts_with("Severity:")
275                || t.starts_with("CVE-")
276        })
277        .unwrap_or(0);
278    while start < lines.len() && {
279        let t = lines[start].trim();
280        t.is_empty() || t.starts_with('•') || t == "Arch Linux"
281    } {
282        start += 1;
283    }
284    let mut out: Vec<&str> = lines
285        .iter()
286        .skip(start)
287        .map(|s| s.trim_end_matches('\r'))
288        .collect();
289    while matches!(out.first(), Some(l) if l.trim().is_empty() || l.trim().starts_with('•')) {
290        out.remove(0);
291    }
292    collapse_blank_lines(&out)
293}
294
/// What: Collapse multiple consecutive blank lines into a single blank line and trim trailing blanks.
///
/// Inputs:
/// - `lines`: Lines to join; trailing whitespace on each line is removed.
///
/// Output:
/// - The lines joined with `\n`, with runs of blank lines reduced to one and no blank
///   lines at the end. A single leading blank line is kept.
pub fn collapse_blank_lines(lines: &[&str]) -> String {
    let mut kept: Vec<&str> = Vec::with_capacity(lines.len());
    let mut prev_blank = false;
    for line in lines.iter().copied().map(str::trim_end) {
        // After trimming the end, a whitespace-only line is exactly the empty string.
        let is_blank = line.is_empty();
        if !(is_blank && prev_blank) {
            kept.push(line);
            prev_blank = is_blank;
        }
    }

    // Cut everything after the last non-blank line.
    let end = kept
        .iter()
        .rposition(|line| !line.is_empty())
        .map_or(0, |last| last + 1);
    kept.truncate(end);
    kept.join("\n")
}
312
313/// What: Extract selected metadata fields from an Arch package HTML page.
314fn extract_package_metadata(document: &Html, base_origin: Option<&str>) -> Vec<String> {
315    let wanted = [
316        "Upstream URL",
317        "License(s)",
318        "Maintainers",
319        "Package Size",
320        "Installed Size",
321        "Last Packager",
322        "Build Date",
323    ];
324    let wanted_set: std::collections::HashSet<&str> = wanted.into_iter().collect();
325    let row_sel = Selector::parse("tr").ok();
326    let th_sel = Selector::parse("th").ok();
327    let td_selector = Selector::parse("td").ok();
328    let dt_sel = Selector::parse("dt").ok();
329    let dd_selector = Selector::parse("dd").ok();
330    let mut fields: Vec<(String, String)> = Vec::new();
331    if let (Some(row_sel), Some(th_sel), Some(td_sel)) = (row_sel, th_sel, td_selector) {
332        for tr in document.select(&row_sel) {
333            let th_text = normalize_label(
334                &tr.select(&th_sel)
335                    .next()
336                    .map(|th| th.text().collect::<String>())
337                    .unwrap_or_default(),
338            );
339            if !wanted_set.contains(th_text.as_str()) {
340                continue;
341            }
342            if let Some(td) = tr.select(&td_sel).next() {
343                let value = extract_inline(&td, base_origin);
344                if !value.is_empty() {
345                    fields.push((th_text, value));
346                }
347            }
348        }
349    }
350    if let (Some(dt_sel), Some(_dd_sel)) = (dt_sel, dd_selector) {
351        for dt in document.select(&dt_sel) {
352            let label = normalize_label(&dt.text().collect::<String>());
353            if !wanted_set.contains(label.as_str()) {
354                continue;
355            }
356            // Prefer the immediate following sibling <dd>
357            if let Some(dd) = dt
358                .next_sibling()
359                .and_then(ElementRef::wrap)
360                .filter(|sib| sib.value().name() == "dd")
361                .or_else(|| dt.next_siblings().find_map(ElementRef::wrap))
362            {
363                let value = extract_inline(&dd, base_origin);
364                if !value.is_empty() {
365                    fields.push((label, value));
366                }
367            }
368        }
369    }
370    fields
371        .into_iter()
372        .map(|(k, v)| format!("{k}: {v}"))
373        .collect()
374}
375
376/// What: Extract inline text (with resolved links) from a node subtree.
377fn extract_inline(node: &NodeRef<Node>, base_origin: Option<&str>) -> String {
378    let mut parts: Vec<String> = Vec::new();
379    for child in node.children() {
380        match child.value() {
381            Node::Text(t) => {
382                let text = t.trim();
383                if !text.is_empty() {
384                    parts.push(text.to_string());
385                }
386            }
387            Node::Element(el) => {
388                if el.name() == "a" {
389                    let label = ElementRef::wrap(child)
390                        .map(|e| e.text().collect::<String>())
391                        .unwrap_or_default()
392                        .trim()
393                        .to_string();
394                    let href = el
395                        .attr("href")
396                        .map(str::trim)
397                        .filter(|h| !h.is_empty())
398                        .map(|h| resolve_href(h, base_origin))
399                        .unwrap_or_default();
400                    if !label.is_empty() && !href.is_empty() {
401                        parts.push(format!("{label} ({href})"));
402                    } else if !label.is_empty() {
403                        parts.push(label);
404                    } else if !href.is_empty() {
405                        parts.push(href);
406                    }
407                } else {
408                    let inline = extract_inline(&child, base_origin);
409                    if !inline.is_empty() {
410                        parts.push(inline);
411                    }
412                }
413            }
414            _ => {}
415        }
416    }
417    parts
418        .join(" ")
419        .split_whitespace()
420        .collect::<Vec<_>>()
421        .join(" ")
422}
423
/// What: Normalize table/header labels for matching (trim and drop trailing colon).
///
/// Inputs:
/// - `raw`: Label text as found in the page.
///
/// Output:
/// - The label without surrounding whitespace and without any trailing `:` characters.
fn normalize_label(raw: &str) -> String {
    let without_colons = raw.trim().trim_end_matches(':');
    // Whitespace that sat between the label and the colon is removed as well.
    without_colons.trim().to_owned()
}
pacsea/sources/news/parse.rs

pacsea/sources/news/
parse.rs