pacsea/sources/news/
parse.rs1use crate::sources::news::utils::{extract_origin, is_arch_package_url, resolve_href};
4use ego_tree::NodeRef;
5use scraper::{ElementRef, Html, Node, Selector};
6
7pub fn parse_arch_news_html(html: &str, base_url: Option<&str>) -> String {
15 let document = Html::parse_document(html);
16 let base_origin = base_url.and_then(extract_origin);
17 let is_pkg_page = base_url.is_some_and(is_arch_package_url);
18 let selectors = [
19 Selector::parse("div.advisory").ok(),
20 Selector::parse("div.article-content").ok(),
21 Selector::parse("article").ok(),
22 ];
23
24 let mut buf = String::new();
25 let mut found = false;
26 for sel in selectors.iter().flatten() {
27 if let Some(element) = document.select(sel).next()
28 && let Some(node) = document.tree.get(element.id())
29 {
30 let preserve_ws = element
31 .value()
32 .attr("class")
33 .is_some_and(|c| c.contains("advisory"));
34 render_node(&mut buf, node, false, preserve_ws, base_origin.as_deref());
35 found = true;
36 break;
37 }
38 }
39 if !found && let Some(root) = document.tree.get(document.root_element().id()) {
40 render_node(&mut buf, root, false, false, base_origin.as_deref());
41 }
42
43 let main = prune_news_boilerplate(&buf);
44 if !is_pkg_page {
45 return main;
46 }
47
48 let meta_block = extract_package_metadata(&document, base_origin.as_deref());
49 if meta_block.is_empty() {
50 return main;
51 }
52
53 let mut combined = String::new();
54 combined.push_str("Package Info:\n");
55 for line in meta_block {
56 combined.push_str(&line);
57 combined.push('\n');
58 }
59 combined.push('\n');
60 combined.push_str(&main);
61 combined
62}
63
64fn render_node(
72 buf: &mut String,
73 node: NodeRef<Node>,
74 in_pre: bool,
75 preserve_ws: bool,
76 base_origin: Option<&str>,
77) {
78 match node.value() {
79 Node::Text(t) => push_text(buf, t.as_ref(), in_pre, preserve_ws),
80 Node::Element(el) => {
81 let name = el.name();
82 let is_block = matches!(
83 name,
84 "p" | "div"
85 | "section"
86 | "article"
87 | "header"
88 | "footer"
89 | "main"
90 | "table"
91 | "tr"
92 | "td"
93 );
94 let is_list = matches!(name, "ul" | "ol");
95 let is_li = name == "li";
96 let is_br = name == "br";
97 let is_pre_tag = name == "pre";
98 let is_code = name == "code";
99 let is_anchor = name == "a";
100
101 if is_block && !buf.ends_with('\n') {
102 buf.push('\n');
103 }
104 if is_li {
105 if !buf.ends_with('\n') {
106 buf.push('\n');
107 }
108 buf.push_str("• ");
109 }
110 if is_br {
111 buf.push('\n');
112 }
113
114 if is_anchor {
115 let mut tmp = String::new();
116 for child in node.children() {
117 render_node(&mut tmp, child, in_pre, preserve_ws, base_origin);
118 }
119 let label = tmp.trim();
120 let href = el
121 .attr("href")
122 .map(str::trim)
123 .filter(|h| !h.is_empty())
124 .unwrap_or_default();
125 if !href.is_empty() {
126 if !buf.ends_with('\n') && !buf.ends_with(' ') {
127 buf.push(' ');
128 }
129 if label.is_empty() {
130 buf.push_str(&resolve_href(href, base_origin));
131 } else {
132 buf.push_str(label);
133 buf.push(' ');
134 buf.push('(');
135 buf.push_str(&resolve_href(href, base_origin));
136 buf.push(')');
137 }
138 } else if !label.is_empty() {
139 buf.push_str(label);
140 }
141 return;
142 }
143
144 if is_code {
145 let mut tmp = String::new();
146 for child in node.children() {
147 render_node(&mut tmp, child, in_pre, preserve_ws, base_origin);
148 }
149 if !tmp.is_empty() {
150 if !buf.ends_with('`') {
151 buf.push('`');
152 }
153 buf.push_str(tmp.trim());
154 buf.push('`');
155 }
156 return;
157 }
158
159 if is_pre_tag {
160 if !buf.ends_with('\n') {
161 buf.push('\n');
162 }
163 let mut tmp = String::new();
164 for child in node.children() {
165 render_node(&mut tmp, child, true, preserve_ws, base_origin);
166 }
167 buf.push_str(tmp.trim_end());
168 buf.push('\n');
169 return;
170 }
171
172 let next_pre = in_pre;
173 for child in node.children() {
174 render_node(buf, child, next_pre, preserve_ws, base_origin);
175 }
176
177 if is_block || is_list || is_li {
178 if !buf.ends_with('\n') {
179 buf.push('\n');
180 }
181 if !buf.ends_with("\n\n") {
182 buf.push('\n');
183 }
184 }
185 }
186 _ => {}
187 }
188}
189
/// Append the content of a text node to `buf`.
///
/// When either `in_pre` or `preserve_ws` is set, `text` is appended verbatim.
/// Otherwise every run of whitespace is collapsed to a single ASCII space,
/// and a run at the start of `text` is dropped entirely when `buf` already
/// ends with a space.
fn push_text(buf: &mut String, text: &str, in_pre: bool, preserve_ws: bool) {
    if in_pre || preserve_ws {
        buf.push_str(text);
        return;
    }

    let mut prev_space = buf.ends_with(' ');
    for ch in text.chars() {
        let is_ws = ch.is_whitespace();
        // Only the first whitespace character of a run produces output.
        if !(is_ws && prev_space) {
            buf.push(if is_ws { ' ' } else { ch });
        }
        prev_space = is_ws;
    }
}
224
225pub fn prune_news_boilerplate(text: &str) -> String {
233 let lines: Vec<&str> = text.lines().collect();
234 let date_idx = lines.iter().position(|l| {
236 let t = l.trim();
237 t.len() >= 10
238 && t.as_bytes().get(4) == Some(&b'-')
239 && t.as_bytes().get(7) == Some(&b'-')
240 && t[..4].chars().all(|c| c.is_ascii_digit())
241 && t[5..7].chars().all(|c| c.is_ascii_digit())
242 && t[8..10].chars().all(|c| c.is_ascii_digit())
243 });
244
245 if let Some(idx) = date_idx {
246 let mut out: Vec<&str> = lines.iter().skip(idx + 1).map(|s| s.trim_end()).collect();
248 while matches!(out.first(), Some(l) if l.trim().is_empty()) {
250 out.remove(0);
251 }
252 if let Some(c_idx) = out.iter().position(|l| l.contains("Copyright \u{00a9}")) {
254 out.truncate(c_idx);
255 }
256 out.retain(|l| {
258 let t = l.trim();
259 !(t.starts_with("The Arch Linux name and logo")
260 || t.starts_with("trademarks.")
261 || t.starts_with("The registered trademark")
262 || t.starts_with("Linux\u{00ae} is used")
263 || t.starts_with("the exclusive licensee"))
264 });
265 return collapse_blank_lines(&out);
266 }
267
268 let mut start = lines
270 .iter()
271 .position(|l| {
272 let t = l.trim();
273 t.starts_with("Arch Linux Security Advisory")
274 || t.starts_with("Severity:")
275 || t.starts_with("CVE-")
276 })
277 .unwrap_or(0);
278 while start < lines.len() && {
279 let t = lines[start].trim();
280 t.is_empty() || t.starts_with('•') || t == "Arch Linux"
281 } {
282 start += 1;
283 }
284 let mut out: Vec<&str> = lines
285 .iter()
286 .skip(start)
287 .map(|s| s.trim_end_matches('\r'))
288 .collect();
289 while matches!(out.first(), Some(l) if l.trim().is_empty() || l.trim().starts_with('•')) {
290 out.remove(0);
291 }
292 collapse_blank_lines(&out)
293}
294
/// Join `lines` with `\n`, tidying up blank lines on the way.
///
/// Trailing whitespace is removed from every line, consecutive blank
/// (whitespace-only) lines are reduced to a single empty line, and blank
/// lines at the very end are dropped. A single leading blank line is kept.
pub fn collapse_blank_lines(lines: &[&str]) -> String {
    let mut kept: Vec<&str> = Vec::with_capacity(lines.len());
    for line in lines.iter().map(|l| l.trim_end()) {
        // After `trim_end`, a whitespace-only line is exactly "".
        let repeats_blank = line.is_empty() && kept.last().is_some_and(|prev| prev.is_empty());
        if !repeats_blank {
            kept.push(line);
        }
    }
    while kept.last().is_some_and(|l| l.is_empty()) {
        kept.pop();
    }
    kept.join("\n")
}
312
313fn extract_package_metadata(document: &Html, base_origin: Option<&str>) -> Vec<String> {
315 let wanted = [
316 "Upstream URL",
317 "License(s)",
318 "Maintainers",
319 "Package Size",
320 "Installed Size",
321 "Last Packager",
322 "Build Date",
323 ];
324 let wanted_set: std::collections::HashSet<&str> = wanted.into_iter().collect();
325 let row_sel = Selector::parse("tr").ok();
326 let th_sel = Selector::parse("th").ok();
327 let td_selector = Selector::parse("td").ok();
328 let dt_sel = Selector::parse("dt").ok();
329 let dd_selector = Selector::parse("dd").ok();
330 let mut fields: Vec<(String, String)> = Vec::new();
331 if let (Some(row_sel), Some(th_sel), Some(td_sel)) = (row_sel, th_sel, td_selector) {
332 for tr in document.select(&row_sel) {
333 let th_text = normalize_label(
334 &tr.select(&th_sel)
335 .next()
336 .map(|th| th.text().collect::<String>())
337 .unwrap_or_default(),
338 );
339 if !wanted_set.contains(th_text.as_str()) {
340 continue;
341 }
342 if let Some(td) = tr.select(&td_sel).next() {
343 let value = extract_inline(&td, base_origin);
344 if !value.is_empty() {
345 fields.push((th_text, value));
346 }
347 }
348 }
349 }
350 if let (Some(dt_sel), Some(_dd_sel)) = (dt_sel, dd_selector) {
351 for dt in document.select(&dt_sel) {
352 let label = normalize_label(&dt.text().collect::<String>());
353 if !wanted_set.contains(label.as_str()) {
354 continue;
355 }
356 if let Some(dd) = dt
358 .next_sibling()
359 .and_then(ElementRef::wrap)
360 .filter(|sib| sib.value().name() == "dd")
361 .or_else(|| dt.next_siblings().find_map(ElementRef::wrap))
362 {
363 let value = extract_inline(&dd, base_origin);
364 if !value.is_empty() {
365 fields.push((label, value));
366 }
367 }
368 }
369 }
370 fields
371 .into_iter()
372 .map(|(k, v)| format!("{k}: {v}"))
373 .collect()
374}
375
376fn extract_inline(node: &NodeRef<Node>, base_origin: Option<&str>) -> String {
378 let mut parts: Vec<String> = Vec::new();
379 for child in node.children() {
380 match child.value() {
381 Node::Text(t) => {
382 let text = t.trim();
383 if !text.is_empty() {
384 parts.push(text.to_string());
385 }
386 }
387 Node::Element(el) => {
388 if el.name() == "a" {
389 let label = ElementRef::wrap(child)
390 .map(|e| e.text().collect::<String>())
391 .unwrap_or_default()
392 .trim()
393 .to_string();
394 let href = el
395 .attr("href")
396 .map(str::trim)
397 .filter(|h| !h.is_empty())
398 .map(|h| resolve_href(h, base_origin))
399 .unwrap_or_default();
400 if !label.is_empty() && !href.is_empty() {
401 parts.push(format!("{label} ({href})"));
402 } else if !label.is_empty() {
403 parts.push(label);
404 } else if !href.is_empty() {
405 parts.push(href);
406 }
407 } else {
408 let inline = extract_inline(&child, base_origin);
409 if !inline.is_empty() {
410 parts.push(inline);
411 }
412 }
413 }
414 _ => {}
415 }
416 }
417 parts
418 .join(" ")
419 .split_whitespace()
420 .collect::<Vec<_>>()
421 .join(" ")
422}
423
/// Normalise a metadata label: strip surrounding whitespace and any trailing
/// colons (plus whitespace that preceded them).
fn normalize_label(raw: &str) -> String {
    let without_colons = raw.trim_end().trim_end_matches(':');
    without_colons.trim().to_owned()
}