2022-09-29 23:29:46 +00:00
|
|
|
//! Miscellaneous parsing utilities.
|
|
|
|
|
|
|
|
use std::str::FromStr;
|
|
|
|
|
|
|
|
use {
|
|
|
|
regex::Match,
|
|
|
|
scraper::{ElementRef, Selector},
|
|
|
|
};
|
|
|
|
|
2023-06-09 19:03:38 +00:00
|
|
|
use crate::{regexes::DUPLICATE_WHITESPACE_RE, ParseError};
|
2022-10-03 14:25:28 +00:00
|
|
|
|
|
|
|
/// Shorthand to extract the text and `href` values from an anchor element.
|
2023-06-09 19:03:38 +00:00
|
|
|
pub fn extract_anchor_values(
|
|
|
|
anchor: ElementRef,
|
|
|
|
) -> Result<(String, String), ParseError> {
|
2022-10-03 14:25:28 +00:00
|
|
|
let name = DUPLICATE_WHITESPACE_RE
|
|
|
|
.replace_all(&anchor.text().collect::<String>(), " ")
|
|
|
|
.trim()
|
|
|
|
.to_string();
|
2023-06-09 19:03:38 +00:00
|
|
|
let href = anchor
|
|
|
|
.value()
|
|
|
|
.attr("href")
|
|
|
|
.ok_or(ParseError::MissingExpectedHtml)?
|
|
|
|
.to_string();
|
2022-10-03 14:25:28 +00:00
|
|
|
|
2023-06-09 19:03:38 +00:00
|
|
|
Ok((name, href))
|
2022-10-03 14:25:28 +00:00
|
|
|
}
|
|
|
|
|
2022-09-29 23:29:46 +00:00
|
|
|
/// Shorthand to parse a [`regex::Match`] with [`std::str::FromStr`].
|
|
|
|
pub fn parse_regex_match<T: FromStr>(regex_match: Option<Match>) -> Option<T> {
|
|
|
|
regex_match.and_then(|regex_match| regex_match.as_str().parse::<T>().ok())
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the text of the first found element inside the given `parent`
|
|
|
|
/// element.
|
|
|
|
pub fn select_first_element_text(
|
|
|
|
parent: ElementRef,
|
|
|
|
selector: &Selector,
|
|
|
|
) -> Option<String> {
|
|
|
|
parent
|
|
|
|
.select(selector)
|
|
|
|
.next()
|
2022-10-03 13:31:26 +00:00
|
|
|
.map(|element| element.text().collect::<String>())
|
|
|
|
.map(|text| text.trim().to_string())
|
2022-09-29 23:29:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Shorthand for creating a [`scraper::Selector`].
|
|
|
|
pub fn selector(selector: &str) -> Selector {
|
|
|
|
Selector::parse(selector).unwrap()
|
|
|
|
}
|