From 976cb8d85d43a94e3552c5a832ca70b070359f41 Mon Sep 17 00:00:00 2001 From: Bauke Date: Mon, 3 Oct 2022 16:25:28 +0200 Subject: [PATCH] Add group parsing. --- source/group.rs | 87 +++++++++++++++++++++++ source/lib.rs | 6 +- source/regexes.rs | 4 ++ source/selectors.rs | 15 ++++ source/utilities.rs | 13 ++++ tests/group.rs | 11 +++ tests/samples/group.html | 46 ++++++++++++ tests/snapshots/group__group_parsing.snap | 24 +++++++ 8 files changed, 205 insertions(+), 1 deletion(-) create mode 100644 source/group.rs create mode 100644 tests/group.rs create mode 100644 tests/samples/group.html create mode 100644 tests/snapshots/group__group_parsing.snap diff --git a/source/group.rs b/source/group.rs new file mode 100644 index 0000000..144c1a8 --- /dev/null +++ b/source/group.rs @@ -0,0 +1,87 @@ +//! Parsing for group pages (`/~`). + +use {color_eyre::Result, scraper::Html}; + +use crate::{ + regexes::GROUP_SUBSCRIBERS_RE, + selectors::{ + GROUP_DESCRIPTION, GROUP_NAME, GROUP_SUBSCRIBERS, GROUP_SUB_GROUP_LINKS, + GROUP_WIKI_LINKS, + }, + utilities::{ + extract_anchor_values, parse_regex_match, select_first_element_text, + }, +}; + +/// A group's information. +#[derive(Debug)] +pub struct Group { + /// The group description. + pub description: Option, + + /// The group name, including leading tilde. + pub name: String, + + /// Names of sub-groups. + pub sub_groups: Vec, + + /// The number of subscribers. + pub subscribers: i32, + + /// Links to wiki pages. + pub wiki_links: Vec, +} + +/// A group's wiki link. +#[derive(Debug)] +pub struct GroupWikiLink { + /// The name of the wiki page. + pub name: String, + + /// The URL to the wiki page. + pub url: String, +} + +impl Group { + /// Parses a [`Group`] from a [`scraper::Html`] tree. Panics instead of returning `Err` if the group name or subscriber count is missing or unparsable. 
+ pub fn from_html(html: &Html) -> Result { + let description = + select_first_element_text(html.root_element(), &GROUP_DESCRIPTION); + + let name = + select_first_element_text(html.root_element(), &GROUP_NAME).unwrap(); + + let subscribers = parse_regex_match( + GROUP_SUBSCRIBERS_RE + .captures_iter( + &select_first_element_text(html.root_element(), &GROUP_SUBSCRIBERS) + .unwrap(), + ) + .next() + .unwrap() + .name("count"), + ) + .unwrap(); + + let sub_groups = html + .select(&GROUP_SUB_GROUP_LINKS) + .map(|element| extract_anchor_values(element).0) + .collect(); + + let wiki_links = html + .select(&GROUP_WIKI_LINKS) + .map(|element| { + let (name, url) = extract_anchor_values(element); + GroupWikiLink { name, url } + }) + .collect(); + + Ok(Self { + description, + name, + sub_groups, + subscribers, + wiki_links, + }) + } +} diff --git a/source/lib.rs b/source/lib.rs index 707305e..2ff33ca 100644 --- a/source/lib.rs +++ b/source/lib.rs @@ -24,6 +24,10 @@ pub mod regexes; pub mod selectors; pub mod utilities; +pub(crate) mod group; pub(crate) mod group_list; -pub use group_list::{GroupList, GroupListSummary}; +pub use { + group::Group, + group_list::{GroupList, GroupListSummary}, +}; diff --git a/source/regexes.rs b/source/regexes.rs index e88f491..42f35d1 100644 --- a/source/regexes.rs +++ b/source/regexes.rs @@ -13,6 +13,10 @@ lazy_static! { /// ``` pub static ref DUPLICATE_WHITESPACE_RE: Regex = Regex::new(r"\s\s+").unwrap(); + /// Regular expression for extracting the group subscriber count (read via the `count` capture). + pub static ref GROUP_SUBSCRIBERS_RE: Regex = + Regex::new(r"(?P\d+) subscribers").unwrap(); + /// Regular expression for extracting group list activity text. pub static ref GROUP_LIST_ACTIVITY_RE: Regex = { Regex::new(concat!( diff --git a/source/selectors.rs b/source/selectors.rs index 5c453dd..dddf5f6 100644 --- a/source/selectors.rs +++ b/source/selectors.rs @@ -5,6 +5,9 @@ use {lazy_static::lazy_static, scraper::Selector}; use crate::utilities::selector; lazy_static! 
{ + /// Selector for the group description. + pub static ref GROUP_DESCRIPTION: Selector = selector(".group-short-description"); + /// Selector for links to Tildes groups. pub static ref GROUP_LINK: Selector = selector(".link-group"); @@ -13,4 +16,16 @@ lazy_static! { /// Selector for the description section in group list items. pub static ref GROUP_LIST_DESCRIPTION: Selector = selector(".group-list-description"); + + /// Selector for the group name. + pub static ref GROUP_NAME: Selector = selector("#sidebar h3"); + + /// Selector for the group subscriber count. + pub static ref GROUP_SUBSCRIBERS: Selector = selector(".group-subscription-count"); + + /// Selector for sub-group links in the sidebar. + pub static ref GROUP_SUB_GROUP_LINKS: Selector = selector(r#"#sidebar .link-group"#); + + /// Selector for group wiki links. + pub static ref GROUP_WIKI_LINKS: Selector = selector(r#"#sidebar [href*="/wiki/"]"#); } diff --git a/source/utilities.rs b/source/utilities.rs index 156d76a..30210f4 100644 --- a/source/utilities.rs +++ b/source/utilities.rs @@ -7,6 +7,19 @@ use { scraper::{ElementRef, Selector}, }; +use crate::regexes::DUPLICATE_WHITESPACE_RE; + +/// Shorthand to extract the whitespace-collapsed, trimmed text and the `href` value from an anchor element; panics if `href` is absent. +pub fn extract_anchor_values(anchor: ElementRef) -> (String, String) { + let name = DUPLICATE_WHITESPACE_RE + .replace_all(&anchor.text().collect::(), " ") + .trim() + .to_string(); + let href = anchor.value().attr("href").unwrap().to_string(); + + (name, href) +} + /// Shorthand to parse a [`regex::Match`] with [`std::str::FromStr`]. 
pub fn parse_regex_match(regex_match: Option) -> Option { regex_match.and_then(|regex_match| regex_match.as_str().parse::().ok()) diff --git a/tests/group.rs b/tests/group.rs new file mode 100644 index 0000000..8d9ce4f --- /dev/null +++ b/tests/group.rs @@ -0,0 +1,11 @@ +use std::fs::read_to_string; + +use {insta::assert_debug_snapshot, scraper::Html, tildes_parser::Group}; + +#[test] +fn test_group_parsing() { + let html = read_to_string("tests/samples/group.html").unwrap(); + let html = Html::parse_document(&html); + let group = Group::from_html(&html).unwrap(); + assert_debug_snapshot!(group); +} diff --git a/tests/samples/group.html b/tests/samples/group.html new file mode 100644 index 0000000..8fcef9d --- /dev/null +++ b/tests/samples/group.html @@ -0,0 +1,46 @@ + + + + + Sample for group.rs + + + + + + + diff --git a/tests/snapshots/group__group_parsing.snap b/tests/snapshots/group__group_parsing.snap new file mode 100644 index 0000000..2240711 --- /dev/null +++ b/tests/snapshots/group__group_parsing.snap @@ -0,0 +1,24 @@ +--- +source: tests/group.rs +expression: group +--- +Group { + description: Some( + "Group description.", + ), + name: "~group", + sub_groups: [ + "~example.sub", + ], + subscribers: 12345, + wiki_links: [ + GroupWikiLink { + name: "index", + url: "https://example.org/~example/wiki/index", + }, + GroupWikiLink { + name: "Example Page", + url: "https://example.org/~example/wiki/example_page", + }, + ], +}