Add group parsing.
This commit is contained in:
parent
457d8329ee
commit
976cb8d85d
|
@ -0,0 +1,87 @@
|
||||||
|
//! Parsing for `/~<group>`.
|
||||||
|
|
||||||
|
use {color_eyre::Result, scraper::Html};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
regexes::GROUP_SUBSCRIBERS_RE,
|
||||||
|
selectors::{
|
||||||
|
GROUP_DESCRIPTION, GROUP_NAME, GROUP_SUBSCRIBERS, GROUP_SUB_GROUP_LINKS,
|
||||||
|
GROUP_WIKI_LINKS,
|
||||||
|
},
|
||||||
|
utilities::{
|
||||||
|
extract_anchor_values, parse_regex_match, select_first_element_text,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
/// A group's information.
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct Group {
|
||||||
|
/// The group description.
|
||||||
|
pub description: Option<String>,
|
||||||
|
|
||||||
|
/// The group name, including leading tilde.
|
||||||
|
pub name: String,
|
||||||
|
|
||||||
|
/// Names of sub-groups.
|
||||||
|
pub sub_groups: Vec<String>,
|
||||||
|
|
||||||
|
/// The amount of subscribers.
|
||||||
|
pub subscribers: i32,
|
||||||
|
|
||||||
|
/// Links to wiki pages.
|
||||||
|
pub wiki_links: Vec<GroupWikiLink>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A link to one of a group's wiki pages.
///
/// Plain data: both fields are owned strings, so the type is cheap to compare
/// and clone.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GroupWikiLink {
    /// The name of the wiki page, taken from the link text.
    pub name: String,

    /// The URL to the wiki page, taken from the link's `href` attribute.
    pub url: String,
}
|
||||||
|
|
||||||
|
impl Group {
|
||||||
|
/// Parses a [`Group`] from a [`scraper::Html`] tree.
|
||||||
|
pub fn from_html(html: &Html) -> Result<Self> {
|
||||||
|
let description =
|
||||||
|
select_first_element_text(html.root_element(), &GROUP_DESCRIPTION);
|
||||||
|
|
||||||
|
let name =
|
||||||
|
select_first_element_text(html.root_element(), &GROUP_NAME).unwrap();
|
||||||
|
|
||||||
|
let subscribers = parse_regex_match(
|
||||||
|
GROUP_SUBSCRIBERS_RE
|
||||||
|
.captures_iter(
|
||||||
|
&select_first_element_text(html.root_element(), &GROUP_SUBSCRIBERS)
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.next()
|
||||||
|
.unwrap()
|
||||||
|
.name("count"),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let sub_groups = html
|
||||||
|
.select(&GROUP_SUB_GROUP_LINKS)
|
||||||
|
.map(|element| extract_anchor_values(element).0)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let wiki_links = html
|
||||||
|
.select(&GROUP_WIKI_LINKS)
|
||||||
|
.map(|element| {
|
||||||
|
let (name, url) = extract_anchor_values(element);
|
||||||
|
GroupWikiLink { name, url }
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
description,
|
||||||
|
name,
|
||||||
|
sub_groups,
|
||||||
|
subscribers,
|
||||||
|
wiki_links,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
|
@ -24,6 +24,10 @@ pub mod regexes;
|
||||||
pub mod selectors;
|
pub mod selectors;
|
||||||
pub mod utilities;
|
pub mod utilities;
|
||||||
|
|
||||||
|
pub(crate) mod group;
|
||||||
pub(crate) mod group_list;
|
pub(crate) mod group_list;
|
||||||
|
|
||||||
pub use group_list::{GroupList, GroupListSummary};
|
pub use {
|
||||||
|
group::Group,
|
||||||
|
group_list::{GroupList, GroupListSummary},
|
||||||
|
};
|
||||||
|
|
|
@ -13,6 +13,10 @@ lazy_static! {
|
||||||
/// ```
|
/// ```
|
||||||
pub static ref DUPLICATE_WHITESPACE_RE: Regex = Regex::new(r"\s\s+").unwrap();
|
pub static ref DUPLICATE_WHITESPACE_RE: Regex = Regex::new(r"\s\s+").unwrap();
|
||||||
|
|
||||||
|
/// Regular expression for extracting group subscriber count.
|
||||||
|
pub static ref GROUP_SUBSCRIBERS_RE: Regex =
|
||||||
|
Regex::new(r"(?P<count>\d+) subscribers").unwrap();
|
||||||
|
|
||||||
/// Regular expression for extracting group list activity text.
|
/// Regular expression for extracting group list activity text.
|
||||||
pub static ref GROUP_LIST_ACTIVITY_RE: Regex = {
|
pub static ref GROUP_LIST_ACTIVITY_RE: Regex = {
|
||||||
Regex::new(concat!(
|
Regex::new(concat!(
|
||||||
|
|
|
@ -5,6 +5,9 @@ use {lazy_static::lazy_static, scraper::Selector};
|
||||||
use crate::utilities::selector;
|
use crate::utilities::selector;
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
|
/// Selector for the group description.
|
||||||
|
pub static ref GROUP_DESCRIPTION: Selector = selector(".group-short-description");
|
||||||
|
|
||||||
/// Selector for links to Tildes groups.
|
/// Selector for links to Tildes groups.
|
||||||
pub static ref GROUP_LINK: Selector = selector(".link-group");
|
pub static ref GROUP_LINK: Selector = selector(".link-group");
|
||||||
|
|
||||||
|
@ -13,4 +16,16 @@ lazy_static! {
|
||||||
|
|
||||||
/// Selector for the description section in group list items.
|
/// Selector for the description section in group list items.
|
||||||
pub static ref GROUP_LIST_DESCRIPTION: Selector = selector(".group-list-description");
|
pub static ref GROUP_LIST_DESCRIPTION: Selector = selector(".group-list-description");
|
||||||
|
|
||||||
|
/// Selector for the group name.
|
||||||
|
pub static ref GROUP_NAME: Selector = selector("#sidebar h3");
|
||||||
|
|
||||||
|
/// Selector for the group subscriber count.
|
||||||
|
pub static ref GROUP_SUBSCRIBERS: Selector = selector(".group-subscription-count");
|
||||||
|
|
||||||
|
/// Selector for sub-group links.
|
||||||
|
pub static ref GROUP_SUB_GROUP_LINKS: Selector = selector(r#"#sidebar .link-group"#);
|
||||||
|
|
||||||
|
/// Selector for group wiki links.
|
||||||
|
pub static ref GROUP_WIKI_LINKS: Selector = selector(r#"#sidebar [href*="/wiki/"]"#);
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,6 +7,19 @@ use {
|
||||||
scraper::{ElementRef, Selector},
|
scraper::{ElementRef, Selector},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
use crate::regexes::DUPLICATE_WHITESPACE_RE;
|
||||||
|
|
||||||
|
/// Shorthand to extract the text and `href` values from an anchor element.
|
||||||
|
pub fn extract_anchor_values(anchor: ElementRef) -> (String, String) {
|
||||||
|
let name = DUPLICATE_WHITESPACE_RE
|
||||||
|
.replace_all(&anchor.text().collect::<String>(), " ")
|
||||||
|
.trim()
|
||||||
|
.to_string();
|
||||||
|
let href = anchor.value().attr("href").unwrap().to_string();
|
||||||
|
|
||||||
|
(name, href)
|
||||||
|
}
|
||||||
|
|
||||||
/// Shorthand to parse a [`regex::Match`] with [`std::str::FromStr`].
|
/// Shorthand to parse a [`regex::Match`] with [`std::str::FromStr`].
|
||||||
pub fn parse_regex_match<T: FromStr>(regex_match: Option<Match>) -> Option<T> {
|
pub fn parse_regex_match<T: FromStr>(regex_match: Option<Match>) -> Option<T> {
|
||||||
regex_match.and_then(|regex_match| regex_match.as_str().parse::<T>().ok())
|
regex_match.and_then(|regex_match| regex_match.as_str().parse::<T>().ok())
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
use std::fs::read_to_string;
|
||||||
|
|
||||||
|
use {insta::assert_debug_snapshot, scraper::Html, tildes_parser::Group};
|
||||||
|
|
||||||
|
/// Parses the sample group page and compares the result with the stored
/// snapshot.
#[test]
fn test_group_parsing() {
    let sample = read_to_string("tests/samples/group.html").unwrap();
    let document = Html::parse_document(&sample);

    // The binding stays named `group` so the snapshot's `expression` metadata
    // is unchanged.
    let group = Group::from_html(&document).unwrap();
    assert_debug_snapshot!(group);
}
|
|
@ -0,0 +1,46 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
|
||||||
|
<head>
|
||||||
|
<title>Sample for group.rs</title>
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
<aside id="sidebar">
|
||||||
|
<h3>~group</h3>
|
||||||
|
<div class="group-short-description">Group description.</div>
|
||||||
|
<div class="group-subscription">
|
||||||
|
<span class="group-subscription-count">12345 subscribers</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<ul class="nav">
|
||||||
|
<li>Subgroups</li>
|
||||||
|
|
||||||
|
<ul class="nav">
|
||||||
|
<li class="nav-item">
|
||||||
|
<a href="/~example.sub" class="link-group">~example.sub</a>
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<ul class="nav">
|
||||||
|
<li>Group wiki pages</li>
|
||||||
|
|
||||||
|
<ul class="nav">
|
||||||
|
<li class="nav-item">
|
||||||
|
<a href="https://example.org/~example/wiki/index" class="text-bold">
|
||||||
|
index
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
|
||||||
|
<li class="nav-item">
|
||||||
|
<a href="https://example.org/~example/wiki/example_page">
|
||||||
|
Example Page
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
|
</ul>
|
||||||
|
</aside>
|
||||||
|
</body>
|
||||||
|
|
||||||
|
</html>
|
|
@ -0,0 +1,24 @@
|
||||||
|
---
|
||||||
|
source: tests/group.rs
|
||||||
|
expression: group
|
||||||
|
---
|
||||||
|
Group {
|
||||||
|
description: Some(
|
||||||
|
"Group description.",
|
||||||
|
),
|
||||||
|
name: "~group",
|
||||||
|
sub_groups: [
|
||||||
|
"~example.sub",
|
||||||
|
],
|
||||||
|
subscribers: 12345,
|
||||||
|
wiki_links: [
|
||||||
|
GroupWikiLink {
|
||||||
|
name: "index",
|
||||||
|
url: "https://example.org/~example/wiki/index",
|
||||||
|
},
|
||||||
|
GroupWikiLink {
|
||||||
|
name: "Example Page",
|
||||||
|
url: "https://example.org/~example/wiki/example_page",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
Loading…
Reference in New Issue