Add group parsing.
This commit is contained in:
		
							parent
							
								
									457d8329ee
								
							
						
					
					
						commit
						976cb8d85d
					
				| 
						 | 
					@ -0,0 +1,87 @@
 | 
				
			||||||
 | 
					//! Parsing for `/~<group>`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					use {color_eyre::Result, scraper::Html};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					use crate::{
 | 
				
			||||||
 | 
					  regexes::GROUP_SUBSCRIBERS_RE,
 | 
				
			||||||
 | 
					  selectors::{
 | 
				
			||||||
 | 
					    GROUP_DESCRIPTION, GROUP_NAME, GROUP_SUBSCRIBERS, GROUP_SUB_GROUP_LINKS,
 | 
				
			||||||
 | 
					    GROUP_WIKI_LINKS,
 | 
				
			||||||
 | 
					  },
 | 
				
			||||||
 | 
					  utilities::{
 | 
				
			||||||
 | 
					    extract_anchor_values, parse_regex_match, select_first_element_text,
 | 
				
			||||||
 | 
					  },
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Information about a single group, as parsed from its `/~<group>` page.
					///
					/// Derives `Clone`, `PartialEq` and `Eq` in addition to `Debug` so parsed
					/// groups can be duplicated and compared directly by callers and tests.
					#[derive(Debug, Clone, PartialEq, Eq)]
					pub struct Group {
					  /// The group description, `None` when the page has no description element.
					  pub description: Option<String>,

					  /// The group name, including leading tilde.
					  pub name: String,

					  /// Names of sub-groups.
					  pub sub_groups: Vec<String>,

					  /// The number of subscribers.
					  pub subscribers: i32,

					  /// Links to wiki pages.
					  pub wiki_links: Vec<GroupWikiLink>,
					}

					/// A link to one of a group's wiki pages.
					#[derive(Debug, Clone, PartialEq, Eq)]
					pub struct GroupWikiLink {
					  /// The name of the wiki page.
					  pub name: String,

					  /// The URL to the wiki page.
					  pub url: String,
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					impl Group {
 | 
				
			||||||
 | 
					  /// Parses a [`Group`] from a [`scraper::Html`] tree.
 | 
				
			||||||
 | 
					  pub fn from_html(html: &Html) -> Result<Self> {
 | 
				
			||||||
 | 
					    let description =
 | 
				
			||||||
 | 
					      select_first_element_text(html.root_element(), &GROUP_DESCRIPTION);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let name =
 | 
				
			||||||
 | 
					      select_first_element_text(html.root_element(), &GROUP_NAME).unwrap();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let subscribers = parse_regex_match(
 | 
				
			||||||
 | 
					      GROUP_SUBSCRIBERS_RE
 | 
				
			||||||
 | 
					        .captures_iter(
 | 
				
			||||||
 | 
					          &select_first_element_text(html.root_element(), &GROUP_SUBSCRIBERS)
 | 
				
			||||||
 | 
					            .unwrap(),
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        .next()
 | 
				
			||||||
 | 
					        .unwrap()
 | 
				
			||||||
 | 
					        .name("count"),
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    .unwrap();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let sub_groups = html
 | 
				
			||||||
 | 
					      .select(&GROUP_SUB_GROUP_LINKS)
 | 
				
			||||||
 | 
					      .map(|element| extract_anchor_values(element).0)
 | 
				
			||||||
 | 
					      .collect();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let wiki_links = html
 | 
				
			||||||
 | 
					      .select(&GROUP_WIKI_LINKS)
 | 
				
			||||||
 | 
					      .map(|element| {
 | 
				
			||||||
 | 
					        let (name, url) = extract_anchor_values(element);
 | 
				
			||||||
 | 
					        GroupWikiLink { name, url }
 | 
				
			||||||
 | 
					      })
 | 
				
			||||||
 | 
					      .collect();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Ok(Self {
 | 
				
			||||||
 | 
					      description,
 | 
				
			||||||
 | 
					      name,
 | 
				
			||||||
 | 
					      sub_groups,
 | 
				
			||||||
 | 
					      subscribers,
 | 
				
			||||||
 | 
					      wiki_links,
 | 
				
			||||||
 | 
					    })
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -24,6 +24,10 @@ pub mod regexes;
 | 
				
			||||||
pub mod selectors;
 | 
					pub mod selectors;
 | 
				
			||||||
pub mod utilities;
 | 
					pub mod utilities;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					pub(crate) mod group;
 | 
				
			||||||
pub(crate) mod group_list;
 | 
					pub(crate) mod group_list;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
pub use group_list::{GroupList, GroupListSummary};
 | 
					pub use {
 | 
				
			||||||
 | 
					  group::Group,
 | 
				
			||||||
 | 
					  group_list::{GroupList, GroupListSummary},
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -13,6 +13,10 @@ lazy_static! {
 | 
				
			||||||
  /// ```
 | 
					  /// ```
 | 
				
			||||||
  pub static ref DUPLICATE_WHITESPACE_RE: Regex = Regex::new(r"\s\s+").unwrap();
 | 
					  pub static ref DUPLICATE_WHITESPACE_RE: Regex = Regex::new(r"\s\s+").unwrap();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /// Regular expression for extracting group subscriber count.
 | 
				
			||||||
 | 
					  pub static ref GROUP_SUBSCRIBERS_RE: Regex =
 | 
				
			||||||
 | 
					    Regex::new(r"(?P<count>\d+) subscribers").unwrap();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /// Regular expression for extracting group list activity text.
 | 
					  /// Regular expression for extracting group list activity text.
 | 
				
			||||||
  pub static ref GROUP_LIST_ACTIVITY_RE: Regex = {
 | 
					  pub static ref GROUP_LIST_ACTIVITY_RE: Regex = {
 | 
				
			||||||
    Regex::new(concat!(
 | 
					    Regex::new(concat!(
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5,6 +5,9 @@ use {lazy_static::lazy_static, scraper::Selector};
 | 
				
			||||||
use crate::utilities::selector;
 | 
					use crate::utilities::selector;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
lazy_static! {
 | 
					lazy_static! {
 | 
				
			||||||
 | 
					  /// Selector for the group description.
 | 
				
			||||||
 | 
					  pub static ref GROUP_DESCRIPTION: Selector = selector(".group-short-description");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /// Selector for links to Tildes groups.
 | 
					  /// Selector for links to Tildes groups.
 | 
				
			||||||
  pub static ref GROUP_LINK: Selector = selector(".link-group");
 | 
					  pub static ref GROUP_LINK: Selector = selector(".link-group");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -13,4 +16,16 @@ lazy_static! {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /// Selector for the description section in group list items.
 | 
					  /// Selector for the description section in group list items.
 | 
				
			||||||
  pub static ref GROUP_LIST_DESCRIPTION: Selector = selector(".group-list-description");
 | 
					  pub static ref GROUP_LIST_DESCRIPTION: Selector = selector(".group-list-description");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /// Selector for the group name.
 | 
				
			||||||
 | 
					  pub static ref GROUP_NAME: Selector = selector("#sidebar h3");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /// Selector for the group subscriber count.
 | 
				
			||||||
 | 
					  pub static ref GROUP_SUBSCRIBERS: Selector = selector(".group-subscription-count");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /// Selector for sub-group links.
 | 
				
			||||||
 | 
					  pub static ref GROUP_SUB_GROUP_LINKS: Selector = selector(r#"#sidebar .link-group"#);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /// Selector for group wiki links.
 | 
				
			||||||
 | 
					  pub static ref GROUP_WIKI_LINKS: Selector = selector(r#"#sidebar [href*="/wiki/"]"#);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -7,6 +7,19 @@ use {
 | 
				
			||||||
  scraper::{ElementRef, Selector},
 | 
					  scraper::{ElementRef, Selector},
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					use crate::regexes::DUPLICATE_WHITESPACE_RE;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Shorthand to extract the text and `href` values from an anchor element.
 | 
				
			||||||
 | 
					pub fn extract_anchor_values(anchor: ElementRef) -> (String, String) {
 | 
				
			||||||
 | 
					  let name = DUPLICATE_WHITESPACE_RE
 | 
				
			||||||
 | 
					    .replace_all(&anchor.text().collect::<String>(), " ")
 | 
				
			||||||
 | 
					    .trim()
 | 
				
			||||||
 | 
					    .to_string();
 | 
				
			||||||
 | 
					  let href = anchor.value().attr("href").unwrap().to_string();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  (name, href)
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/// Shorthand to parse a [`regex::Match`] with [`std::str::FromStr`].
 | 
					/// Shorthand to parse a [`regex::Match`] with [`std::str::FromStr`].
 | 
				
			||||||
pub fn parse_regex_match<T: FromStr>(regex_match: Option<Match>) -> Option<T> {
 | 
					pub fn parse_regex_match<T: FromStr>(regex_match: Option<Match>) -> Option<T> {
 | 
				
			||||||
  regex_match.and_then(|regex_match| regex_match.as_str().parse::<T>().ok())
 | 
					  regex_match.and_then(|regex_match| regex_match.as_str().parse::<T>().ok())
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -0,0 +1,11 @@
 | 
				
			||||||
 | 
					use std::fs::read_to_string;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					use {insta::assert_debug_snapshot, scraper::Html, tildes_parser::Group};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Parses the sample group page and compares the result with the stored
					/// debug snapshot.
					#[test]
					fn test_group_parsing() {
					  let source = read_to_string("tests/samples/group.html").unwrap();
					  let document = Html::parse_document(&source);

					  // Bound to `group` so the snapshot's recorded expression stays the same.
					  let group = Group::from_html(&document).unwrap();
					  assert_debug_snapshot!(group);
					}
 | 
				
			||||||
| 
						 | 
					@ -0,0 +1,46 @@
 | 
				
			||||||
 | 
					<!DOCTYPE html>
 | 
				
			||||||
 | 
					<html lang="en">
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					<head>
 | 
				
			||||||
 | 
					  <title>Sample for group.rs</title>
 | 
				
			||||||
 | 
					</head>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					<body>
 | 
				
			||||||
 | 
					  <aside id="sidebar">
 | 
				
			||||||
 | 
					    <h3>~group</h3>
 | 
				
			||||||
 | 
					    <div class="group-short-description">Group description.</div>
 | 
				
			||||||
 | 
					    <div class="group-subscription">
 | 
				
			||||||
 | 
					      <span class="group-subscription-count">12345 subscribers</span>
 | 
				
			||||||
 | 
					    </div>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <ul class="nav">
 | 
				
			||||||
 | 
					      <li>Subgroups</li>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      <ul class="nav">
 | 
				
			||||||
 | 
					        <li class="nav-item">
 | 
				
			||||||
 | 
					          <a href="/~example.sub" class="link-group">~example.sub</a>
 | 
				
			||||||
 | 
					        </li>
 | 
				
			||||||
 | 
					      </ul>
 | 
				
			||||||
 | 
					    </ul>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <ul class="nav">
 | 
				
			||||||
 | 
					      <li>Group wiki pages</li>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      <ul class="nav">
 | 
				
			||||||
 | 
					        <li class="nav-item">
 | 
				
			||||||
 | 
					          <a href="https://example.org/~example/wiki/index" class="text-bold">
 | 
				
			||||||
 | 
					            index
 | 
				
			||||||
 | 
					          </a>
 | 
				
			||||||
 | 
					        </li>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        <li class="nav-item">
 | 
				
			||||||
 | 
					          <a href="https://example.org/~example/wiki/example_page">
 | 
				
			||||||
 | 
					            Example Page
 | 
				
			||||||
 | 
					          </a>
 | 
				
			||||||
 | 
					        </li>
 | 
				
			||||||
 | 
					      </ul>
 | 
				
			||||||
 | 
					    </ul>
 | 
				
			||||||
 | 
					  </aside>
 | 
				
			||||||
 | 
					</body>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					</html>
 | 
				
			||||||
| 
						 | 
					@ -0,0 +1,24 @@
 | 
				
			||||||
 | 
					---
 | 
				
			||||||
 | 
					source: tests/group.rs
 | 
				
			||||||
 | 
					expression: group
 | 
				
			||||||
 | 
					---
 | 
				
			||||||
 | 
					Group {
 | 
				
			||||||
 | 
					    description: Some(
 | 
				
			||||||
 | 
					        "Group description.",
 | 
				
			||||||
 | 
					    ),
 | 
				
			||||||
 | 
					    name: "~group",
 | 
				
			||||||
 | 
					    sub_groups: [
 | 
				
			||||||
 | 
					        "~example.sub",
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					    subscribers: 12345,
 | 
				
			||||||
 | 
					    wiki_links: [
 | 
				
			||||||
 | 
					        GroupWikiLink {
 | 
				
			||||||
 | 
					            name: "index",
 | 
				
			||||||
 | 
					            url: "https://example.org/~example/wiki/index",
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        GroupWikiLink {
 | 
				
			||||||
 | 
					            name: "Example Page",
 | 
				
			||||||
 | 
					            url: "https://example.org/~example/wiki/example_page",
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
		Reference in New Issue