Add group parsing.

2022-10-03 16:25:28 +02:00 · 2022-10-03 16:25:28 +02:00 · 976cb8d85d
parent 457d8329ee
commit 976cb8d85d
8 changed files with 205 additions and 1 deletions
--- a/source/group.rs
+++ b/source/group.rs
@ -0,0 +1,87 @@
+//! Parsing for `/~<group>`.
+
+use {color_eyre::Result, scraper::Html};
+
+use crate::{
+  regexes::GROUP_SUBSCRIBERS_RE,
+  selectors::{
+    GROUP_DESCRIPTION, GROUP_NAME, GROUP_SUBSCRIBERS, GROUP_SUB_GROUP_LINKS,
+    GROUP_WIKI_LINKS,
+  },
+  utilities::{
+    extract_anchor_values, parse_regex_match, select_first_element_text,
+  },
+};
+
+/// A group's information, as produced by [`Group::from_html`].
+#[derive(Debug)]
+pub struct Group {
+  /// The group description, if one was found.
+  pub description: Option<String>,
+
+  /// The group name, including leading tilde.
+  pub name: String,
+
+  /// Names of sub-groups, taken from the text of each sub-group link.
+  pub sub_groups: Vec<String>,
+
+  /// The number of subscribers.
+  pub subscribers: i32,
+
+  /// Links to wiki pages.
+  pub wiki_links: Vec<GroupWikiLink>,
+}
+
+/// A group's wiki link, built from the text and `href` of a wiki anchor.
+#[derive(Debug)]
+pub struct GroupWikiLink {
+  /// The name of the wiki page (the anchor's text).
+  pub name: String,
+
+  /// The URL to the wiki page, exactly as given in the anchor's `href`
+  /// attribute.
+  pub url: String,
+}
+
+impl Group {
+  /// Parses a [`Group`] from a [`scraper::Html`] tree.
+  ///
+  /// The description is optional; the name and subscriber count are required.
+  /// Sub-group names and wiki links are collected from every matching anchor
+  /// and may be empty.
+  ///
+  /// # Errors
+  ///
+  /// Returns an error when:
+  ///
+  /// * no element matches [`GROUP_NAME`],
+  /// * no element matches [`GROUP_SUBSCRIBERS`],
+  /// * the subscriber text does not match [`GROUP_SUBSCRIBERS_RE`] or its
+  ///   `count` capture cannot be parsed as a number.
+  pub fn from_html(html: &Html) -> Result<Self> {
+    let description =
+      select_first_element_text(html.root_element(), &GROUP_DESCRIPTION);
+
+    // The HTML comes from an external site, so a missing element is reported
+    // as an error to the caller rather than a panic.
+    let name = select_first_element_text(html.root_element(), &GROUP_NAME)
+      .ok_or_else(|| color_eyre::eyre::eyre!("Group name element not found"))?;
+
+    let subscribers_text =
+      select_first_element_text(html.root_element(), &GROUP_SUBSCRIBERS)
+        .ok_or_else(|| {
+          color_eyre::eyre::eyre!("Group subscriber count element not found")
+        })?;
+
+    // Only the first match is relevant, so `captures` replaces
+    // `captures_iter().next()`.
+    let subscribers = parse_regex_match(
+      GROUP_SUBSCRIBERS_RE
+        .captures(&subscribers_text)
+        .and_then(|captures| captures.name("count")),
+    )
+    .ok_or_else(|| {
+      color_eyre::eyre::eyre!(
+        "Failed to parse group subscriber count from {:?}",
+        subscribers_text
+      )
+    })?;
+
+    // For sub-groups only the anchor text (the group name) is kept.
+    let sub_groups = html
+      .select(&GROUP_SUB_GROUP_LINKS)
+      .map(|element| extract_anchor_values(element).0)
+      .collect();
+
+    let wiki_links = html
+      .select(&GROUP_WIKI_LINKS)
+      .map(|element| {
+        let (name, url) = extract_anchor_values(element);
+        GroupWikiLink { name, url }
+      })
+      .collect();
+
+    Ok(Self {
+      description,
+      name,
+      sub_groups,
+      subscribers,
+      wiki_links,
+    })
+  }
+}
--- a/source/lib.rs
+++ b/source/lib.rs
@ -24,6 +24,10 @@ pub mod regexes;
 pub mod selectors;
 pub mod utilities;

+pub(crate) mod group;
 pub(crate) mod group_list;

-pub use group_list::{GroupList, GroupListSummary};
+pub use {
+  group::Group,
+  group_list::{GroupList, GroupListSummary},
+};
--- a/source/regexes.rs
+++ b/source/regexes.rs
@ -13,6 +13,10 @@ lazy_static! {
  /// ```
  pub static ref DUPLICATE_WHITESPACE_RE: Regex = Regex::new(r"\s\s+").unwrap();

+  /// Regular expression for extracting group subscriber count.
+  pub static ref GROUP_SUBSCRIBERS_RE: Regex =
+    Regex::new(r"(?P<count>\d+) subscribers").unwrap();
+
  /// Regular expression for extracting group list activity text.
  pub static ref GROUP_LIST_ACTIVITY_RE: Regex = {
    Regex::new(concat!(
--- a/source/selectors.rs
+++ b/source/selectors.rs
@ -5,6 +5,9 @@ use {lazy_static::lazy_static, scraper::Selector};
 use crate::utilities::selector;

 lazy_static! {
+  /// Selector for the group description.
+  pub static ref GROUP_DESCRIPTION: Selector = selector(".group-short-description");
+
  /// Selector for links to Tildes groups.
  pub static ref GROUP_LINK: Selector = selector(".link-group");

@ -13,4 +16,16 @@ lazy_static! {

  /// Selector for the description section in group list items.
  pub static ref GROUP_LIST_DESCRIPTION: Selector = selector(".group-list-description");
+
+  /// Selector for the group name.
+  pub static ref GROUP_NAME: Selector = selector("#sidebar h3");
+
+  /// Selector for the group subscriber count.
+  pub static ref GROUP_SUBSCRIBERS: Selector = selector(".group-subscription-count");
+
+  /// Selector for group sub-group links.
+  pub static ref GROUP_SUB_GROUP_LINKS: Selector = selector(r#"#sidebar .link-group"#);
+
+  /// Selector for group wiki links.
+  pub static ref GROUP_WIKI_LINKS: Selector = selector(r#"#sidebar [href*="/wiki/"]"#);
 }
--- a/source/utilities.rs
+++ b/source/utilities.rs
@ -7,6 +7,19 @@ use {
  scraper::{ElementRef, Selector},
 };

+use crate::regexes::DUPLICATE_WHITESPACE_RE;
+
+/// Returns the `(text, href)` pair of an anchor element.
+///
+/// The text is the concatenation of the anchor's text nodes with every match
+/// of [`DUPLICATE_WHITESPACE_RE`] replaced by a single space, and with leading
+/// and trailing whitespace trimmed. The `href` value is returned unmodified.
+///
+/// # Panics
+///
+/// Panics if the element has no `href` attribute.
+pub fn extract_anchor_values(anchor: ElementRef) -> (String, String) {
+  let raw_text: String = anchor.text().collect();
+  let collapsed = DUPLICATE_WHITESPACE_RE.replace_all(&raw_text, " ");
+  let target = anchor.value().attr("href").unwrap();
+
+  (collapsed.trim().to_string(), target.to_string())
+}
+
 /// Shorthand to parse a [`regex::Match`] with [`std::str::FromStr`].
 pub fn parse_regex_match<T: FromStr>(regex_match: Option<Match>) -> Option<T> {
  regex_match.and_then(|regex_match| regex_match.as_str().parse::<T>().ok())
--- a/tests/group.rs
+++ b/tests/group.rs
@ -0,0 +1,11 @@
+use std::fs::read_to_string;
+
+use {insta::assert_debug_snapshot, scraper::Html, tildes_parser::Group};
+
+/// Parses the sample group page and compares the result against the stored
+/// debug snapshot.
+#[test]
+fn test_group_parsing() {
+  let source = read_to_string("tests/samples/group.html").unwrap();
+  let document = Html::parse_document(&source);
+  // The binding stays named `group` so the snapshot's recorded expression
+  // does not change.
+  let group = Group::from_html(&document).unwrap();
+  assert_debug_snapshot!(group);
+}
--- a/tests/samples/group.html
+++ b/tests/samples/group.html
@ -0,0 +1,46 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+  <title>Sample for group.rs</title>
+</head>
+
+<body>
+  <aside id="sidebar">
+    <h3>~group</h3>
+    <div class="group-short-description">Group description.</div>
+    <div class="group-subscription">
+      <span class="group-subscription-count">12345 subscribers</span>
+    </div>
+
+    <ul class="nav">
+      <li>Subgroups</li>
+
+      <ul class="nav">
+        <li class="nav-item">
+          <a href="/~example.sub" class="link-group">~example.sub</a>
+        </li>
+      </ul>
+    </ul>
+
+    <ul class="nav">
+      <li>Group wiki pages</li>
+
+      <ul class="nav">
+        <li class="nav-item">
+          <a href="https://example.org/~example/wiki/index" class="text-bold">
+            index
+          </a>
+        </li>
+
+        <li class="nav-item">
+          <a href="https://example.org/~example/wiki/example_page">
+            Example Page
+          </a>
+        </li>
+      </ul>
+    </ul>
+  </aside>
+</body>
+
+</html>
--- a/tests/snapshots/group__group_parsing.snap
+++ b/tests/snapshots/group__group_parsing.snap
@ -0,0 +1,24 @@
+---
+source: tests/group.rs
+expression: group
+---
+Group {
+    description: Some(
+        "Group description.",
+    ),
+    name: "~group",
+    sub_groups: [
+        "~example.sub",
+    ],
+    subscribers: 12345,
+    wiki_links: [
+        GroupWikiLink {
+            name: "index",
+            url: "https://example.org/~example/wiki/index",
+        },
+        GroupWikiLink {
+            name: "Example Page",
+            url: "https://example.org/~example/wiki/example_page",
+        },
+    ],
+}