From 773a075edf1b311007d7d2a4ce2270e015a66f09 Mon Sep 17 00:00:00 2001 From: Bauke Date: Wed, 21 Jun 2023 11:59:38 +0200 Subject: [PATCH] Add topic parsing. --- source/from_str.rs | 3 +- source/lib.rs | 2 + source/selectors.rs | 24 +++++++ source/topic.rs | 170 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 198 insertions(+), 1 deletion(-) create mode 100644 source/topic.rs diff --git a/source/from_str.rs b/source/from_str.rs index bdb25a7..b7835c9 100644 --- a/source/from_str.rs +++ b/source/from_str.rs @@ -4,12 +4,13 @@ use std::str::FromStr; use {duplicate::duplicate_item, scraper::Html}; -use crate::{Group, GroupList, ParseError}; +use crate::{Group, GroupList, ParseError, Topic}; #[duplicate_item( _Struct; [Group]; [GroupList]; + [Topic]; )] impl FromStr for _Struct { type Err = ParseError; diff --git a/source/lib.rs b/source/lib.rs index d023fc9..2b4f8da 100644 --- a/source/lib.rs +++ b/source/lib.rs @@ -28,9 +28,11 @@ pub(crate) mod error; pub(crate) mod from_str; pub(crate) mod group; pub(crate) mod group_list; +pub(crate) mod topic; pub use { error::*, group::{Group, GroupWikiLink}, group_list::{GroupList, GroupListSummary}, + topic::*, }; diff --git a/source/selectors.rs b/source/selectors.rs index 28e534e..16b24bb 100644 --- a/source/selectors.rs +++ b/source/selectors.rs @@ -28,4 +28,28 @@ lazy_static! { /// Selector for group wiki links. pub static ref GROUP_WIKI_LINKS: Selector = selector(r#"#sidebar [href*="/wiki/"]"#); + + /// Selector for the topic comment count. + pub static ref TOPIC_COMMENT_COUNT: Selector = selector(".topic-comments-header h2"); + + /// Selector for the topic full byline. + pub static ref TOPIC_FULL_BYLINE: Selector = selector(".topic-full-byline"); + + /// Selector for a link topic's content. + pub static ref TOPIC_FULL_LINK: Selector = selector(".topic-full-link a"); + + /// Selector for the topic tag elements. + pub static ref TOPIC_FULL_TAGS: Selector = selector(".topic-full-tags a"); + + /// Selector for a text topic's content. + pub static ref TOPIC_FULL_TEXT: Selector = selector(".topic-full-text"); + + /// Selector for the main topic `
`. + pub static ref TOPIC_MAIN_ARTICLE: Selector = selector("main > .topic-full"); + + /// Selector for a topic toast warning. + pub static ref TOPIC_TOAST_WARNING: Selector = selector(".toast.toast-warning"); + + /// Selector for the topic vote count. + pub static ref TOPIC_VOTE_COUNT: Selector = selector(".topic-voting-votes"); } diff --git a/source/topic.rs b/source/topic.rs new file mode 100644 index 0000000..80ae354 --- /dev/null +++ b/source/topic.rs @@ -0,0 +1,170 @@ +//! Parsing for `/~/`. + +use scraper::Html; + +use crate::{ + regexes::DUPLICATE_WHITESPACE_RE, + selectors::{ + TOPIC_COMMENT_COUNT, TOPIC_FULL_BYLINE, TOPIC_FULL_LINK, TOPIC_FULL_TAGS, + TOPIC_FULL_TEXT, TOPIC_MAIN_ARTICLE, TOPIC_TOAST_WARNING, TOPIC_VOTE_COUNT, + }, + utilities::select_first_element_text, + ParseError, +}; + +/// A Tildes topic. +#[derive(Debug)] +pub struct Topic { + /// The name of the author. + pub author: TopicAuthor, + + /// The amount of comments the topic has. Comments themselves have to be + /// parsed separately. + pub comment_total: i32, + + /// The content of the topic. + pub content: TopicContent, + + /// The unique ID of the topic. + pub id: String, + + /// Whether the topic is locked. + pub is_locked: bool, + + /// Whether the topic is official (not yet implemented, is always false). + /// + /// TODO: Add is_official. This isn't possible right now because topics don't + /// have any indicator of being marked as official. The only place it's shown + /// is in the topic listing. See #787 in the Tildes issue tracker. + pub is_official: bool, + + /// All tags applied to the topic. + pub tags: Vec, + + /// The amount of votes the topic has received. + pub vote_count: i32, +} + +/// All the different ways a topic author can be represented. +#[derive(Debug)] +pub enum TopicAuthor { + /// The normal case, where the topic author is available. + Name(String), + + /// The topic was posted by Tildes itself. + /// + /// Technically the user for this is [Tildes](https://tildes.net/user/tildes) + /// but in the topic it says "Automatically posted " where the username + /// normally goes, so may as well special-case it here too. + Scheduled, + + /// The user was banned, deleted their account or disassociated the topic from + /// their account. + Unknown, +} + +/// The different types of content a topic can have. +#[derive(Debug)] +pub enum TopicContent { + /// The topic is a link topic pointing to an external site. + Link(String), + + /// The topic is a text topic with a HTML body. + Text(String), + + /// The topic's content is no longer available. + Unknown, +} + +impl Topic { + /// Parses a [`Topic`] from a [`scraper::Html`] tree. + pub fn from_html(html: &Html) -> Result { + let topic_article_element = html + .select(&TOPIC_MAIN_ARTICLE) + .next() + .ok_or(ParseError::MissingExpectedHtml)?; + + let topic_byline = + select_first_element_text(topic_article_element, &TOPIC_FULL_BYLINE) + .map(|byline| { + DUPLICATE_WHITESPACE_RE + .replace_all(&byline, " ") + .to_string() + }) + .ok_or(ParseError::MissingExpectedHtml)?; + + let author = if topic_byline.starts_with("Automatically posted") { + TopicAuthor::Scheduled + } else if topic_byline.ends_with("unknown user") { + TopicAuthor::Unknown + } else { + TopicAuthor::Name( + topic_byline + .split(" ") + .last() + .ok_or(ParseError::MissingExpectedHtml)? + .to_string(), + ) + }; + + let comment_total = if let Some(comment_total) = + select_first_element_text(topic_article_element, &TOPIC_COMMENT_COUNT) + { + comment_total + .split(" ") + .next() + .map(|count| count.parse::()) + .ok_or(ParseError::MissingExpectedHtml)? + .map_err(|_| ParseError::MissingExpectedHtml)? + } else { + 0 + }; + + let content = if let Some(link_content) = + topic_article_element.select(&TOPIC_FULL_LINK).next() + { + TopicContent::Link(link_content.text().collect::()) + } else if let Some(text_content) = + topic_article_element.select(&TOPIC_FULL_TEXT).next() + { + TopicContent::Text(text_content.inner_html().trim().to_string()) + } else { + TopicContent::Unknown + }; + + let id = topic_article_element + .value() + .id() + .ok_or(ParseError::MissingExpectedHtml)?[6..] + .to_string(); + + let is_locked = + select_first_element_text(topic_article_element, &TOPIC_TOAST_WARNING) + .map(|toast| toast.contains("This topic is locked.")) + .unwrap_or_default(); + + let tags = topic_article_element + .select(&TOPIC_FULL_TAGS) + .map(|tag| tag.text().collect::()) + .collect::>(); + + let vote_count = + select_first_element_text(topic_article_element, &TOPIC_VOTE_COUNT) + .map(|vote_count| vote_count.parse::()) + .ok_or(ParseError::MissingExpectedHtml)? + .map_err(|_| ParseError::MissingExpectedHtml)?; + + let topic = Topic { + author, + comment_total, + content, + id, + is_locked, + is_official: false, // TODO: Implement this once it can be done. + tags, + vote_count, + }; + + Ok(topic) + } +}