tildes-parser/source/topic.rs

205 lines
5.6 KiB
Rust
Raw Normal View History

2023-06-21 09:59:38 +00:00
//! Parsing for `/~<group>/<topic-id>`.
2023-06-23 09:00:02 +00:00
use {
chrono::{DateTime, FixedOffset},
scraper::Html,
};
2023-06-21 09:59:38 +00:00
use crate::{
regexes::DUPLICATE_WHITESPACE_RE,
selectors::{
2023-06-23 09:00:02 +00:00
SITE_HEADER_CONTEXT, TIME_WITH_DATETIME, TOPIC_COMMENT_COUNT,
TOPIC_FULL_BYLINE, TOPIC_FULL_LINK, TOPIC_FULL_TAGS, TOPIC_FULL_TEXT,
TOPIC_MAIN_ARTICLE, TOPIC_TOAST_WARNING, TOPIC_VOTE_COUNT,
2023-06-21 09:59:38 +00:00
},
utilities::select_first_element_text,
ParseError,
};
/// A Tildes topic.
#[derive(Debug)]
pub struct Topic {
/// The name of the author.
pub author: TopicAuthor,
/// The amount of comments the topic has. Comments themselves have to be
/// parsed separately.
pub comment_total: i32,
/// The content of the topic.
pub content: TopicContent,
2023-06-22 10:55:35 +00:00
/// The group the topic was posted in, with a leading tilde character.
pub group: String,
2023-06-21 09:59:38 +00:00
/// The unique ID of the topic.
pub id: String,
/// Whether the topic is locked.
pub is_locked: bool,
/// Whether the topic is official (not yet implemented, is always false).
///
/// TODO: Add is_official. This isn't possible right now because topics don't
/// have any indicator of being marked as official. The only place it's shown
/// is in the topic listing. See #787 in the Tildes issue tracker.
pub is_official: bool,
2023-06-23 09:00:02 +00:00
/// The date the topic was posted.
pub posted_date: DateTime<FixedOffset>,
2023-06-21 09:59:38 +00:00
/// All tags applied to the topic.
pub tags: Vec<String>,
/// The amount of votes the topic has received.
pub vote_count: i32,
}
/// All the different ways a topic author can be represented.
///
/// Values can be cloned and compared for equality, so parsed authors can be
/// checked against expected ones directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TopicAuthor {
  /// The normal case, where the topic author is available. Holds the
  /// author's name.
  Name(String),

  /// The topic was posted by Tildes itself.
  ///
  /// Technically the user for this is [Tildes](https://tildes.net/user/tildes)
  /// but in the topic it says "Automatically posted &lt;date&gt;" where the
  /// username normally goes, so may as well special-case it here too.
  Scheduled,

  /// The user was banned, deleted their account or disassociated the topic
  /// from their account.
  Unknown,
}
/// The different types of content a topic can have.
///
/// Values can be cloned and compared for equality, so parsed content can be
/// checked against expected content directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TopicContent {
  /// The topic is a link topic pointing to an external site.
  Link(String),

  /// The topic is a text topic with a HTML body.
  Text(String),

  /// The topic's content is no longer available.
  Unknown,
}
impl Topic {
/// Parses a [`Topic`] from a [`scraper::Html`] tree.
pub fn from_html(html: &Html) -> Result<Self, ParseError> {
let topic_article_element = html
.select(&TOPIC_MAIN_ARTICLE)
.next()
.ok_or(ParseError::MissingExpectedHtml)?;
let topic_byline =
select_first_element_text(topic_article_element, &TOPIC_FULL_BYLINE)
.map(|byline| {
DUPLICATE_WHITESPACE_RE
.replace_all(&byline, " ")
.to_string()
})
.ok_or(ParseError::MissingExpectedHtml)?;
let author = if topic_byline.starts_with("Automatically posted") {
TopicAuthor::Scheduled
} else if topic_byline.ends_with("unknown user") {
TopicAuthor::Unknown
} else {
TopicAuthor::Name(
topic_byline
2024-02-25 12:12:20 +00:00
.split(' ')
2023-06-21 09:59:38 +00:00
.last()
.ok_or(ParseError::MissingExpectedHtml)?
.to_string(),
)
};
let comment_total = if let Some(comment_total) =
select_first_element_text(topic_article_element, &TOPIC_COMMENT_COUNT)
{
comment_total
2024-02-25 12:12:20 +00:00
.split(' ')
2023-06-21 09:59:38 +00:00
.next()
.map(|count| count.parse::<i32>())
.ok_or(ParseError::MissingExpectedHtml)?
.map_err(|_| ParseError::MissingExpectedHtml)?
} else {
0
};
let content = if let Some(link_content) =
topic_article_element.select(&TOPIC_FULL_LINK).next()
{
TopicContent::Link(link_content.text().collect::<String>())
} else if let Some(text_content) =
topic_article_element.select(&TOPIC_FULL_TEXT).next()
{
TopicContent::Text(text_content.inner_html().trim().to_string())
} else {
TopicContent::Unknown
};
2023-06-22 10:55:35 +00:00
let group = DUPLICATE_WHITESPACE_RE
.replace_all(
html
.select(&SITE_HEADER_CONTEXT)
.next()
.ok_or(ParseError::MissingExpectedHtml)?
.text()
.collect::<String>()
.trim(),
"",
)
.to_string();
2024-02-25 12:12:20 +00:00
assert!(group.starts_with('~'));
2023-06-22 10:55:35 +00:00
2023-06-21 09:59:38 +00:00
let id = topic_article_element
.value()
.id()
.ok_or(ParseError::MissingExpectedHtml)?[6..]
.to_string();
let is_locked =
select_first_element_text(topic_article_element, &TOPIC_TOAST_WARNING)
.map(|toast| toast.contains("This topic is locked."))
.unwrap_or_default();
2023-06-23 09:00:02 +00:00
let posted_date = topic_article_element
.select(&TOPIC_FULL_BYLINE)
.next()
.and_then(|byline| byline.select(&TIME_WITH_DATETIME).next())
.and_then(|time| time.value().attr("datetime"))
.and_then(|datetime| DateTime::parse_from_rfc3339(datetime).ok())
.ok_or(ParseError::MissingExpectedHtml)?;
2023-06-21 09:59:38 +00:00
let tags = topic_article_element
.select(&TOPIC_FULL_TAGS)
.map(|tag| tag.text().collect::<String>())
.collect::<Vec<_>>();
let vote_count =
select_first_element_text(topic_article_element, &TOPIC_VOTE_COUNT)
.map(|vote_count| vote_count.parse::<i32>())
.ok_or(ParseError::MissingExpectedHtml)?
.map_err(|_| ParseError::MissingExpectedHtml)?;
let topic = Topic {
author,
comment_total,
content,
2023-06-22 10:55:35 +00:00
group,
2023-06-21 09:59:38 +00:00
id,
is_locked,
is_official: false, // TODO: Implement this once it can be done.
2023-06-23 09:00:02 +00:00
posted_date,
2023-06-21 09:59:38 +00:00
tags,
vote_count,
};
Ok(topic)
}
}