2022-09-23 15:40:30 +00:00
|
|
|
// Copyright (C) 2022 Bauke <me@bauke.xyz>
|
|
|
|
//
|
|
|
|
// This program is free software: you can redistribute it and/or modify it under
|
|
|
|
// the terms of the GNU Affero General Public License as published by the Free
|
|
|
|
// Software Foundation, either version 3 of the License, or (at your option) any
|
|
|
|
// later version.
|
|
|
|
//
|
|
|
|
// This program is distributed in the hope that it will be useful, but WITHOUT
|
|
|
|
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
|
|
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
|
|
|
|
// details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
|
|
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
|
2022-09-23 15:55:40 +00:00
|
|
|
//! # Select HTML
|
2022-09-05 12:24:49 +00:00
|
|
|
//!
|
2022-09-23 15:40:30 +00:00
|
|
|
//! > **Extract HTML using CSS selectors in the command-line.**
|
2022-09-05 12:24:49 +00:00
|
|
|
|
|
|
|
#![forbid(unsafe_code)]
|
|
|
|
#![warn(missing_docs, clippy::missing_docs_in_private_items)]
|
|
|
|
|
|
|
|
use std::{
    fs::File,
    io::{stdin, stdout, ErrorKind, Read, Write},
    path::PathBuf,
};
|
|
|
|
|
|
|
|
use {
|
|
|
|
clap::Parser,
|
|
|
|
color_eyre::{eyre::eyre, install, Result},
|
|
|
|
scraper::{Html, Selector},
|
|
|
|
};
|
|
|
|
|
|
|
|
/// CLI arguments struct using [`clap`]'s Derive API.
// NOTE(review): the `///` comments on the fields below are presumably picked
// up by clap's derive as `--help` text, so editing them changes user-facing
// output, not just the rustdoc.
#[derive(Debug, Parser)]
#[clap(about, author, version)]
pub struct Args {
    /// Output the attribute's value from the selected element, can be used
    /// multiple times.
    // Shares the "output" group with `text`. NOTE(review): clap groups are
    // presumably exclusive by default, which would make `--attribute` and
    // `--text` conflict — confirm against the clap version in use.
    #[clap(short, long, group = "output")]
    pub attribute: Vec<String>,

    /// A HTML file to read, if not specified stdin will be used instead.
    // Parsed from the raw OS string (`from_os_str`) rather than from a `&str`.
    #[clap(long, parse(from_os_str))]
    pub file: Option<PathBuf>,

    /// The CSS selector to use.
    // No `#[clap(...)]` attribute here; presumably clap's derive treats this
    // as a positional argument rather than a flag — confirm.
    pub selector: String,

    /// Output inner text of the selected elements.
    // Member of the same "output" group as `attribute`.
    #[clap(short, long, group = "output")]
    pub text: bool,

    /// Trim whitespace from selected items.
    // Applied to each value right before it is printed in `main`.
    #[clap(long)]
    pub trim: bool,
}
|
|
|
|
|
|
|
|
/// The main CLI function.
|
|
|
|
fn main() -> Result<()> {
|
|
|
|
install()?;
|
|
|
|
|
|
|
|
let args = Args::parse();
|
|
|
|
|
|
|
|
let selector = Selector::parse(&args.selector)
|
|
|
|
.map_err(|_| eyre!("Failed to parse selector"))?;
|
|
|
|
|
|
|
|
let document = {
|
|
|
|
let mut html = String::new();
|
|
|
|
|
|
|
|
if let Some(path) = args.file {
|
|
|
|
File::open(path)?.read_to_string(&mut html)?;
|
|
|
|
} else {
|
|
|
|
stdin().read_to_string(&mut html)?;
|
|
|
|
};
|
|
|
|
|
|
|
|
Html::parse_document(&html)
|
|
|
|
};
|
|
|
|
|
|
|
|
let mut to_print = vec![];
|
|
|
|
|
|
|
|
for element in document.select(&selector) {
|
|
|
|
if args.text {
|
|
|
|
to_print.push(element.text().collect::<String>());
|
|
|
|
} else if !args.attribute.is_empty() {
|
|
|
|
let element = element.value();
|
|
|
|
for attribute in &args.attribute {
|
|
|
|
if let Some(value) = element.attr(attribute) {
|
|
|
|
to_print.push(value.to_string());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
to_print.push(element.html());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for value in to_print {
|
|
|
|
if args.trim {
|
|
|
|
println!("{}", value.trim());
|
|
|
|
} else {
|
|
|
|
println!("{}", value);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|