1
Fork 0

Compare commits

...

5 Commits

11 changed files with 1493 additions and 266 deletions

25
.gitignore vendored
View File

@ -1,23 +1,8 @@
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib
# Compiled files and executables.
/target/
# Test binary, built with `go test -c`
*.test
# Backup files generated by rustfmt.
**/*.rs.bk
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
# Dependency directories (remove the comment below to include it)
# vendor/
### Go Patch ###
/vendor/
/Godeps/
# Sitemap files
previous-sitemap.md
# The actual Sitemap, to be copied to https://tildes.net/~tildes/wiki/sitemap.
sitemap.md

1347
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

18
Cargo.toml Normal file
View File

@ -0,0 +1,18 @@
# https://doc.rust-lang.org/cargo/reference/manifest.html
[package]
name = "tildes-wiki-sitemap"
version = "0.1.0"
authors = ["Bauke <me@bauke.xyz>"]
edition = "2018"
[[bin]]
name = "tildes-wiki-sitemap"
path = "source/main.rs"
[dependencies]
scraper = "0.12.0"
[dependencies.reqwest]
version = "0.10.7"
features = ["blocking"]

View File

@ -1,33 +1,38 @@
<img src="images/tildes-wiki-sitemap.png" align="right">
# Tildes Wiki Sitemap
> Generates a `tree`-like [sitemap](https://tildes.net/~tildes/wiki/sitemap) of all group wiki pages of Tildes.net
> Generates a Markdown file with all group wiki pages of Tildes.
---
## Installation
### Binary
Precompiled binaries are available [here](https://git.holllo.cc/Bauke/tildes-wiki-sitemap/releases).
### Source
Requires [Rust and Cargo](https://www.rust-lang.org/tools/install) to be installed.
```sh
git clone https://git.holllo.cc/Bauke/tildes-wiki-sitemap.git
cd tildes-wiki-sitemap
cargo build --release
mv target/release/tildes-wiki-sitemap ./
```
## Usage
Precompiled binaries are not provided. If you can't run this program (or don't know how to) and the sitemap is outdated, please [PM me (@Bauke)](https://tildes.net/user/Bauke/new_message) and I'll update it.
### Quick Command
Only tested on `go1.12.1 linux/amd64`. If something breaks, please [create an issue](https://gitlab.com/Bauke/tildes-wiki-sitemap/issues/new).
Make sure the file is executable, then run it. A `sitemap.md` file will be created with the results.
```sh
$ git clone https://gitlab.com/Bauke/tildes-wiki-sitemap.git
cd tildes-wiki-sitemap
go run .
edit sitemap.md
chmod +x ./tildes-wiki-sitemap
./tildes-wiki-sitemap
less sitemap.md
```
## Attributes
## Previous Version
- [Colly](https://github.com/gocolly/colly) (Apache License 2.0)
- [Logrus](https://github.com/Sirupsen/logrus) (MIT License)
If you're looking for the previous version of this program written in Go, [click here](https://git.holllo.cc/Bauke/tildes-wiki-sitemap/src/commit/18a96e9d541fd1e231574ceec4d4bdf5783e3b5f) to go to the commit before the Rust rewrite.
## License
Licensed under [AGPL-3.0-or-later](LICENSE).
![AGPL-3.0-or-later Logo](images/license.png)
Open-sourced with the [AGPL-3.0-or-later license](https://git.holllo.cc/Bauke/tildes-wiki-sitemap/src/branch/main/LICENSE).

23
go.mod
View File

@ -1,23 +0,0 @@
module gitlab.com/bauke/tildes-wiki-sitemap
go 1.12
require (
github.com/PuerkitoBio/goquery v1.5.0 // indirect
github.com/antchfx/htmlquery v1.0.0 // indirect
github.com/antchfx/xmlquery v1.0.0 // indirect
github.com/antchfx/xpath v1.0.0 // indirect
github.com/gobwas/glob v0.2.3 // indirect
github.com/gocolly/colly v1.2.0
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/konsorten/go-windows-terminal-sequences v1.0.2 // indirect
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
github.com/sirupsen/logrus v1.4.2
github.com/stretchr/objx v0.2.0 // indirect
github.com/temoto/robotstxt v0.0.0-20180810133444-97ee4a9ee6ea // indirect
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 // indirect
golang.org/x/net v0.0.0-20190628185345-da137c7871d7 // indirect
golang.org/x/sys v0.0.0-20190626221950-04f50cda93cb // indirect
golang.org/x/tools v0.0.0-20190701194522-38ae2c8f6412 // indirect
google.golang.org/appengine v1.6.1 // indirect
)

60
go.sum
View File

@ -1,60 +0,0 @@
github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk=
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/antchfx/htmlquery v1.0.0 h1:O5IXz8fZF3B3MW+B33MZWbTHBlYmcfw0BAxgErHuaMA=
github.com/antchfx/htmlquery v1.0.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8=
github.com/antchfx/xmlquery v1.0.0 h1:YuEPqexGG2opZKNc9JU3Zw6zFXwC47wNcy6/F8oKsrM=
github.com/antchfx/xmlquery v1.0.0/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk=
github.com/antchfx/xpath v1.0.0 h1:Q5gFgh2O40VTSwMOVbFE7nFNRBu3tS21Tn0KAWeEjtk=
github.com/antchfx/xpath v1.0.0/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4=
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/temoto/robotstxt v0.0.0-20180810133444-97ee4a9ee6ea h1:hH8P1IiDpzRU6ZDbDh/RDnVuezi2oOXJpApa06M0zyI=
github.com/temoto/robotstxt v0.0.0-20180810133444-97ee4a9ee6ea/go.mod h1:aOux3gHPCftJ3KHq6Pz/AlDjYJ7Y+yKfm1gU/3B0u04=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20190628185345-da137c7871d7 h1:rTIdg5QFRR7XCaK4LCjBiPbx8j4DQRpdYMnGn/bJUEU=
golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190626221950-04f50cda93cb h1:fgwFCsaw9buMuxNd6+DQfAuSFqbNiQZpcgJQAgJsK6k=
golang.org/x/sys v0.0.0-20190626221950-04f50cda93cb/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
golang.org/x/tools v0.0.0-20190701194522-38ae2c8f6412/go.mod h1:jcCCGcm9btYwXyDqrUWc6MKQKKGJCWEQ3AfLSRIbEuI=
google.golang.org/appengine v1.6.1 h1:QzqyMA1tlu6CgqCDUtU9V+ZKhLFT2dkJuANu5QaxI3I=
google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0=

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.4 KiB

144
main.go
View File

@ -1,144 +0,0 @@
package main
import (
"fmt"
"io/ioutil"
"os"
"sort"
"strings"
"time"
"github.com/gocolly/colly"
log "github.com/sirupsen/logrus"
)
// wikiPage is one wiki page link collected from a group page's navigation
// list; main stores them per group and later renders each as a Markdown link.
type wikiPage struct {
// name is the text of the anchor element the page was found in.
name string
// url is the value of that anchor's href attribute, written out as-is.
url string
}
// main crawls the Tildes group listing, collects the wiki page links of every
// group and writes them as a `tree`-like Markdown listing to sitemap.md.
//
// If a sitemap.md from an earlier run exists it is first renamed to
// previous-sitemap.md, and after the new sitemap has been written the two are
// compared so the user knows whether the published sitemap needs updating.
//
// Any failure to rename, crawl the group listing, write or read a sitemap
// file terminates the program via log.Fatal instead of being silently
// ignored. A failure to visit a single group is logged as a warning and the
// group is still listed (without pages).
func main() {
	log.Infof("Tildes Wiki Sitemap\n")

	// A nil error from os.Stat means a sitemap from an earlier run exists.
	_, statErr := os.Stat("sitemap.md")
	sitemapExisted := statErr == nil
	if sitemapExisted {
		// Keep the old sitemap around as "previous" so it can be compared with
		// the freshly generated one at the end.
		if err := os.Rename("sitemap.md", "previous-sitemap.md"); err != nil {
			log.Fatal(err)
		}
	}

	// Group names in discovery order; sorted later to get a stable output
	// (map iteration order is not defined, so the map alone isn't enough).
	groups := make([]string, 0)
	// Wiki pages per group, keyed by the group name including its tilde.
	pages := make(map[string][]wikiPage)

	// Create a new collector that's only allowed to visit Tildes.net.
	collector := colly.NewCollector(colly.AllowedDomains("tildes.net"))

	collector.OnHTML("html", func(page *colly.HTMLElement) {
		// Wait a second per received page to go easy on the server.
		time.Sleep(time.Second)

		url := page.Request.URL.String()
		if strings.HasSuffix(url, "/groups") {
			// The group listing: visit each group found in the table.
			page.ForEach("td>.link-group", func(_ int, element *colly.HTMLElement) {
				log.Printf("Visiting group %s", element.Text)
				groups = append(groups, element.Text)
				// Start every group out with an empty list so groups without any
				// wiki pages still show up in the output instead of being skipped.
				pages[element.Text] = make([]wikiPage, 0)
				if err := collector.Visit(fmt.Sprintf("https://tildes.net/%s", element.Text)); err != nil {
					log.Warnf("Could not visit group %s: %v", element.Text, err)
				}
			})
		} else if strings.Contains(url, "~") {
			// A group page: the group name is the last path segment of the URL.
			group := url[strings.LastIndex(url, "/")+1:]
			page.ForEach(".nav>.nav-item>a", func(_ int, element *colly.HTMLElement) {
				pages[group] = append(pages[group], wikiPage{
					name: element.Text,
					url:  element.Attr("href"),
				})
				log.Printf("Found wiki page: %s/%s", group, element.Text)
			})
		}
	})

	// Visit the group listing. The group pages are visited from inside the
	// callback above, so everything has been collected once this returns.
	if err := collector.Visit("https://tildes.net/groups"); err != nil {
		log.Fatal(err)
	}

	// Build the whole sitemap in memory so it can be written in a single,
	// error-checked operation and compared without re-reading the file.
	var sitemap strings.Builder

	// Write the boilerplate stuff first.
	sitemap.WriteString("# Tildes Wiki Sitemap\n\n")
	sitemap.WriteString("Automatically generated by [this program](https://gitlab.com/bauke/tildes-wiki-sitemap). [PM @Bauke](https://tildes.net/user/Bauke/new_message) if this page is outdated and you can't run the program yourself.\n\n")
	sitemap.WriteString("This page is a temporary placeholder to help wiki contributors navigate. Find this page easily by bookmarking it!\n\n")

	sort.Strings(groups)

	pagesTotal := 0

	// A "." marks the root of the tree, like `tree` does.
	sitemap.WriteString(".\n")
	for index, group := range groups {
		// Every group but the last uses the rotated "T" piece and a vertical
		// bar in front of its pages.
		branch := "├"
		prefix := "│ "
		if index == len(groups)-1 {
			// The last group uses the "L" piece. No vertical bar is needed below
			// it, but the leading space must survive Markdown rendering, hence
			// the non-breaking spaces.
			branch = "└"
			prefix = "&nbsp;&nbsp;"
		}

		// TrimPrefix rather than group[1:] so an unexpected empty name cannot
		// cause an out-of-range panic.
		fmt.Fprintf(&sitemap, "%s── [%s](https://tildes.net/%s/wiki)\n", branch, strings.TrimPrefix(group, "~"), group)

		groupPages := pages[group]
		for pageIndex, page := range groupPages {
			pagesTotal++

			pageBranch := "├"
			if pageIndex == len(groupPages)-1 {
				pageBranch = "└"
			}
			fmt.Fprintf(&sitemap, "%s%s── [%s](%s)\n", prefix, pageBranch, page.name, page.url)
		}
	}

	// Finish with a summary line, like `tree` prints directories and files.
	fmt.Fprintf(&sitemap, "\n%v groups, %v pages\n", len(groups), pagesTotal)

	current := sitemap.String()
	// 0666 matches the permissions os.Create would have used.
	if err := ioutil.WriteFile("sitemap.md", []byte(current), 0666); err != nil {
		log.Fatal(err)
	}

	// If there was an earlier sitemap, tell the user whether anything changed.
	if sitemapExisted {
		previous, err := ioutil.ReadFile("previous-sitemap.md")
		if err != nil {
			log.Fatal(err)
		}

		if current == string(previous) {
			log.Infof("Current and previous sitemaps are the same, no need to update.\n")
		} else {
			log.Warnf("Current and previous sitemaps are not the same, you should update it.\n")
		}
	}

	log.Printf("Done! Found %v groups and %v pages, see sitemap.md for the output", len(groups), pagesTotal)
}

2
rustfmt.toml Normal file
View File

@ -0,0 +1,2 @@
max_width = 80
tab_spaces = 2

97
source/main.rs Normal file
View File

@ -0,0 +1,97 @@
use std::{error::Error, fs, thread, time::Duration};
use reqwest::blocking::Client;
use scraper::{ElementRef, Html, Selector};
/// Scrapes every group listed on <https://tildes.net/groups>, collects each
/// group's short description and sidebar wiki links, and writes the result as
/// a Markdown document to `./sitemap.md`.
///
/// # Errors
///
/// Returns an error if the HTTP client cannot be built, if any request fails
/// or is answered with a non-success status code, or if the sitemap file
/// cannot be written.
fn main() -> Result<(), Box<dyn Error>> {
  let client = Client::builder()
    .user_agent("Tildes Wiki Sitemap")
    .build()?;

  // The selectors never change, so parse them once instead of once per group.
  // Anchors that link to a group in the group list.
  let group_link_selector = Selector::parse(".group-list .link-group")
    .expect("hard-coded group link selector is valid");
  // Anchors in a group's sidebar that lead to a wiki page.
  let wiki_link_selector =
    Selector::parse("#sidebar .nav a[href*=\"/wiki/\"]")
      .expect("hard-coded wiki link selector is valid");
  // The short description in a group's sidebar.
  let description_selector =
    Selector::parse("#sidebar .group-short-description")
      .expect("hard-coded description selector is valid");

  // Get and parse the HTML of the groups list. `error_for_status` turns a
  // 4xx/5xx response into an error; without it an error page would be parsed
  // as if it were a listing without any groups.
  let groups_body = client
    .get("https://tildes.net/groups")
    .send()?
    .error_for_status()?
    .text()?;
  let groups_html = Html::parse_document(&groups_body);

  // Start the sitemap with its title and introduction.
  let mut sitemap = String::from("# Tildes Wiki Sitemap\n\n");
  sitemap += "Automatically generated by \
    [this program](https://git.holllo.cc/Bauke/tildes-wiki-sitemap). \
    [message @Bauke](https://tildes.net/user/Bauke/new_message?subject=Tildes%20Wiki%20Sitemap\
    &message=Update%20the%20sitemap%20you%20doofus!) if this page is outdated and \
    you can't run the program yourself.\n\n\
    This page is a temporary placeholder to help wiki contributors navigate. \
    Find this page easily by bookmarking it!\n\n";

  for group_link in groups_html.select(&group_link_selector) {
    // Get the group name without the leading tilde. Trimming instead of
    // slicing `[1..]` avoids a panic should the link text ever be empty.
    let group_text = group_link.inner_html();
    let group_name = group_text.trim_start_matches('~');
    println!("┌ Processing ~{}!", group_name);

    // Get and parse the HTML of the group page, again failing on an error
    // status so a broken response isn't recorded as "no wiki pages".
    let group_body = client
      .get(&format!("https://tildes.net/~{}", group_name))
      .send()?
      .error_for_status()?
      .text()?;
    let group_html = Html::parse_document(&group_body);

    let wiki_links = group_html
      .select(&wiki_link_selector)
      .collect::<Vec<ElementRef>>();

    // Add the group as a new header.
    sitemap += &format!("## ~{}\n\n", group_name);

    // If a group description is found, add it as a blockquote.
    if let Some(description) =
      group_html.select(&description_selector).next()
    {
      sitemap += &format!("> {}\n\n", description.inner_html());
    }

    // If there's no wiki pages, add a little blurb with a link to create one.
    if wiki_links.is_empty() {
      sitemap += &format!(
        "There are no wiki pages for ~{} yet, \
        [click here and be the first to create one](https://tildes.net/~{}/wiki/new_page), \
        if you were granted the necessary permission to do so!\n",
        group_name, group_name
      );
    }

    // Add every wiki link as a list item.
    for wiki_link in &wiki_links {
      let wiki_page_title = wiki_link.inner_html();
      let wiki_page_link = wiki_link.value().attr("href").unwrap_or("");
      sitemap += &format!("* [{}]({})\n", wiki_page_title, wiki_page_link);
    }

    sitemap += "\n";
    println!("└ Processed {} wiki links.", wiki_links.len());

    // Sleep 500ms between HTTP requests.
    thread::sleep(Duration::from_millis(500));
  }

  // End the file with exactly one trailing newline.
  sitemap = sitemap.trim_end().to_string() + "\n";
  fs::write("./sitemap.md", sitemap)?;

  println!("✓ Done!");
  Ok(())
}