Added Wayback Machine Search scraper

This commit is contained in:
Nicola Belluti 2024-09-14 14:58:31 +02:00
parent 868ad11719
commit f3d5424dae
5 changed files with 1397 additions and 2 deletions

1324
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -4,3 +4,8 @@ version = "0.1.0"
edition = "2021"
[dependencies]
rayon = "1.10.0"
reqwest = "0.12.7"
serde = { version = "1.0.210", features = ["derive"] }
serde_json = "1.0.128"
tokio = { version = "1.40.0", features = ["macros", "rt-multi-thread"] }

View File

@ -13,6 +13,8 @@
devShells.${system}.default = pkgs.mkShell {
buildInputs = with pkgs; [
openssl
clippy
rustfmt

View File

@ -1,3 +1,14 @@
fn main() {
println!("Hello, world!");
mod wayback_machine;
use wayback_machine::Search;
/// Program entry point.
///
/// Fetches every capture the Wayback Machine lists for `fermimn.gov.it`,
/// prints the snapshot URL of each one, then prints how many were found.
#[tokio::main(flavor = "multi_thread", worker_threads = 10)]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let captures = Search::search("fermimn.gov.it").await;

    // `{:#?}` on a `String` keeps the surrounding quotes in the output.
    captures
        .iter()
        .map(Search::get_wayback_machine_url)
        .for_each(|snapshot_url| println!("{:#?}", snapshot_url));

    let total = captures.len();
    println!("Length: {:#?}", total);
    Ok(())
}

View File

@ -0,0 +1,53 @@
use rayon::prelude::*;
use reqwest::Client;
use std::collections::HashMap;
/// One capture row returned by the Wayback Machine CDX search endpoint.
///
/// Each field stores the raw string found in the same-named column of the
/// CDX JSON response; the values are copied as-is, without parsing.
#[derive(Debug, serde::Deserialize, serde::Serialize)]
pub struct Search {
/// Value of the `urlkey` column.
urlkey: String,
/// Value of the `timestamp` column; used as the snapshot path segment
/// when building the Wayback Machine URL.
timestamp: String,
/// Value of the `original` column; appended after the timestamp when
/// building the Wayback Machine URL.
original: String,
/// Value of the `mimetype` column.
mimetype: String,
/// Value of the `statuscode` column (kept as text, not converted to a number).
statuscode: String,
/// Value of the `digest` column.
digest: String,
/// Value of the `length` column (kept as text, not converted to a number).
length: String,
}
impl Search {
/// Queries the Wayback Machine CDX API for the captures of `url`.
///
/// The endpoint is asked for JSON output, which is parsed as an array of
/// string arrays: the first row names the columns and every following row
/// is one capture. Columns are resolved by name, so their position in the
/// response does not matter.
///
/// Returns one [`Search`] per capture row, or an empty vector when the
/// response is an empty array (no header row, hence no captures).
///
/// # Panics
///
/// Panics if the HTTP request fails, the body cannot be read, the body is
/// not a JSON array of string arrays, or a row lacks one of the expected
/// columns.
pub async fn search(url: &str) -> Vec<Self> {
    let body = Client::new()
        .get("https://web.archive.org/cdx/search/cdx")
        // Let reqwest percent-encode the target so characters such as `&`,
        // `#` or `?` inside `url` cannot corrupt the query string.
        .query(&[("output", "json"), ("url", url)])
        .send()
        .await
        .expect("request to the CDX API failed")
        .text()
        .await
        .expect("failed to read the CDX API response body");

    // Owned strings on purpose: deserializing into `&str` fails as soon as
    // a value contains a JSON escape sequence, because it cannot be borrowed
    // straight out of the input.
    let table: Vec<Vec<String>> = serde_json::from_str(&body)
        .expect("CDX API response is not a JSON array of string arrays");

    // An empty array has no header row to split off; report no captures
    // instead of panicking.
    let Some((headers, rows)) = table.split_first() else {
        return Vec::new();
    };

    // Built once and shared by every row: column name -> position.
    let columns: HashMap<&str, usize> = headers
        .iter()
        .enumerate()
        .map(|(index, name)| (name.as_str(), index))
        .collect();

    let field = |row: &[String], name: &str| -> String {
        columns
            .get(name)
            .and_then(|&index| row.get(index))
            .unwrap_or_else(|| panic!("CDX row has no `{name}` column"))
            .clone()
    };

    rows.par_iter()
        .map(|row| Self {
            urlkey: field(row, "urlkey"),
            timestamp: field(row, "timestamp"),
            original: field(row, "original"),
            mimetype: field(row, "mimetype"),
            statuscode: field(row, "statuscode"),
            digest: field(row, "digest"),
            length: field(row, "length"),
        })
        .collect()
}
/// Builds the Wayback Machine address of this capture.
///
/// The result is `https://web.archive.org/web/` followed by the capture's
/// `timestamp`, a slash, and its `original` URL.
pub fn get_wayback_machine_url(&self) -> String {
    let (captured_at, target) = (&self.timestamp, &self.original);
    format!("https://web.archive.org/web/{}/{}", captured_at, target)
}
}