Added Wayback Machine Search scraper
This commit is contained in:
15
src/main.rs
15
src/main.rs
@@ -1,3 +1,14 @@
|
||||
fn main() {
|
||||
println!("Hello, world!");
|
||||
mod wayback_machine;
|
||||
use wayback_machine::Search;
|
||||
|
||||
#[tokio::main(flavor = "multi_thread", worker_threads = 10)]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let records = Search::search("fermimn.gov.it").await;
|
||||
|
||||
for row in &records {
|
||||
println!("{:#?}", row.get_wayback_machine_url());
|
||||
}
|
||||
println!("Length: {:#?}", records.len());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
53
src/wayback_machine/mod.rs
Normal file
53
src/wayback_machine/mod.rs
Normal file
@@ -0,0 +1,53 @@
|
||||
use rayon::prelude::*;
|
||||
use reqwest::Client;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[derive(Debug, serde::Deserialize, serde::Serialize)]
|
||||
pub struct Search {
|
||||
urlkey: String,
|
||||
timestamp: String,
|
||||
original: String,
|
||||
mimetype: String,
|
||||
statuscode: String,
|
||||
digest: String,
|
||||
length: String,
|
||||
}
|
||||
|
||||
impl Search {
|
||||
pub async fn search(url: &str) -> Vec<Self> {
|
||||
let url = format!("https://web.archive.org/cdx/search/cdx?output=json&url={url}");
|
||||
|
||||
let response = Client::new()
|
||||
.get(url)
|
||||
.send()
|
||||
.await
|
||||
.unwrap()
|
||||
.text()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut records: Vec<Vec<&str>> = serde_json::from_str(&response).unwrap();
|
||||
let headers = records.remove(0);
|
||||
|
||||
records
|
||||
.par_iter()
|
||||
.map(|x| headers.iter().zip(x).collect::<HashMap<_, _>>())
|
||||
.map(|x| Self {
|
||||
urlkey: (**x.get(&"urlkey").unwrap()).to_string(),
|
||||
timestamp: (**x.get(&"timestamp").unwrap()).to_string(),
|
||||
original: (**x.get(&"original").unwrap()).to_string(),
|
||||
mimetype: (**x.get(&"mimetype").unwrap()).to_string(),
|
||||
statuscode: (**x.get(&"statuscode").unwrap()).to_string(),
|
||||
digest: (**x.get(&"digest").unwrap()).to_string(),
|
||||
length: (**x.get(&"length").unwrap()).to_string(),
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn get_wayback_machine_url(&self) -> String {
|
||||
format!(
|
||||
"https://web.archive.org/web/{}/{}",
|
||||
self.timestamp, self.original
|
||||
)
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user