Added Wayback Machine Search scraper
This commit is contained in:
parent
868ad11719
commit
f3d5424dae
1324
Cargo.lock
generated
1324
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -4,3 +4,8 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
rayon = "1.10.0"
|
||||
reqwest = "0.12.7"
|
||||
serde = { version = "1.0.210", features = ["derive"] }
|
||||
serde_json = "1.0.128"
|
||||
tokio = { version = "1.40.0", features = ["macros", "rt-multi-thread"] }
|
||||
|
@ -13,6 +13,8 @@
|
||||
devShells.${system}.default = pkgs.mkShell {
|
||||
|
||||
buildInputs = with pkgs; [
|
||||
openssl
|
||||
|
||||
clippy
|
||||
rustfmt
|
||||
|
||||
|
15
src/main.rs
15
src/main.rs
@ -1,3 +1,14 @@
|
||||
fn main() {
|
||||
println!("Hello, world!");
|
||||
mod wayback_machine;
|
||||
use wayback_machine::Search;
|
||||
|
||||
#[tokio::main(flavor = "multi_thread", worker_threads = 10)]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let records = Search::search("fermimn.gov.it").await;
|
||||
|
||||
for row in &records {
|
||||
println!("{:#?}", row.get_wayback_machine_url());
|
||||
}
|
||||
println!("Length: {:#?}", records.len());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
53
src/wayback_machine/mod.rs
Normal file
53
src/wayback_machine/mod.rs
Normal file
@ -0,0 +1,53 @@
|
||||
use rayon::prelude::*;
|
||||
use reqwest::Client;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[derive(Debug, serde::Deserialize, serde::Serialize)]
|
||||
pub struct Search {
|
||||
urlkey: String,
|
||||
timestamp: String,
|
||||
original: String,
|
||||
mimetype: String,
|
||||
statuscode: String,
|
||||
digest: String,
|
||||
length: String,
|
||||
}
|
||||
|
||||
impl Search {
|
||||
pub async fn search(url: &str) -> Vec<Self> {
|
||||
let url = format!("https://web.archive.org/cdx/search/cdx?output=json&url={url}");
|
||||
|
||||
let response = Client::new()
|
||||
.get(url)
|
||||
.send()
|
||||
.await
|
||||
.unwrap()
|
||||
.text()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut records: Vec<Vec<&str>> = serde_json::from_str(&response).unwrap();
|
||||
let headers = records.remove(0);
|
||||
|
||||
records
|
||||
.par_iter()
|
||||
.map(|x| headers.iter().zip(x).collect::<HashMap<_, _>>())
|
||||
.map(|x| Self {
|
||||
urlkey: (**x.get(&"urlkey").unwrap()).to_string(),
|
||||
timestamp: (**x.get(&"timestamp").unwrap()).to_string(),
|
||||
original: (**x.get(&"original").unwrap()).to_string(),
|
||||
mimetype: (**x.get(&"mimetype").unwrap()).to_string(),
|
||||
statuscode: (**x.get(&"statuscode").unwrap()).to_string(),
|
||||
digest: (**x.get(&"digest").unwrap()).to_string(),
|
||||
length: (**x.get(&"length").unwrap()).to_string(),
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn get_wayback_machine_url(&self) -> String {
|
||||
format!(
|
||||
"https://web.archive.org/web/{}/{}",
|
||||
self.timestamp, self.original
|
||||
)
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user