Added Wayback Machine Search scraper
This commit is contained in:
parent
868ad11719
commit
f3d5424dae
1324
Cargo.lock
generated
1324
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -4,3 +4,8 @@ version = "0.1.0"
|
|||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
rayon = "1.10.0"
|
||||||
|
reqwest = "0.12.7"
|
||||||
|
serde = { version = "1.0.210", features = ["derive"] }
|
||||||
|
serde_json = "1.0.128"
|
||||||
|
tokio = { version = "1.40.0", features = ["macros", "rt-multi-thread"] }
|
||||||
|
@ -13,6 +13,8 @@
|
|||||||
devShells.${system}.default = pkgs.mkShell {
|
devShells.${system}.default = pkgs.mkShell {
|
||||||
|
|
||||||
buildInputs = with pkgs; [
|
buildInputs = with pkgs; [
|
||||||
|
openssl
|
||||||
|
|
||||||
clippy
|
clippy
|
||||||
rustfmt
|
rustfmt
|
||||||
|
|
||||||
|
15
src/main.rs
15
src/main.rs
@ -1,3 +1,14 @@
|
|||||||
fn main() {
|
mod wayback_machine;
|
||||||
println!("Hello, world!");
|
use wayback_machine::Search;
|
||||||
|
|
||||||
|
#[tokio::main(flavor = "multi_thread", worker_threads = 10)]
|
||||||
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
let records = Search::search("fermimn.gov.it").await;
|
||||||
|
|
||||||
|
for row in &records {
|
||||||
|
println!("{:#?}", row.get_wayback_machine_url());
|
||||||
|
}
|
||||||
|
println!("Length: {:#?}", records.len());
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
53
src/wayback_machine/mod.rs
Normal file
53
src/wayback_machine/mod.rs
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
use rayon::prelude::*;
|
||||||
|
use reqwest::Client;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
#[derive(Debug, serde::Deserialize, serde::Serialize)]
|
||||||
|
pub struct Search {
|
||||||
|
urlkey: String,
|
||||||
|
timestamp: String,
|
||||||
|
original: String,
|
||||||
|
mimetype: String,
|
||||||
|
statuscode: String,
|
||||||
|
digest: String,
|
||||||
|
length: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Search {
|
||||||
|
pub async fn search(url: &str) -> Vec<Self> {
|
||||||
|
let url = format!("https://web.archive.org/cdx/search/cdx?output=json&url={url}");
|
||||||
|
|
||||||
|
let response = Client::new()
|
||||||
|
.get(url)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let mut records: Vec<Vec<&str>> = serde_json::from_str(&response).unwrap();
|
||||||
|
let headers = records.remove(0);
|
||||||
|
|
||||||
|
records
|
||||||
|
.par_iter()
|
||||||
|
.map(|x| headers.iter().zip(x).collect::<HashMap<_, _>>())
|
||||||
|
.map(|x| Self {
|
||||||
|
urlkey: (**x.get(&"urlkey").unwrap()).to_string(),
|
||||||
|
timestamp: (**x.get(&"timestamp").unwrap()).to_string(),
|
||||||
|
original: (**x.get(&"original").unwrap()).to_string(),
|
||||||
|
mimetype: (**x.get(&"mimetype").unwrap()).to_string(),
|
||||||
|
statuscode: (**x.get(&"statuscode").unwrap()).to_string(),
|
||||||
|
digest: (**x.get(&"digest").unwrap()).to_string(),
|
||||||
|
length: (**x.get(&"length").unwrap()).to_string(),
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_wayback_machine_url(&self) -> String {
|
||||||
|
format!(
|
||||||
|
"https://web.archive.org/web/{}/{}",
|
||||||
|
self.timestamp, self.original
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user