diff --git a/Cargo.lock b/Cargo.lock index 4a9085d..8d8a2cb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -102,6 +102,16 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +dependencies = [ + "num-traits", + "serde", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -239,12 +249,14 @@ checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" name = "fermi-news-scraper" version = "0.1.0" dependencies = [ + "chrono", "rayon", "reqwest", "scraper", "serde", "serde_json", "tokio", + "toml", ] [[package]] @@ -664,6 +676,15 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "object" version = "0.36.4" @@ -1200,6 +1221,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_spanned" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb5b1b31579f3811bf615c144393417496f152e12ac8b7663bf664f4a815306d" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -1443,6 +1473,40 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b072cee73c449a636ffd6f32bd8de3a9f7119139aff882f44943ce2986dc5cf" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", +] + [[package]] name = "tower" version = "0.4.13" @@ -1761,6 +1825,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winnow" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68a9bda4691f099d435ad181000724da8e5899daa10713c2d432552b9ccd3a6f" +dependencies = [ + "memchr", +] + [[package]] name = "zerocopy" version = "0.7.35" diff --git a/Cargo.toml b/Cargo.toml index aa33b1b..25509ac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,9 +4,11 @@ version = "0.1.0" edition = "2021" [dependencies] +chrono = { version = "0.4.38", features = ["serde"], default-features = false } rayon = "1.10.0" reqwest = "0.12.7" scraper = "0.20.0" serde = { version = "1.0.210", features = ["derive"] } serde_json = "1.0.128" tokio = { version = "1.40.0", features = ["macros", "rt-multi-thread"] } +toml = "0.8.19" diff --git a/src/main.rs b/src/main.rs index 6907931..fd2fce6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,11 +5,15 @@ use news_scraper::News; #[tokio::main(flavor = "multi_thread", worker_threads = 10)] async fn main() -> Result<(), Box> { - let news = News::extract_news("https://web.archive.org/web/20171214172259/http://www.fermimn.gov.it/news/index.php?pageno=3") + let news = News::extract_news("https://web.archive.org/web/20130609011109/http://www.fermimn.gov.it/news/index.php?pageno=7") .await; - for element in news { - println!("{element:#?}"); + for (i, element) in news.iter().enumerate() { + println!( + "[[news.{}]]\n{}", + i, + toml::to_string_pretty(element).unwrap() + ); } Ok(()) diff --git a/src/news_scraper/mod.rs b/src/news_scraper/mod.rs index 3da35aa..774470a 100644 --- a/src/news_scraper/mod.rs +++ b/src/news_scraper/mod.rs @@ -1,5 +1,6 @@ -mod scrape_20140506; +mod scrape_20091228; +use chrono::NaiveDate; use reqwest::Client; use scraper::Html; @@ -9,6 +10,8 @@ pub struct News { title: String, image_url: Option, article_url: Option, + body: String, + date: NaiveDate, } impl News { @@ -25,6 +28,8 @@ impl News { let document = Html::parse_document(&html); // TODO: Select by date - scrape_20140506::scrape(&document) + let news: Vec = scrape_20091228::scrape(&document); + + news } } diff --git a/src/news_scraper/scrape_20140506.rs b/src/news_scraper/scrape_20091228.rs similarity index 79% rename from src/news_scraper/scrape_20140506.rs rename to src/news_scraper/scrape_20091228.rs index 1bb324f..05b4532 100644 --- a/src/news_scraper/scrape_20140506.rs +++ b/src/news_scraper/scrape_20091228.rs @@ -1,4 +1,5 @@ use super::News; +use chrono::NaiveDate; use scraper::{ElementRef, Html, Selector}; pub fn scrape(document: &Html) -> Vec { @@ -6,6 +7,7 @@ pub fn scrape(document: &Html) -> Vec { document .select(&selector) + .rev() .filter(|x| { // Filter out every div with the "bnewtesto" class that doesn't start // with an h2 of class "titolini" @@ -22,12 +24,15 @@ pub fn scrape(document: &Html) -> Vec { .map(|news_element| { let (id, title) = extract_id_and_title(&news_element); let (image_url, article_url) = extract_img_and_article(&news_element); + let (body, date) = extract_body_datetime_and_materials(&news_element); News { id, title, image_url, article_url, + body, + date, } }) .collect::>() @@ -102,3 +107,24 @@ fn extract_img_and_article(news_element: &ElementRef) -> (Option, Option (image_url, article_url) } + +fn extract_body_datetime_and_materials(news_element: &ElementRef) -> (String, NaiveDate) { + let body = news_element + .child_elements() + .find(|x| x.value().name() == "p") + .unwrap(); + + let datetime = body.child_elements().last().unwrap(); + let datetime = + NaiveDate::parse_from_str(&datetime.inner_html(), "(notizia del %d-%m-%Y)").unwrap(); + + // news_element + // .child_elements() + // .filter(|x| x.value().name() == "p") + // .skip(1) + // .for_each(|x| println!("{:?}", x)); + + // println!("---\n\n"); + + (body.inner_html(), datetime) +}