Added full body and date

This commit is contained in:
Nicola Belluti 2024-09-17 11:48:51 +02:00
parent bca68fba0d
commit 8fb3d99a95
5 changed files with 115 additions and 5 deletions

73
Cargo.lock generated
View File

@ -102,6 +102,16 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
dependencies = [
"num-traits",
"serde",
]
[[package]]
name = "core-foundation"
version = "0.9.4"
@ -239,12 +249,14 @@ checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6"
name = "fermi-news-scraper"
version = "0.1.0"
dependencies = [
"chrono",
"rayon",
"reqwest",
"scraper",
"serde",
"serde_json",
"tokio",
"toml",
]
[[package]]
@ -664,6 +676,15 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "object"
version = "0.36.4"
@ -1200,6 +1221,15 @@ dependencies = [
"serde",
]
[[package]]
name = "serde_spanned"
version = "0.6.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb5b1b31579f3811bf615c144393417496f152e12ac8b7663bf664f4a815306d"
dependencies = [
"serde",
]
[[package]]
name = "serde_urlencoded"
version = "0.7.1"
@ -1443,6 +1473,40 @@ dependencies = [
"tokio",
]
[[package]]
name = "toml"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e"
dependencies = [
"serde",
"serde_spanned",
"toml_datetime",
"toml_edit",
]
[[package]]
name = "toml_datetime"
version = "0.6.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41"
dependencies = [
"serde",
]
[[package]]
name = "toml_edit"
version = "0.22.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b072cee73c449a636ffd6f32bd8de3a9f7119139aff882f44943ce2986dc5cf"
dependencies = [
"indexmap",
"serde",
"serde_spanned",
"toml_datetime",
"winnow",
]
[[package]]
name = "tower"
version = "0.4.13"
@ -1761,6 +1825,15 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "winnow"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68a9bda4691f099d435ad181000724da8e5899daa10713c2d432552b9ccd3a6f"
dependencies = [
"memchr",
]
[[package]]
name = "zerocopy"
version = "0.7.35"

View File

@ -4,9 +4,11 @@ version = "0.1.0"
edition = "2021"
[dependencies]
chrono = { version = "0.4.38", features = ["serde"], default-features = false }
rayon = "1.10.0"
reqwest = "0.12.7"
scraper = "0.20.0"
serde = { version = "1.0.210", features = ["derive"] }
serde_json = "1.0.128"
tokio = { version = "1.40.0", features = ["macros", "rt-multi-thread"] }
toml = "0.8.19"

View File

@ -5,11 +5,15 @@ use news_scraper::News;
#[tokio::main(flavor = "multi_thread", worker_threads = 10)]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let news = News::extract_news("https://web.archive.org/web/20171214172259/http://www.fermimn.gov.it/news/index.php?pageno=3")
let news = News::extract_news("https://web.archive.org/web/20130609011109/http://www.fermimn.gov.it/news/index.php?pageno=7")
.await;
for element in news {
println!("{element:#?}");
for (i, element) in news.iter().enumerate() {
println!(
"[[news.{}]]\n{}",
i,
toml::to_string_pretty(element).unwrap()
);
}
Ok(())

View File

@ -1,5 +1,6 @@
mod scrape_20140506;
mod scrape_20091228;
use chrono::NaiveDate;
use reqwest::Client;
use scraper::Html;
@ -9,6 +10,8 @@ pub struct News {
title: String,
image_url: Option<String>,
article_url: Option<String>,
body: String,
date: NaiveDate,
}
impl News {
@ -25,6 +28,8 @@ impl News {
let document = Html::parse_document(&html);
// TODO: Select by date
scrape_20140506::scrape(&document)
let news: Vec<Self> = scrape_20091228::scrape(&document);
news
}
}

View File

@ -1,4 +1,5 @@
use super::News;
use chrono::NaiveDate;
use scraper::{ElementRef, Html, Selector};
pub fn scrape(document: &Html) -> Vec<News> {
@ -6,6 +7,7 @@ pub fn scrape(document: &Html) -> Vec<News> {
document
.select(&selector)
.rev()
.filter(|x| {
// Filter out every div with the "bnewtesto" class that doesn't start
// with an h2 of class "titolini"
@ -22,12 +24,15 @@ pub fn scrape(document: &Html) -> Vec<News> {
.map(|news_element| {
let (id, title) = extract_id_and_title(&news_element);
let (image_url, article_url) = extract_img_and_article(&news_element);
let (body, date) = extract_body_datetime_and_materials(&news_element);
News {
id,
title,
image_url,
article_url,
body,
date,
}
})
.collect::<Vec<News>>()
@ -102,3 +107,24 @@ fn extract_img_and_article(news_element: &ElementRef) -> (Option<String>, Option
(image_url, article_url)
}
/// Extracts the body markup and the publication date from a news element.
///
/// The body is the inner HTML of the first direct `<p>` child of
/// `news_element`. The date is read from the last child element of that
/// paragraph, whose inner HTML (ignoring surrounding whitespace) must have
/// the form `(notizia del DD-MM-YYYY)`.
///
/// Note that the returned body is the paragraph's complete inner HTML, so it
/// still includes the markup of the element the date was read from.
///
/// Despite the function name, no "materials" are extracted here; only the
/// body and the date are returned.
///
/// # Panics
///
/// Panics if `news_element` has no direct `<p>` child, if that paragraph has
/// no child elements, or if the date text does not match the expected
/// format. The signature offers no error channel, so malformed pages abort
/// with a message naming the failed expectation.
fn extract_body_datetime_and_materials(news_element: &ElementRef) -> (String, NaiveDate) {
    let body = news_element
        .child_elements()
        .find(|child| child.value().name() == "p")
        .expect("news element should have a <p> child holding the body");

    // The date lives in the last child element of the body paragraph.
    let date_element = body
        .child_elements()
        .last()
        .expect("body paragraph should have a child element holding the date");

    // Trim so that whitespace around the date text in the markup does not
    // make an otherwise valid date fail to parse.
    let date_text = date_element.inner_html();
    let date = NaiveDate::parse_from_str(date_text.trim(), "(notizia del %d-%m-%Y)")
        .unwrap_or_else(|err| panic!("unparseable news date {date_text:?}: {err}"));

    (body.inner_html(), date)
}