Added full body and date
This commit is contained in:
parent
bca68fba0d
commit
8fb3d99a95
73
Cargo.lock
generated
73
Cargo.lock
generated
@ -102,6 +102,16 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.38"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation"
|
||||
version = "0.9.4"
|
||||
@ -239,12 +249,14 @@ checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6"
|
||||
name = "fermi-news-scraper"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"rayon",
|
||||
"reqwest",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"toml",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -664,6 +676,15 @@ version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.36.4"
|
||||
@ -1200,6 +1221,15 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_spanned"
|
||||
version = "0.6.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eb5b1b31579f3811bf615c144393417496f152e12ac8b7663bf664f4a815306d"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_urlencoded"
|
||||
version = "0.7.1"
|
||||
@ -1443,6 +1473,40 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.8.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_edit",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_datetime"
|
||||
version = "0.6.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_edit"
|
||||
version = "0.22.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b072cee73c449a636ffd6f32bd8de3a9f7119139aff882f44943ce2986dc5cf"
|
||||
dependencies = [
|
||||
"indexmap",
|
||||
"serde",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"winnow",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower"
|
||||
version = "0.4.13"
|
||||
@ -1761,6 +1825,15 @@ version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.6.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "68a9bda4691f099d435ad181000724da8e5899daa10713c2d432552b9ccd3a6f"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.7.35"
|
||||
|
@ -4,9 +4,11 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
chrono = { version = "0.4.38", features = ["serde"], default-features = false }
|
||||
rayon = "1.10.0"
|
||||
reqwest = "0.12.7"
|
||||
scraper = "0.20.0"
|
||||
serde = { version = "1.0.210", features = ["derive"] }
|
||||
serde_json = "1.0.128"
|
||||
tokio = { version = "1.40.0", features = ["macros", "rt-multi-thread"] }
|
||||
toml = "0.8.19"
|
||||
|
10
src/main.rs
10
src/main.rs
@ -5,11 +5,15 @@ use news_scraper::News;
|
||||
|
||||
#[tokio::main(flavor = "multi_thread", worker_threads = 10)]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let news = News::extract_news("https://web.archive.org/web/20171214172259/http://www.fermimn.gov.it/news/index.php?pageno=3")
|
||||
let news = News::extract_news("https://web.archive.org/web/20130609011109/http://www.fermimn.gov.it/news/index.php?pageno=7")
|
||||
.await;
|
||||
|
||||
for element in news {
|
||||
println!("{element:#?}");
|
||||
for (i, element) in news.iter().enumerate() {
|
||||
println!(
|
||||
"[[news.{}]]\n{}",
|
||||
i,
|
||||
toml::to_string_pretty(element).unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
@ -1,5 +1,6 @@
|
||||
mod scrape_20140506;
|
||||
mod scrape_20091228;
|
||||
|
||||
use chrono::NaiveDate;
|
||||
use reqwest::Client;
|
||||
use scraper::Html;
|
||||
|
||||
@ -9,6 +10,8 @@ pub struct News {
|
||||
title: String,
|
||||
image_url: Option<String>,
|
||||
article_url: Option<String>,
|
||||
body: String,
|
||||
date: NaiveDate,
|
||||
}
|
||||
|
||||
impl News {
|
||||
@ -25,6 +28,8 @@ impl News {
|
||||
let document = Html::parse_document(&html);
|
||||
|
||||
// TODO: Select by date
|
||||
scrape_20140506::scrape(&document)
|
||||
let news: Vec<Self> = scrape_20091228::scrape(&document);
|
||||
|
||||
news
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
use super::News;
|
||||
use chrono::NaiveDate;
|
||||
use scraper::{ElementRef, Html, Selector};
|
||||
|
||||
pub fn scrape(document: &Html) -> Vec<News> {
|
||||
@ -6,6 +7,7 @@ pub fn scrape(document: &Html) -> Vec<News> {
|
||||
|
||||
document
|
||||
.select(&selector)
|
||||
.rev()
|
||||
.filter(|x| {
|
||||
// Filter out every div with the "bnewtesto" class that doesn't start
|
||||
// with an h2 of class "titolini"
|
||||
@ -22,12 +24,15 @@ pub fn scrape(document: &Html) -> Vec<News> {
|
||||
.map(|news_element| {
|
||||
let (id, title) = extract_id_and_title(&news_element);
|
||||
let (image_url, article_url) = extract_img_and_article(&news_element);
|
||||
let (body, date) = extract_body_datetime_and_materials(&news_element);
|
||||
|
||||
News {
|
||||
id,
|
||||
title,
|
||||
image_url,
|
||||
article_url,
|
||||
body,
|
||||
date,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<News>>()
|
||||
@ -102,3 +107,24 @@ fn extract_img_and_article(news_element: &ElementRef) -> (Option<String>, Option
|
||||
|
||||
(image_url, article_url)
|
||||
}
|
||||
|
||||
fn extract_body_datetime_and_materials(news_element: &ElementRef) -> (String, NaiveDate) {
|
||||
let body = news_element
|
||||
.child_elements()
|
||||
.find(|x| x.value().name() == "p")
|
||||
.unwrap();
|
||||
|
||||
let datetime = body.child_elements().last().unwrap();
|
||||
let datetime =
|
||||
NaiveDate::parse_from_str(&datetime.inner_html(), "(notizia del %d-%m-%Y)").unwrap();
|
||||
|
||||
// news_element
|
||||
// .child_elements()
|
||||
// .filter(|x| x.value().name() == "p")
|
||||
// .skip(1)
|
||||
// .for_each(|x| println!("{:?}", x));
|
||||
|
||||
// println!("---\n\n");
|
||||
|
||||
(body.inner_html(), datetime)
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user