Added full body and date
This commit is contained in:
parent
bca68fba0d
commit
8fb3d99a95
73
Cargo.lock
generated
73
Cargo.lock
generated
@ -102,6 +102,16 @@ version = "1.0.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chrono"
|
||||||
|
version = "0.4.38"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
|
||||||
|
dependencies = [
|
||||||
|
"num-traits",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "core-foundation"
|
name = "core-foundation"
|
||||||
version = "0.9.4"
|
version = "0.9.4"
|
||||||
@ -239,12 +249,14 @@ checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6"
|
|||||||
name = "fermi-news-scraper"
|
name = "fermi-news-scraper"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"chrono",
|
||||||
"rayon",
|
"rayon",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"scraper",
|
"scraper",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
"toml",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -664,6 +676,15 @@ version = "1.0.6"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
|
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "num-traits"
|
||||||
|
version = "0.2.19"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
|
||||||
|
dependencies = [
|
||||||
|
"autocfg",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "object"
|
name = "object"
|
||||||
version = "0.36.4"
|
version = "0.36.4"
|
||||||
@ -1200,6 +1221,15 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_spanned"
|
||||||
|
version = "0.6.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "eb5b1b31579f3811bf615c144393417496f152e12ac8b7663bf664f4a815306d"
|
||||||
|
dependencies = [
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_urlencoded"
|
name = "serde_urlencoded"
|
||||||
version = "0.7.1"
|
version = "0.7.1"
|
||||||
@ -1443,6 +1473,40 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "toml"
|
||||||
|
version = "0.8.19"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e"
|
||||||
|
dependencies = [
|
||||||
|
"serde",
|
||||||
|
"serde_spanned",
|
||||||
|
"toml_datetime",
|
||||||
|
"toml_edit",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "toml_datetime"
|
||||||
|
version = "0.6.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41"
|
||||||
|
dependencies = [
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "toml_edit"
|
||||||
|
version = "0.22.21"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3b072cee73c449a636ffd6f32bd8de3a9f7119139aff882f44943ce2986dc5cf"
|
||||||
|
dependencies = [
|
||||||
|
"indexmap",
|
||||||
|
"serde",
|
||||||
|
"serde_spanned",
|
||||||
|
"toml_datetime",
|
||||||
|
"winnow",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tower"
|
name = "tower"
|
||||||
version = "0.4.13"
|
version = "0.4.13"
|
||||||
@ -1761,6 +1825,15 @@ version = "0.52.6"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winnow"
|
||||||
|
version = "0.6.18"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "68a9bda4691f099d435ad181000724da8e5899daa10713c2d432552b9ccd3a6f"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "zerocopy"
|
name = "zerocopy"
|
||||||
version = "0.7.35"
|
version = "0.7.35"
|
||||||
|
@ -4,9 +4,11 @@ version = "0.1.0"
|
|||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
chrono = { version = "0.4.38", features = ["serde"], default-features = false }
|
||||||
rayon = "1.10.0"
|
rayon = "1.10.0"
|
||||||
reqwest = "0.12.7"
|
reqwest = "0.12.7"
|
||||||
scraper = "0.20.0"
|
scraper = "0.20.0"
|
||||||
serde = { version = "1.0.210", features = ["derive"] }
|
serde = { version = "1.0.210", features = ["derive"] }
|
||||||
serde_json = "1.0.128"
|
serde_json = "1.0.128"
|
||||||
tokio = { version = "1.40.0", features = ["macros", "rt-multi-thread"] }
|
tokio = { version = "1.40.0", features = ["macros", "rt-multi-thread"] }
|
||||||
|
toml = "0.8.19"
|
||||||
|
10
src/main.rs
10
src/main.rs
@ -5,11 +5,15 @@ use news_scraper::News;
|
|||||||
|
|
||||||
#[tokio::main(flavor = "multi_thread", worker_threads = 10)]
|
#[tokio::main(flavor = "multi_thread", worker_threads = 10)]
|
||||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
let news = News::extract_news("https://web.archive.org/web/20171214172259/http://www.fermimn.gov.it/news/index.php?pageno=3")
|
let news = News::extract_news("https://web.archive.org/web/20130609011109/http://www.fermimn.gov.it/news/index.php?pageno=7")
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
for element in news {
|
for (i, element) in news.iter().enumerate() {
|
||||||
println!("{element:#?}");
|
println!(
|
||||||
|
"[[news.{}]]\n{}",
|
||||||
|
i,
|
||||||
|
toml::to_string_pretty(element).unwrap()
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
mod scrape_20140506;
|
mod scrape_20091228;
|
||||||
|
|
||||||
|
use chrono::NaiveDate;
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
use scraper::Html;
|
use scraper::Html;
|
||||||
|
|
||||||
@ -9,6 +10,8 @@ pub struct News {
|
|||||||
title: String,
|
title: String,
|
||||||
image_url: Option<String>,
|
image_url: Option<String>,
|
||||||
article_url: Option<String>,
|
article_url: Option<String>,
|
||||||
|
body: String,
|
||||||
|
date: NaiveDate,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl News {
|
impl News {
|
||||||
@ -25,6 +28,8 @@ impl News {
|
|||||||
let document = Html::parse_document(&html);
|
let document = Html::parse_document(&html);
|
||||||
|
|
||||||
// TODO: Select by date
|
// TODO: Select by date
|
||||||
scrape_20140506::scrape(&document)
|
let news: Vec<Self> = scrape_20091228::scrape(&document);
|
||||||
|
|
||||||
|
news
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use super::News;
|
use super::News;
|
||||||
|
use chrono::NaiveDate;
|
||||||
use scraper::{ElementRef, Html, Selector};
|
use scraper::{ElementRef, Html, Selector};
|
||||||
|
|
||||||
pub fn scrape(document: &Html) -> Vec<News> {
|
pub fn scrape(document: &Html) -> Vec<News> {
|
||||||
@ -6,6 +7,7 @@ pub fn scrape(document: &Html) -> Vec<News> {
|
|||||||
|
|
||||||
document
|
document
|
||||||
.select(&selector)
|
.select(&selector)
|
||||||
|
.rev()
|
||||||
.filter(|x| {
|
.filter(|x| {
|
||||||
// Filter out every div with the "bnewtesto" class that doesn't start
|
// Filter out every div with the "bnewtesto" class that doesn't start
|
||||||
// with an h2 of class "titolini"
|
// with an h2 of class "titolini"
|
||||||
@ -22,12 +24,15 @@ pub fn scrape(document: &Html) -> Vec<News> {
|
|||||||
.map(|news_element| {
|
.map(|news_element| {
|
||||||
let (id, title) = extract_id_and_title(&news_element);
|
let (id, title) = extract_id_and_title(&news_element);
|
||||||
let (image_url, article_url) = extract_img_and_article(&news_element);
|
let (image_url, article_url) = extract_img_and_article(&news_element);
|
||||||
|
let (body, date) = extract_body_datetime_and_materials(&news_element);
|
||||||
|
|
||||||
News {
|
News {
|
||||||
id,
|
id,
|
||||||
title,
|
title,
|
||||||
image_url,
|
image_url,
|
||||||
article_url,
|
article_url,
|
||||||
|
body,
|
||||||
|
date,
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.collect::<Vec<News>>()
|
.collect::<Vec<News>>()
|
||||||
@ -102,3 +107,24 @@ fn extract_img_and_article(news_element: &ElementRef) -> (Option<String>, Option
|
|||||||
|
|
||||||
(image_url, article_url)
|
(image_url, article_url)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn extract_body_datetime_and_materials(news_element: &ElementRef) -> (String, NaiveDate) {
|
||||||
|
let body = news_element
|
||||||
|
.child_elements()
|
||||||
|
.find(|x| x.value().name() == "p")
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let datetime = body.child_elements().last().unwrap();
|
||||||
|
let datetime =
|
||||||
|
NaiveDate::parse_from_str(&datetime.inner_html(), "(notizia del %d-%m-%Y)").unwrap();
|
||||||
|
|
||||||
|
// news_element
|
||||||
|
// .child_elements()
|
||||||
|
// .filter(|x| x.value().name() == "p")
|
||||||
|
// .skip(1)
|
||||||
|
// .for_each(|x| println!("{:?}", x));
|
||||||
|
|
||||||
|
// println!("---\n\n");
|
||||||
|
|
||||||
|
(body.inner_html(), datetime)
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user