Started the scraper for the 2014 version of the website
This commit is contained in:
src/main.rs — 14 changed lines (@@ -1,14 +1,16 @@)
|
||||
mod wayback_machine;
|
||||
use wayback_machine::Search;
|
||||
// mod wayback_machine;
|
||||
mod news_scraper;
|
||||
|
||||
use news_scraper::News;
|
||||
|
||||
#[tokio::main(flavor = "multi_thread", worker_threads = 10)]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let records = Search::search("fermimn.gov.it").await;
|
||||
let news = News::extract_news("https://web.archive.org/web/20171214172259/http://www.fermimn.gov.it/news/index.php?pageno=3")
|
||||
.await;
|
||||
|
||||
for row in &records {
|
||||
println!("{:#?}", row.get_wayback_machine_url());
|
||||
for element in news {
|
||||
println!("{element:#?}");
|
||||
}
|
||||
println!("Length: {:#?}", records.len());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
src/news_scraper/mod.rs — 30 lines, new file (@@ -0,0 +1,30 @@)
|
||||
mod scrape_20140506;
|
||||
|
||||
use reqwest::Client;
|
||||
use scraper::Html;
|
||||
|
||||
/// A single news entry scraped from an archived page of the school website.
#[derive(Debug, serde::Deserialize, serde::Serialize)]
pub struct News {
    // Numeric id parsed from the title element's `id="newsNNN"` attribute.
    id: u32,
    // Inner HTML of the `h2.titolini` title element.
    title: String,
    // Thumbnail URL rebased onto the Wayback Machine host, when present.
    image_url: Option<String>,
    // Full-article URL rebased onto the Wayback Machine host, when present.
    article_url: Option<String>,
}
impl News {
|
||||
pub async fn extract_news(url: &str) -> Vec<Self> {
|
||||
let html = Client::new()
|
||||
.get(url)
|
||||
.send()
|
||||
.await
|
||||
.unwrap()
|
||||
.text()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let document = Html::parse_document(&html);
|
||||
|
||||
// TODO: Select by date
|
||||
scrape_20140506::scrape(&document)
|
||||
}
|
||||
}
|
src/news_scraper/scrape_20140506.rs — 104 lines, new file (@@ -0,0 +1,104 @@)
|
||||
use super::News;
|
||||
use scraper::{ElementRef, Html, Selector};
|
||||
|
||||
pub fn scrape(document: &Html) -> Vec<News> {
|
||||
let selector = Selector::parse("div.bnewtesto").unwrap();
|
||||
|
||||
document
|
||||
.select(&selector)
|
||||
.filter(|x| {
|
||||
// Filter out every div with the "bnewtesto" class that doesn't start
|
||||
// with an h2 of class "titolini"
|
||||
x.child_elements().next().map_or(false, |first_children| {
|
||||
let first_children = first_children.value();
|
||||
|
||||
first_children.name() == "h2"
|
||||
&& first_children
|
||||
.attr("class")
|
||||
.unwrap_or("")
|
||||
.contains("titolini")
|
||||
})
|
||||
})
|
||||
.map(|news_element| {
|
||||
let (id, title) = extract_id_and_title(&news_element);
|
||||
let (image_url, article_url) = extract_img_and_article(&news_element);
|
||||
|
||||
News {
|
||||
id,
|
||||
title,
|
||||
image_url,
|
||||
article_url,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<News>>()
|
||||
}
|
||||
|
||||
fn extract_id_and_title(news_element: &ElementRef) -> (u32, String) {
|
||||
// Get the title element to extract the title and the ID
|
||||
let title_element = news_element.child_elements().next().unwrap();
|
||||
if title_element.value().name() != "h2"
|
||||
|| !title_element
|
||||
.value()
|
||||
.attr("id")
|
||||
.unwrap_or("")
|
||||
.starts_with("news")
|
||||
|| !title_element
|
||||
.value()
|
||||
.attr("class")
|
||||
.unwrap_or("")
|
||||
.contains("titolini")
|
||||
{
|
||||
todo!()
|
||||
// return Err("First element must be the title");
|
||||
}
|
||||
|
||||
// Get the title and the ID
|
||||
let title = title_element.inner_html();
|
||||
let id = title_element
|
||||
.attr("id")
|
||||
.unwrap()
|
||||
.strip_prefix("news")
|
||||
.unwrap()
|
||||
.parse()
|
||||
.unwrap();
|
||||
|
||||
(id, title)
|
||||
}
|
||||
|
||||
fn extract_img_and_article(news_element: &ElementRef) -> (Option<String>, Option<String>) {
|
||||
let second_element = news_element.child_elements().nth(1).unwrap();
|
||||
|
||||
let mut article_url = None;
|
||||
let image_url = if second_element.value().name() == "div"
|
||||
&& second_element
|
||||
.value()
|
||||
.attr("class")
|
||||
.unwrap_or("")
|
||||
.contains("img")
|
||||
{
|
||||
let mut image = second_element.child_elements().next().unwrap();
|
||||
|
||||
if image.value().name() == "a" {
|
||||
article_url = Some(format!(
|
||||
"https://web.archive.org{}",
|
||||
image.attr("href").unwrap()
|
||||
));
|
||||
image = image.child_elements().next().unwrap();
|
||||
}
|
||||
|
||||
if image.value().name() != "img" {
|
||||
// TODO: Check "alt" and "title"
|
||||
todo!();
|
||||
// return Err("Found div '.img' but it hasn't got any image");
|
||||
}
|
||||
|
||||
Some(format!(
|
||||
"https://web.archive.org{}",
|
||||
image.attr("src").unwrap()
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
(image_url, article_url)
|
||||
}
|
Reference in New Issue
Block a user