Started the scraper for the 2014 version of the website

This commit is contained in:
2024-09-14 17:05:03 +02:00
parent f3d5424dae
commit bca68fba0d
5 changed files with 601 additions and 6 deletions

View File

@@ -1,14 +1,16 @@
// NOTE(review): the lines below are a diff hunk with the +/- markers stripped —
// removed and added lines are interleaved. Comments mark which is which.
// Removed in this commit: the wayback_machine module and its Search entry point.
mod wayback_machine;
use wayback_machine::Search;
// Added: wayback_machine is kept around commented out while the new
// news_scraper module is wired in instead.
// mod wayback_machine;
mod news_scraper;
use news_scraper::News;
// Multi-threaded tokio runtime with 10 worker threads.
#[tokio::main(flavor = "multi_thread", worker_threads = 10)]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Removed: old flow queried the Wayback Machine CDX records for the domain.
let records = Search::search("fermimn.gov.it").await;
// Added: new flow scrapes one archived news index page (a 2017 snapshot of
// the 2014-era site) and pretty-prints every extracted item.
let news = News::extract_news("https://web.archive.org/web/20171214172259/http://www.fermimn.gov.it/news/index.php?pageno=3")
.await;
// Removed: old loop printed each record's Wayback Machine URL.
for row in &records {
println!("{:#?}", row.get_wayback_machine_url());
// Added: new loop debug-prints each scraped news item.
for element in news {
println!("{element:#?}");
}
// Removed along with the records-based flow.
println!("Length: {:#?}", records.len());
Ok(())
}

30
src/news_scraper/mod.rs Normal file
View File

@@ -0,0 +1,30 @@
mod scrape_20140506;
use reqwest::Client;
use scraper::Html;
// One news item scraped from an archived fermimn.gov.it news index page.
#[derive(Debug, serde::Deserialize, serde::Serialize)]
pub struct News {
// Numeric id parsed from the title element's "newsNNN" id attribute.
id: u32,
// Inner HTML of the h2.titolini title heading.
title: String,
// Absolute web.archive.org URL of the news image, when one is present.
image_url: Option<String>,
// Absolute web.archive.org URL of the full article, when the image links to it.
article_url: Option<String>,
}
impl News {
    /// Downloads `url`, parses the response body as HTML and runs the
    /// 2014-05-06 layout scraper over it, returning every news item found.
    ///
    /// # Panics
    /// Panics if the request fails or the body cannot be read — error
    /// handling for I/O is still a stub (`unwrap`).
    pub async fn extract_news(url: &str) -> Vec<Self> {
        let response = Client::new().get(url).send().await.unwrap();
        let body = response.text().await.unwrap();
        let document = Html::parse_document(&body);
        // TODO: Select by date
        scrape_20140506::scrape(&document)
    }
}

View File

@@ -0,0 +1,104 @@
use super::News;
use scraper::{ElementRef, Html, Selector};
/// Extracts every news entry from a page using the 2014-05-06 layout.
///
/// A candidate is any `div.bnewtesto` whose first child element is an
/// `h2` carrying the `titolini` class; every other div with that class
/// (e.g. body-only fragments) is skipped.
pub fn scrape(document: &Html) -> Vec<News> {
    let container = Selector::parse("div.bnewtesto").unwrap();
    let mut items = Vec::new();
    for candidate in document.select(&container) {
        // Keep only divs that open with the news title heading.
        let starts_with_title = match candidate.child_elements().next() {
            Some(first) => {
                let el = first.value();
                el.name() == "h2" && el.attr("class").unwrap_or("").contains("titolini")
            }
            None => false,
        };
        if !starts_with_title {
            continue;
        }
        let (id, title) = extract_id_and_title(&candidate);
        let (image_url, article_url) = extract_img_and_article(&candidate);
        items.push(News {
            id,
            title,
            image_url,
            article_url,
        });
    }
    items
}
/// Reads the numeric id and the title text out of a news element.
///
/// The first child must be an `h2.titolini` whose `id` attribute has the
/// form `newsNNN`; `NNN` becomes the returned id and the heading's inner
/// HTML becomes the title.
///
/// # Panics
/// Hits `todo!()` when the first child is not such a heading — proper
/// error handling is still a stub.
fn extract_id_and_title(news_element: &ElementRef) -> (u32, String) {
    // Get the title element to extract the title and the ID
    let heading = news_element.child_elements().next().unwrap();
    let el = heading.value();
    let looks_like_title = el.name() == "h2"
        && el.attr("id").unwrap_or("").starts_with("news")
        && el.attr("class").unwrap_or("").contains("titolini");
    if !looks_like_title {
        todo!()
        // return Err("First element must be the title");
    }
    // "newsNNN" -> NNN
    let id = heading
        .attr("id")
        .unwrap()
        .strip_prefix("news")
        .unwrap()
        .parse()
        .unwrap();
    (id, heading.inner_html())
}
/// Pulls the image URL and (when the image is wrapped in a link) the
/// article URL out of a news element.
///
/// Expects the element's second child to be a `div` whose class contains
/// "img", holding either `<img>` directly or `<a href=…><img …></a>`.
/// Returns `(image_url, article_url)`; both are `None` when no image div
/// is present. Relative URLs are made absolute against web.archive.org.
///
/// # Panics
/// Hits `todo!()` when the image div's first child chain does not end in
/// an `<img>` — error handling is still a stub.
fn extract_img_and_article(news_element: &ElementRef) -> (Option<String>, Option<String>) {
    const ARCHIVE_HOST: &str = "https://web.archive.org";
    // Robustness fix: a news entry consisting of a lone title has no second
    // child; the previous `.nth(1).unwrap()` panicked on it. Treat that case
    // as "no image, no article link" instead.
    let Some(second_element) = news_element.child_elements().nth(1) else {
        return (None, None);
    };
    let is_img_div = second_element.value().name() == "div"
        && second_element
            .value()
            .attr("class")
            .unwrap_or("")
            .contains("img");
    if !is_img_div {
        return (None, None);
    }
    let mut article_url = None;
    let mut image = second_element.child_elements().next().unwrap();
    if image.value().name() == "a" {
        // The image doubles as a link to the full article.
        article_url = Some(format!("{ARCHIVE_HOST}{}", image.attr("href").unwrap()));
        image = image.child_elements().next().unwrap();
    }
    if image.value().name() != "img" {
        // TODO: Check "alt" and "title"
        todo!();
        // return Err("Found div '.img' but it hasn't got any image");
    }
    let image_url = Some(format!("{ARCHIVE_HOST}{}", image.attr("src").unwrap()));
    (image_url, article_url)
}