refactor code and make scraping more robust

This commit is contained in:
Max Hohlfeld 2023-04-12 10:54:45 +02:00
parent 425ff11b5b
commit 6e3e3e6321
4 changed files with 175 additions and 106 deletions

68
src/feed.rs Normal file
View File

@ -0,0 +1,68 @@
use std::{fs::File, sync::Mutex};
use rss::{ChannelBuilder, ItemBuilder};
use time::OffsetDateTime;
use crate::remaining_place::RemainingPlace;
/// Build the RSS channel for the remaining-place feed and write it to `rss_file`.
///
/// The feed always contains exactly one item whose content lists every
/// place, one per line. All failures (clock, file creation, serialization)
/// are logged instead of unwrapped so a scheduled run cannot take the whole
/// process down.
pub fn update_rss_file(places: Vec<RemainingPlace>, url: &str, rss_file: &str) {
    let mut channel = ChannelBuilder::default()
        .title(String::from("LFS Restplatzbörse"))
        .link(url.to_string())
        .description(String::from(
            "Ein RSS Feed der Restplatzbörse der Landesfeuerwehrschule Sachsen. Nicht offiziell.",
        ))
        .language(Some("de-DE".to_string()))
        .build();

    // now_local() fails on systems without timezone information (e.g. some
    // containers); fall back to UTC instead of panicking.
    let now = OffsetDateTime::now_local().unwrap_or_else(|_| OffsetDateTime::now_utc());
    let title = format!(
        "Restplatzbörse Update - {}",
        now.format(time::macros::format_description!(
            "[year]-[month]-[day] [hour]:[minute]"
        ))
        .unwrap_or_default()
    );

    // One "<id> - <description> - <date> - <n> Plätze" line per place.
    // "<br/>" (self-closing) — the previous "</br>" is not valid HTML.
    let content = places
        .iter()
        .map(|place| {
            format!(
                "{} - {} - {} - {} Plätze<br/>",
                place.id, place.description, place.date, place.free
            )
        })
        .collect::<Vec<String>>()
        .join("\n");

    let item = ItemBuilder::default()
        .title(Some(title))
        .content(Some(content))
        .build();
    channel.set_items(vec![item]);

    // This runs inside the scheduler loop: log I/O errors, don't panic.
    match File::create(rss_file) {
        Ok(output) => {
            if let Err(error) = channel.pretty_write_to(output, b' ', 2) {
                println!("Error: {}", error);
            }
        }
        Err(error) => println!("Error: {}", error),
    }
}
/// Compare the freshly scraped places against the snapshot of the previous
/// run.
///
/// Returns `true` when the places changed — in that case the cache behind
/// `last_places` is refreshed with the new data — and `false` when the feed
/// is already up to date.
///
/// Takes `&[RemainingPlace]` instead of `&Vec<_>` (callers passing `&vec`
/// coerce automatically), and uses slice equality instead of a manual
/// len + zip + all comparison.
pub fn should_feed_be_updated(
    new_places: &[RemainingPlace],
    last_places: &Mutex<Vec<RemainingPlace>>,
) -> bool {
    let mut last_places = last_places.lock().unwrap();
    let changed = last_places.as_slice() != new_places;
    if changed {
        // Remember the current snapshot for the next comparison; cheaper
        // than append(&mut new_places.clone()) since no temporary Vec is
        // allocated.
        last_places.clear();
        last_places.extend_from_slice(new_places);
    }
    changed
}

31
src/html_parser.rs Normal file
View File

@ -0,0 +1,31 @@
/// Extract the innermost text content of an HTML fragment.
///
/// Strips everything up to the first `>` and from the last `<` onwards,
/// recursing until no angle brackets remain. Behaves like the previous
/// version on well-formed fragments, but malformed input (empty string,
/// `"<"`, `"><"`, a `<` before the first `>`) now yields an empty or
/// best-effort string instead of panicking on `usize` underflow
/// (`len() - 1` / `end - 1`).
pub fn parse_inner_node(inner_node: &str) -> String {
    if inner_node.is_empty() {
        return String::new();
    }
    // Content starts right after the first '>' (if there is one).
    let start_index = inner_node.find('>').map_or(0, |i| i + 1);
    // Content ends before the last '<', but only when that '<' actually
    // lies at or behind the start marker; otherwise keep everything.
    let end_index = match inner_node.rfind('<') {
        Some(i) if i >= start_index => i,
        _ => inner_node.len(),
    };
    let inner = &inner_node[start_index..end_index];
    if inner.contains('<') || inner.contains('>') {
        // Still wrapped in tags: peel another layer. The recursion always
        // operates on a strictly shorter slice, so it terminates.
        return parse_inner_node(inner);
    }
    inner.to_string()
}
/// Decode the common HTML character references that can appear in scraped
/// table cells.
///
/// Previously only `&nbsp;` was handled; the other frequent entities are
/// now decoded as well. `&amp;` is decoded last so that a sequence such as
/// `&amp;lt;` is not decoded twice.
pub fn replace_html_codes(content: &str) -> String {
    content
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&amp;", "&")
}

View File

@ -1,18 +1,15 @@
use clokwerk::{Scheduler, TimeUnits};
use rss::{ChannelBuilder, ItemBuilder};
use std::fs::File;
use feed::should_feed_be_updated;
use remaining_place::{get_current_places, RemainingPlace};
use std::sync::{Arc, Mutex};
use std::thread;
use std::time::Duration;
use time::OffsetDateTime;
#[derive(Clone, PartialEq, Eq)]
struct RemainingPlace {
id: String,
description: String,
date: String,
free: usize,
}
use crate::feed::update_rss_file;
mod feed;
mod html_parser;
mod remaining_place;
struct Args {
url: String,
@ -22,8 +19,13 @@ struct Args {
fn parse_args() -> Result<Args, pico_args::Error> {
let mut pargs = pico_args::Arguments::from_env();
let url = pargs.opt_value_from_str("--url")?;
let args = Args {
url: pargs.value_from_str("--url")?,
url: match url {
Some(val) => val,
None => String::from("https://www.lfs.sachsen.de/restplatzboerse-5152.html"),
},
rss_file: pargs.value_from_str("--rss-file")?,
};
@ -42,9 +44,8 @@ fn main() {
let last_places: Arc<Mutex<Vec<RemainingPlace>>> = Arc::new(Mutex::new(Vec::new()));
let mut scheduler = Scheduler::new();
scheduler
.every(30.minutes())
.run(move || match get_current_places(&args.url) {
let check_for_places = move || {
match get_current_places(&args.url) {
Ok(places) => {
if should_feed_be_updated(&places, &last_places) {
update_rss_file(places, &args.url, &args.rss_file);
@ -56,101 +57,17 @@ fn main() {
Err(error) => {
println!("Error: {}", error);
}
});
};
};
// now
check_for_places();
// and every 30 min
scheduler.every(30.minutes()).run(check_for_places);
loop {
scheduler.run_pending();
thread::sleep(Duration::from_millis(10));
}
}
// Fetch the remaining-places page and parse each table row into a
// RemainingPlace. Network errors are propagated via `?`; any unexpected
// page layout panics through unwrap(). (This line-oriented parser was
// replaced in this commit by the more robust src/remaining_place.rs
// version.)
fn get_current_places(url: &str) -> Result<Vec<RemainingPlace>, reqwest::Error> {
let body = reqwest::blocking::get(url)?.text()?;
// Narrow the document down to the first <tbody>…</tbody> section.
// "</tbody>" is 8 bytes long, hence the inclusive end + 7.
let start = body.find("<tbody").unwrap();
let end = body.find("</tbody>").unwrap();
let table = &body[start..=(end + 7)];
let mut places: Vec<RemainingPlace> = Vec::new();
let mut iter = table.lines();
while let Some(line) = iter.next() {
// Assumes each <tr> is followed by exactly four cell lines in the
// order id, description, date, free — one source line per cell.
// NOTE(review): unwrap() here panics if a row has fewer lines.
if line.contains("<tr>") {
let id = parse_node(iter.next().unwrap());
let description = parse_node(iter.next().unwrap());
let date = parse_node(iter.next().unwrap());
// free is parsed as usize; a non-numeric cell panics.
let free = parse_node(iter.next().unwrap()).parse().unwrap();
let place = RemainingPlace {
id,
description,
date,
free,
};
places.push(place);
}
}
Ok(places)
}
// Build the single-item RSS feed and write it to rss_file.
// (Replaced in this commit by the src/feed.rs version of the same name.)
fn update_rss_file(places: Vec<RemainingPlace>, url: &str, rss_file: &str) {
let mut channel = ChannelBuilder::default()
.title(String::from("LFS Restplatzbörse"))
.link(url.to_string())
.description(String::from(
"Ein RSS Feed der Restplatzbörse der Landesfeuerwehrschule Sachsen. Nicht offiziell.",
))
.language(Some("de-DE".to_string()))
.build();
// Item title carries the local timestamp of this update.
// NOTE(review): now_local() unwrap panics on hosts without tz data.
let title = format!("Restplatzbörse Update - {}", OffsetDateTime::now_local().unwrap().format(time::macros::format_description!("[year]-[month]-[day] [hour]:[minute]")).unwrap());
// One "<id> - <description> - <date> - <n> Plätze" line per place.
let content = places
.iter()
.map(|place| {
format!(
"{} - {} - {} - {} Plätze</br>",
place.id, place.description, place.date, place.free
)
})
.collect::<Vec<String>>()
.join("\n");
let item = ItemBuilder::default()
.title(Some(title))
.content(Some(content))
.build();
// The feed always contains exactly this one item.
channel.set_items(vec![item]);
// Write pretty-printed XML, indented two spaces.
let output = File::create(rss_file).unwrap();
channel.pretty_write_to(output, ' ' as u8, 2).unwrap();
}
// Returns true when the freshly scraped places differ from the previous
// snapshot, refreshing the snapshot in that case.
// (Replaced in this commit by the src/feed.rs version of the same name.)
fn should_feed_be_updated(
new_places: &Vec<RemainingPlace>,
last_places: &Mutex<Vec<RemainingPlace>>,
) -> bool {
let mut last_places = last_places.lock().unwrap();
// Element-wise comparison; equal lengths plus pairwise equality.
let are_the_same_places = new_places.len() == last_places.len()
&& new_places
.iter()
.zip(last_places.iter())
.all(|(one, two)| one == two);
if !are_the_same_places {
// Replace the cached snapshot with a clone of the new data.
last_places.clear();
last_places.append(&mut new_places.clone());
}
!are_the_same_places
}
/// Extract the text between the first `>` and the first following `</`.
///
/// Unlike the previous version, missing markers no longer panic: without a
/// `>` the content starts at the beginning of the input, without a `</` it
/// runs to the end. Searching for `</` only after the `>` also avoids the
/// start-after-end slice panic on inputs like `"x</a>"`.
fn parse_node(input: &str) -> String {
    let start = input.find('>').map_or(0, |i| i + 1);
    let end = input[start..]
        .find("</")
        .map_or(input.len(), |i| start + i);
    input[start..end].to_string()
}

53
src/remaining_place.rs Normal file
View File

@ -0,0 +1,53 @@
use crate::html_parser::{parse_inner_node, replace_html_codes};
/// One free course offer scraped from the LFS "Restplatzbörse" table.
///
/// `Debug` is derived so instances can be logged; public types should be
/// debuggable.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemainingPlace {
    /// Course identifier (first table column).
    pub id: String,
    /// Human-readable course description (second column).
    pub description: String,
    /// Course date, kept as the raw table text (third column).
    pub date: String,
    /// Number of remaining places (fourth column).
    pub free: usize,
}
/// Scrape the remaining-place table from `url`.
///
/// Network errors are propagated; an unexpected page layout (missing
/// `<tbody>`, a trailing incomplete row, an unparsable number) degrades
/// gracefully instead of panicking, since this runs unattended inside a
/// scheduler. A page without the expected table yields `Ok(vec![])`.
pub fn get_current_places(url: &str) -> Result<Vec<RemainingPlace>, reqwest::Error> {
    let body = reqwest::blocking::get(url)?.text()?;

    // Locate the results table. If the page layout changed, report "no
    // places" instead of panicking on unwrap().
    let table = match (body.find("<tbody"), body.find("</tbody>")) {
        (Some(start), Some(end)) if start < end => &body[start..end],
        _ => return Ok(Vec::new()),
    };

    // Flatten to one long line so a cell may span several source lines.
    let flat = table.replace('\n', "").replace('\r', "");

    // Collect the text content of every <td …>…</td> cell, in order.
    // Skipping only "<td" (3 bytes, not a magic 5) leaves any attribute
    // remainder for parse_inner_node, which strips everything up to the
    // closing '>' of the opening tag — works with and without attributes.
    let mut cells: Vec<String> = Vec::new();
    let mut rest = flat.as_str();
    while let (Some(begin), Some(end)) = (rest.find("<td"), rest.find("</td>")) {
        if begin < end {
            let inner_node = &rest[begin + 3..end];
            let content = parse_inner_node(inner_node);
            cells.push(replace_html_codes(&content));
        }
        // Always advance past this "</td>" (5 bytes) so the loop terminates.
        rest = &rest[end + 5..];
    }

    // Each table row contributes exactly four cells. chunks_exact silently
    // drops a trailing incomplete row instead of panicking on chunk[i],
    // which plain chunks(4) would do.
    let places = cells
        .chunks_exact(4)
        .map(|chunk| RemainingPlace {
            id: chunk[0].clone(),
            description: chunk[1].clone(),
            date: chunk[2].clone(),
            free: chunk[3].parse().unwrap_or(0),
        })
        .collect();

    Ok(places)
}