From 6e3e3e6321a0e6c4898081254280aba5749a1f5c Mon Sep 17 00:00:00 2001
From: Max Hohlfeld
Date: Wed, 12 Apr 2023 10:54:45 +0200
Subject: [PATCH] refactor code and make scraping more robust

---
 src/feed.rs            |  68 ++++++++++++++++++++++
 src/html_parser.rs     |  31 ++++++++++
 src/main.rs            | 129 ++++++++---------------------
 src/remaining_place.rs |  53 +++++++++++++++++
 4 files changed, 175 insertions(+), 106 deletions(-)
 create mode 100644 src/feed.rs
 create mode 100644 src/html_parser.rs
 create mode 100644 src/remaining_place.rs

diff --git a/src/feed.rs b/src/feed.rs
new file mode 100644
index 0000000..b4cefc3
--- /dev/null
+++ b/src/feed.rs
@@ -0,0 +1,68 @@
+use std::{fs::File, sync::Mutex};
+
+use rss::{ChannelBuilder, ItemBuilder};
+use time::OffsetDateTime;
+
+use crate::remaining_place::RemainingPlace;
+
+pub fn update_rss_file(places: Vec<RemainingPlace>, url: &str, rss_file: &str) {
+    let mut channel = ChannelBuilder::default()
+        .title(String::from("LFS Restplatzbörse"))
+        .link(url.to_string())
+        .description(String::from(
+            "Ein RSS Feed der Restplatzbörse der Landesfeuerwehrschule Sachsen. Nicht offiziell.",
+        ))
+        .language(Some("de-DE".to_string()))
+        .build();
+
+    let title = format!(
+        "Restplatzbörse Update - {}",
+        OffsetDateTime::now_local()
+            .unwrap()
+            .format(time::macros::format_description!(
+                "[year]-[month]-[day] [hour]:[minute]"
+            ))
+            .unwrap()
+    );
+
+    let content = places
+        .iter()
+        .map(|place| {
+            format!(
+                "{} - {} - {} - {} Plätze<br>",
", + place.id, place.description, place.date, place.free + ) + }) + .collect::>() + .join("\n"); + + let item = ItemBuilder::default() + .title(Some(title)) + .content(Some(content)) + .build(); + + channel.set_items(vec![item]); + + let output = File::create(rss_file).unwrap(); + channel.pretty_write_to(output, ' ' as u8, 2).unwrap(); +} + +pub fn should_feed_be_updated( + new_places: &Vec, + last_places: &Mutex>, +) -> bool { + let mut last_places = last_places.lock().unwrap(); + + let are_the_same_places = new_places.len() == last_places.len() + && new_places + .iter() + .zip(last_places.iter()) + .all(|(one, two)| one == two); + + if !are_the_same_places { + last_places.clear(); + last_places.append(&mut new_places.clone()); + } + + !are_the_same_places +} diff --git a/src/html_parser.rs b/src/html_parser.rs new file mode 100644 index 0000000..c0b36af --- /dev/null +++ b/src/html_parser.rs @@ -0,0 +1,31 @@ +pub fn parse_inner_node(inner_node: &str) -> String { + let start = inner_node.find(">"); + let end = inner_node.rfind("<"); + + let mut start_index = 0; + let mut end_index = inner_node.len() - 1; + + if let Some(start) = start { + start_index = start + 1; + } + + if let Some(end) = end { + let new_end = end - 1; + + if new_end >= start_index { + end_index = new_end + } + } + + let inner_inner_node = &inner_node[start_index..=end_index]; + + if inner_inner_node.contains("<") || inner_inner_node.contains(">") { + return parse_inner_node(inner_inner_node); + } + + inner_inner_node.to_string() +} + +pub fn replace_html_codes(content: &str) -> String { + content.replace(" ", " ") +} diff --git a/src/main.rs b/src/main.rs index 3928a1c..951e09b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,18 +1,15 @@ use clokwerk::{Scheduler, TimeUnits}; -use rss::{ChannelBuilder, ItemBuilder}; -use std::fs::File; +use feed::should_feed_be_updated; +use remaining_place::{get_current_places, RemainingPlace}; use std::sync::{Arc, Mutex}; use std::thread; use std::time::Duration; -use time::OffsetDateTime; -#[derive(Clone, PartialEq, Eq)] -struct RemainingPlace { - id: String, - description: String, - date: String, - free: usize, -} +use crate::feed::update_rss_file; + +mod feed; +mod html_parser; +mod remaining_place; struct Args { url: String, @@ -22,8 +19,13 @@ struct Args { fn parse_args() -> Result { let mut pargs = pico_args::Arguments::from_env(); + let url = pargs.opt_value_from_str("--url")?; + let args = Args { - url: pargs.value_from_str("--url")?, + url: match url { + Some(val) => val, + None => String::from("https://www.lfs.sachsen.de/restplatzboerse-5152.html"), + }, rss_file: pargs.value_from_str("--rss-file")?, }; @@ -42,9 +44,8 @@ fn main() { let last_places: Arc>> = Arc::new(Mutex::new(Vec::new())); let mut scheduler = Scheduler::new(); - scheduler - .every(30.minutes()) - .run(move || match get_current_places(&args.url) { + let check_for_places = move || { + match get_current_places(&args.url) { Ok(places) => { if should_feed_be_updated(&places, &last_places) { update_rss_file(places, &args.url, &args.rss_file); @@ -56,101 +57,17 @@ fn main() { Err(error) => { println!("Error: {}", error); } - }); + }; + }; + + // now + check_for_places(); + + // and every 30 min + scheduler.every(30.minutes()).run(check_for_places); loop { scheduler.run_pending(); thread::sleep(Duration::from_millis(10)); } } - -fn get_current_places(url: &str) -> Result, reqwest::Error> { - let body = reqwest::blocking::get(url)?.text()?; - - let start = body.find("").unwrap(); - - let table = &body[start..=(end + 
-
-    let mut places: Vec<RemainingPlace> = Vec::new();
-    let mut iter = table.lines();
-
-    while let Some(line) = iter.next() {
-        if line.contains("<tr>") {
-            let id = parse_node(iter.next().unwrap());
-            let description = parse_node(iter.next().unwrap());
-            let date = parse_node(iter.next().unwrap());
-            let free = parse_node(iter.next().unwrap()).parse().unwrap();
-
-            let place = RemainingPlace {
-                id,
-                description,
-                date,
-                free,
-            };
-            places.push(place);
-        }
-    }
-
-    Ok(places)
-}
-
-fn update_rss_file(places: Vec<RemainingPlace>, url: &str, rss_file: &str) {
-    let mut channel = ChannelBuilder::default()
-        .title(String::from("LFS Restplatzbörse"))
-        .link(url.to_string())
-        .description(String::from(
-            "Ein RSS Feed der Restplatzbörse der Landesfeuerwehrschule Sachsen. Nicht offiziell.",
-        ))
-        .language(Some("de-DE".to_string()))
-        .build();
-
-    let title = format!("Restplatzbörse Update - {}", OffsetDateTime::now_local().unwrap().format(time::macros::format_description!("[year]-[month]-[day] [hour]:[minute]")).unwrap());
-    let content = places
-        .iter()
-        .map(|place| {
-            format!(
-                "{} - {} - {} - {} Plätze<br>",
", - place.id, place.description, place.date, place.free - ) - }) - .collect::>() - .join("\n"); - - let item = ItemBuilder::default() - .title(Some(title)) - .content(Some(content)) - .build(); - - channel.set_items(vec![item]); - - let output = File::create(rss_file).unwrap(); - channel.pretty_write_to(output, ' ' as u8, 2).unwrap(); -} - -fn should_feed_be_updated( - new_places: &Vec, - last_places: &Mutex>, -) -> bool { - let mut last_places = last_places.lock().unwrap(); - - let are_the_same_places = new_places.len() == last_places.len() - && new_places - .iter() - .zip(last_places.iter()) - .all(|(one, two)| one == two); - - if !are_the_same_places { - last_places.clear(); - last_places.append(&mut new_places.clone()); - } - - !are_the_same_places -} - -fn parse_node(input: &str) -> String { - let start = input.find(">").unwrap(); - let end = input.find(" Result, reqwest::Error> { + let body = reqwest::blocking::get(url)?.text()?; + + let start = body.find("").unwrap(); + + let table = &body[start..=(end + 7)]; + + let mut places: Vec = Vec::new(); + let mut lines: Vec = Vec::new(); + + let mut line = table.replace("\n", "").replace("\r", ""); + + while let Some(begin) = line.find("") { + Some(end) => { + let inner_node = &line[(begin + 5)..end]; + let content = parse_inner_node(inner_node); + let escaped_content = replace_html_codes(&content); + + lines.push(escaped_content); + line.replace_range(begin..=end + 5, ""); + } + None => break, + } + } + + lines.chunks(4).for_each(|chunk| { + let new_remaining_place = RemainingPlace { + id: chunk[0].clone(), + description: chunk[1].clone(), + date: chunk[2].clone(), + free: match chunk[3].parse() { + Ok(value) => value, + Err(_) => 0, + }, + }; + + places.push(new_remaining_place); + }); + + Ok(places) +}