refactor code and make scraping more robust

This commit is contained in:
Max Hohlfeld 2023-04-12 10:54:45 +02:00
parent 425ff11b5b
commit 6e3e3e6321
4 changed files with 175 additions and 106 deletions

68
src/feed.rs Normal file
View File

@ -0,0 +1,68 @@
use std::{fs::File, sync::Mutex};
use rss::{ChannelBuilder, ItemBuilder};
use time::OffsetDateTime;
use crate::remaining_place::RemainingPlace;
/// Build the RSS channel for the remaining-place feed and write it to `rss_file`.
///
/// The feed always contains exactly one item whose content lists every
/// place, one per line. All failures (clock, file creation, serialization)
/// are logged instead of unwrapped so a scheduled run cannot take the whole
/// process down.
pub fn update_rss_file(places: Vec<RemainingPlace>, url: &str, rss_file: &str) {
    let mut channel = ChannelBuilder::default()
        .title(String::from("LFS Restplatzbörse"))
        .link(url.to_string())
        .description(String::from(
            "Ein RSS Feed der Restplatzbörse der Landesfeuerwehrschule Sachsen. Nicht offiziell.",
        ))
        .language(Some("de-DE".to_string()))
        .build();

    // now_local() fails on systems without timezone information (e.g. some
    // containers); fall back to UTC instead of panicking.
    let now = OffsetDateTime::now_local().unwrap_or_else(|_| OffsetDateTime::now_utc());
    let title = format!(
        "Restplatzbörse Update - {}",
        now.format(time::macros::format_description!(
            "[year]-[month]-[day] [hour]:[minute]"
        ))
        .unwrap_or_default()
    );

    // One "<id> - <description> - <date> - <n> Plätze" line per place.
    // "<br/>" (self-closing) — the previous "</br>" is not valid HTML.
    let content = places
        .iter()
        .map(|place| {
            format!(
                "{} - {} - {} - {} Plätze<br/>",
                place.id, place.description, place.date, place.free
            )
        })
        .collect::<Vec<String>>()
        .join("\n");

    let item = ItemBuilder::default()
        .title(Some(title))
        .content(Some(content))
        .build();
    channel.set_items(vec![item]);

    // This runs inside the scheduler loop: log I/O errors, don't panic.
    match File::create(rss_file) {
        Ok(output) => {
            if let Err(error) = channel.pretty_write_to(output, b' ', 2) {
                println!("Error: {}", error);
            }
        }
        Err(error) => println!("Error: {}", error),
    }
}
/// Compare the freshly scraped places against the snapshot of the previous
/// run.
///
/// Returns `true` when the places changed — in that case the cache behind
/// `last_places` is refreshed with the new data — and `false` when the feed
/// is already up to date.
///
/// Takes `&[RemainingPlace]` instead of `&Vec<_>` (callers passing `&vec`
/// coerce automatically), and uses slice equality instead of a manual
/// len + zip + all comparison.
pub fn should_feed_be_updated(
    new_places: &[RemainingPlace],
    last_places: &Mutex<Vec<RemainingPlace>>,
) -> bool {
    let mut last_places = last_places.lock().unwrap();
    let changed = last_places.as_slice() != new_places;
    if changed {
        // Remember the current snapshot for the next comparison; cheaper
        // than append(&mut new_places.clone()) since no temporary Vec is
        // allocated.
        last_places.clear();
        last_places.extend_from_slice(new_places);
    }
    changed
}

31
src/html_parser.rs Normal file
View File

@ -0,0 +1,31 @@
/// Extract the innermost text content of an HTML fragment.
///
/// Strips everything up to the first `>` and from the last `<` onwards,
/// recursing until no angle brackets remain. Behaves like the previous
/// version on well-formed fragments, but malformed input (empty string,
/// `"<"`, `"><"`, a `<` before the first `>`) now yields an empty or
/// best-effort string instead of panicking on `usize` underflow
/// (`len() - 1` / `end - 1`).
pub fn parse_inner_node(inner_node: &str) -> String {
    if inner_node.is_empty() {
        return String::new();
    }
    // Content starts right after the first '>' (if there is one).
    let start_index = inner_node.find('>').map_or(0, |i| i + 1);
    // Content ends before the last '<', but only when that '<' actually
    // lies at or behind the start marker; otherwise keep everything.
    let end_index = match inner_node.rfind('<') {
        Some(i) if i >= start_index => i,
        _ => inner_node.len(),
    };
    let inner = &inner_node[start_index..end_index];
    if inner.contains('<') || inner.contains('>') {
        // Still wrapped in tags: peel another layer. The recursion always
        // operates on a strictly shorter slice, so it terminates.
        return parse_inner_node(inner);
    }
    inner.to_string()
}
/// Decode the common HTML character references that can appear in scraped
/// table cells.
///
/// Previously only `&nbsp;` was handled; the other frequent entities are
/// now decoded as well. `&amp;` is decoded last so that a sequence such as
/// `&amp;lt;` is not decoded twice.
pub fn replace_html_codes(content: &str) -> String {
    content
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&amp;", "&")
}

View File

@ -1,18 +1,15 @@
use clokwerk::{Scheduler, TimeUnits};
use rss::{ChannelBuilder, ItemBuilder};
use std::fs::File;
use feed::should_feed_be_updated;
use remaining_place::{get_current_places, RemainingPlace};
use std::sync::{Arc, Mutex};
use std::thread;
use std::time::Duration;
use time::OffsetDateTime;
#[derive(Clone, PartialEq, Eq)]
struct RemainingPlace {
id: String,
description: String,
date: String,
free: usize,
}
use crate::feed::update_rss_file;
mod feed;
mod html_parser;
mod remaining_place;
struct Args {
url: String,
@ -22,8 +19,13 @@ struct Args {
fn parse_args() -> Result<Args, pico_args::Error> {
let mut pargs = pico_args::Arguments::from_env();
let url = pargs.opt_value_from_str("--url")?;
let args = Args {
url: pargs.value_from_str("--url")?,
url: match url {
Some(val) => val,
None => String::from("https://www.lfs.sachsen.de/restplatzboerse-5152.html"),
},
rss_file: pargs.value_from_str("--rss-file")?,
};
@ -42,9 +44,8 @@ fn main() {
let last_places: Arc<Mutex<Vec<RemainingPlace>>> = Arc::new(Mutex::new(Vec::new()));
let mut scheduler = Scheduler::new();
scheduler
.every(30.minutes())
.run(move || match get_current_places(&args.url) {
let check_for_places = move || {
match get_current_places(&args.url) {
Ok(places) => {
if should_feed_be_updated(&places, &last_places) {
update_rss_file(places, &args.url, &args.rss_file);
@ -56,101 +57,17 @@ fn main() {
Err(error) => {
println!("Error: {}", error);
}
});
};
};
// now
check_for_places();
// and every 30 min
scheduler.every(30.minutes()).run(check_for_places);
loop {
scheduler.run_pending();
thread::sleep(Duration::from_millis(10));
}
}
// Fetch the remaining-places page and parse each table row into a
// RemainingPlace. Network errors are propagated via `?`; any unexpected
// page layout panics through unwrap(). (This line-oriented parser was
// replaced in this commit by the more robust src/remaining_place.rs
// version.)
fn get_current_places(url: &str) -> Result<Vec<RemainingPlace>, reqwest::Error> {
let body = reqwest::blocking::get(url)?.text()?;
// Narrow the document down to the first <tbody>…</tbody> section.
// "</tbody>" is 8 bytes long, hence the inclusive end + 7.
let start = body.find("<tbody").unwrap();
let end = body.find("</tbody>").unwrap();
let table = &body[start..=(end + 7)];
let mut places: Vec<RemainingPlace> = Vec::new();
let mut iter = table.lines();
while let Some(line) = iter.next() {
// Assumes each <tr> is followed by exactly four cell lines in the
// order id, description, date, free — one source line per cell.
// NOTE(review): unwrap() here panics if a row has fewer lines.
if line.contains("<tr>") {
let id = parse_node(iter.next().unwrap());
let description = parse_node(iter.next().unwrap());
let date = parse_node(iter.next().unwrap());
// free is parsed as usize; a non-numeric cell panics.
let free = parse_node(iter.next().unwrap()).parse().unwrap();
let place = RemainingPlace {
id,
description,
date,
free,
};
places.push(place);
}
}
Ok(places)
}
// Build the single-item RSS feed and write it to rss_file.
// (Replaced in this commit by the src/feed.rs version of the same name.)
fn update_rss_file(places: Vec<RemainingPlace>, url: &str, rss_file: &str) {
let mut channel = ChannelBuilder::default()
.title(String::from("LFS Restplatzbörse"))
.link(url.to_string())
.description(String::from(
"Ein RSS Feed der Restplatzbörse der Landesfeuerwehrschule Sachsen. Nicht offiziell.",
))
.language(Some("de-DE".to_string()))
.build();
// Item title carries the local timestamp of this update.
// NOTE(review): now_local() unwrap panics on hosts without tz data.
let title = format!("Restplatzbörse Update - {}", OffsetDateTime::now_local().unwrap().format(time::macros::format_description!("[year]-[month]-[day] [hour]:[minute]")).unwrap());
// One "<id> - <description> - <date> - <n> Plätze" line per place.
let content = places
.iter()
.map(|place| {
format!(
"{} - {} - {} - {} Plätze</br>",
place.id, place.description, place.date, place.free
)
})
.collect::<Vec<String>>()
.join("\n");
let item = ItemBuilder::default()
.title(Some(title))
.content(Some(content))
.build();
// The feed always contains exactly this one item.
channel.set_items(vec![item]);
// Write pretty-printed XML, indented two spaces.
let output = File::create(rss_file).unwrap();
channel.pretty_write_to(output, ' ' as u8, 2).unwrap();
}
// Returns true when the freshly scraped places differ from the previous
// snapshot, refreshing the snapshot in that case.
// (Replaced in this commit by the src/feed.rs version of the same name.)
fn should_feed_be_updated(
new_places: &Vec<RemainingPlace>,
last_places: &Mutex<Vec<RemainingPlace>>,
) -> bool {
let mut last_places = last_places.lock().unwrap();
// Element-wise comparison; equal lengths plus pairwise equality.
let are_the_same_places = new_places.len() == last_places.len()
&& new_places
.iter()
.zip(last_places.iter())
.all(|(one, two)| one == two);
if !are_the_same_places {
// Replace the cached snapshot with a clone of the new data.
last_places.clear();
last_places.append(&mut new_places.clone());
}
!are_the_same_places
}
/// Extract the text between the first `>` and the first following `</`.
///
/// Unlike the previous version, missing markers no longer panic: without a
/// `>` the content starts at the beginning of the input, without a `</` it
/// runs to the end. Searching for `</` only after the `>` also avoids the
/// start-after-end slice panic on inputs like `"x</a>"`.
fn parse_node(input: &str) -> String {
    let start = input.find('>').map_or(0, |i| i + 1);
    let end = input[start..]
        .find("</")
        .map_or(input.len(), |i| start + i);
    input[start..end].to_string()
}

53
src/remaining_place.rs Normal file
View File

@ -0,0 +1,53 @@
use crate::html_parser::{parse_inner_node, replace_html_codes};
/// One free course offer scraped from the LFS "Restplatzbörse" table.
///
/// `Debug` is derived so instances can be logged; public types should be
/// debuggable.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemainingPlace {
    /// Course identifier (first table column).
    pub id: String,
    /// Human-readable course description (second column).
    pub description: String,
    /// Course date, kept as the raw table text (third column).
    pub date: String,
    /// Number of remaining places (fourth column).
    pub free: usize,
}
/// Scrape the remaining-place table from `url`.
///
/// Network errors are propagated; an unexpected page layout (missing
/// `<tbody>`, a trailing incomplete row, an unparsable number) degrades
/// gracefully instead of panicking, since this runs unattended inside a
/// scheduler. A page without the expected table yields `Ok(vec![])`.
pub fn get_current_places(url: &str) -> Result<Vec<RemainingPlace>, reqwest::Error> {
    let body = reqwest::blocking::get(url)?.text()?;

    // Locate the results table. If the page layout changed, report "no
    // places" instead of panicking on unwrap().
    let table = match (body.find("<tbody"), body.find("</tbody>")) {
        (Some(start), Some(end)) if start < end => &body[start..end],
        _ => return Ok(Vec::new()),
    };

    // Flatten to one long line so a cell may span several source lines.
    let flat = table.replace('\n', "").replace('\r', "");

    // Collect the text content of every <td …>…</td> cell, in order.
    // Skipping only "<td" (3 bytes, not a magic 5) leaves any attribute
    // remainder for parse_inner_node, which strips everything up to the
    // closing '>' of the opening tag — works with and without attributes.
    let mut cells: Vec<String> = Vec::new();
    let mut rest = flat.as_str();
    while let (Some(begin), Some(end)) = (rest.find("<td"), rest.find("</td>")) {
        if begin < end {
            let inner_node = &rest[begin + 3..end];
            let content = parse_inner_node(inner_node);
            cells.push(replace_html_codes(&content));
        }
        // Always advance past this "</td>" (5 bytes) so the loop terminates.
        rest = &rest[end + 5..];
    }

    // Each table row contributes exactly four cells. chunks_exact silently
    // drops a trailing incomplete row instead of panicking on chunk[i],
    // which plain chunks(4) would do.
    let places = cells
        .chunks_exact(4)
        .map(|chunk| RemainingPlace {
            id: chunk[0].clone(),
            description: chunk[1].clone(),
            date: chunk[2].clone(),
            free: chunk[3].parse().unwrap_or(0),
        })
        .collect();

    Ok(places)
}