refactor code and make scraping more robust
This commit is contained in:
parent
425ff11b5b
commit
6e3e3e6321
68
src/feed.rs
Normal file
68
src/feed.rs
Normal file
@ -0,0 +1,68 @@
|
||||
use std::{fs::File, sync::Mutex};
|
||||
|
||||
use rss::{ChannelBuilder, ItemBuilder};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::remaining_place::RemainingPlace;
|
||||
|
||||
pub fn update_rss_file(places: Vec<RemainingPlace>, url: &str, rss_file: &str) {
|
||||
let mut channel = ChannelBuilder::default()
|
||||
.title(String::from("LFS Restplatzbörse"))
|
||||
.link(url.to_string())
|
||||
.description(String::from(
|
||||
"Ein RSS Feed der Restplatzbörse der Landesfeuerwehrschule Sachsen. Nicht offiziell.",
|
||||
))
|
||||
.language(Some("de-DE".to_string()))
|
||||
.build();
|
||||
|
||||
let title = format!(
|
||||
"Restplatzbörse Update - {}",
|
||||
OffsetDateTime::now_local()
|
||||
.unwrap()
|
||||
.format(time::macros::format_description!(
|
||||
"[year]-[month]-[day] [hour]:[minute]"
|
||||
))
|
||||
.unwrap()
|
||||
);
|
||||
|
||||
let content = places
|
||||
.iter()
|
||||
.map(|place| {
|
||||
format!(
|
||||
"{} - {} - {} - {} Plätze</br>",
|
||||
place.id, place.description, place.date, place.free
|
||||
)
|
||||
})
|
||||
.collect::<Vec<String>>()
|
||||
.join("\n");
|
||||
|
||||
let item = ItemBuilder::default()
|
||||
.title(Some(title))
|
||||
.content(Some(content))
|
||||
.build();
|
||||
|
||||
channel.set_items(vec![item]);
|
||||
|
||||
let output = File::create(rss_file).unwrap();
|
||||
channel.pretty_write_to(output, ' ' as u8, 2).unwrap();
|
||||
}
|
||||
|
||||
pub fn should_feed_be_updated(
|
||||
new_places: &Vec<RemainingPlace>,
|
||||
last_places: &Mutex<Vec<RemainingPlace>>,
|
||||
) -> bool {
|
||||
let mut last_places = last_places.lock().unwrap();
|
||||
|
||||
let are_the_same_places = new_places.len() == last_places.len()
|
||||
&& new_places
|
||||
.iter()
|
||||
.zip(last_places.iter())
|
||||
.all(|(one, two)| one == two);
|
||||
|
||||
if !are_the_same_places {
|
||||
last_places.clear();
|
||||
last_places.append(&mut new_places.clone());
|
||||
}
|
||||
|
||||
!are_the_same_places
|
||||
}
|
31
src/html_parser.rs
Normal file
31
src/html_parser.rs
Normal file
@ -0,0 +1,31 @@
|
||||
/// Strips the outermost tag markup from an HTML node fragment and returns the
/// innermost text, recursing through nested tags
/// (e.g. `<td><b>42</b></td>` -> `42`).
///
/// Hardened against malformed fragments: empty input, a fragment starting
/// with a lone `<`, or markers in an impossible order now return a
/// best-effort string instead of panicking on index underflow or an invalid
/// slice range.
pub fn parse_inner_node(inner_node: &str) -> String {
    if inner_node.is_empty() {
        // The old `inner_node.len() - 1` underflowed here.
        return String::new();
    }

    // Content starts right after the first '>' of the opening tag, or at 0
    // when there is no opening tag at all.
    let start_index = inner_node.find('>').map_or(0, |pos| pos + 1);

    // Content ends just before the last '<' (start of the closing tag);
    // fall back to the end of the fragment when there is no closing tag.
    let mut end_index = inner_node.len() - 1;
    if let Some(end) = inner_node.rfind('<') {
        // checked_sub avoids the underflow panic the old `end - 1` hit when
        // the only '<' sits at index 0.
        if let Some(new_end) = end.checked_sub(1) {
            if new_end >= start_index {
                end_index = new_end;
            }
        }
    }

    if start_index > end_index {
        // Markers in impossible order (e.g. "<>") — nothing between them.
        return String::new();
    }

    let inner_inner_node = &inner_node[start_index..=end_index];

    // If stripping made no progress, recursing would loop forever; drop any
    // remaining angle brackets and return what is left.
    if inner_inner_node == inner_node {
        if inner_inner_node.contains('<') || inner_inner_node.contains('>') {
            return inner_inner_node
                .chars()
                .filter(|c| *c != '<' && *c != '>')
                .collect();
        }
        return inner_inner_node.to_string();
    }

    if inner_inner_node.contains('<') || inner_inner_node.contains('>') {
        return parse_inner_node(inner_inner_node);
    }

    inner_inner_node.to_string()
}
|
||||
|
||||
/// Decodes the HTML entities that appear in the scraped table cells into
/// plain text.
///
/// `&amp;` is decoded last so that a double-escaped sequence such as
/// `&amp;lt;` correctly resolves to the literal text `&lt;` rather than `<`.
pub fn replace_html_codes(content: &str) -> String {
    content
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&amp;", "&")
}
|
129
src/main.rs
129
src/main.rs
@ -1,18 +1,15 @@
|
||||
use clokwerk::{Scheduler, TimeUnits};
|
||||
use rss::{ChannelBuilder, ItemBuilder};
|
||||
use std::fs::File;
|
||||
use feed::should_feed_be_updated;
|
||||
use remaining_place::{get_current_places, RemainingPlace};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
// A single row of the Restplatzbörse table, in cell order.
#[derive(Clone, PartialEq, Eq)]
struct RemainingPlace {
    id: String,          // course/offer identifier (first cell)
    description: String, // course description (second cell)
    date: String,        // date text exactly as shown on the page (third cell)
    free: usize,         // number of free places (fourth cell)
}
|
||||
use crate::feed::update_rss_file;
|
||||
|
||||
mod feed;
|
||||
mod html_parser;
|
||||
mod remaining_place;
|
||||
|
||||
struct Args {
|
||||
url: String,
|
||||
@ -22,8 +19,13 @@ struct Args {
|
||||
fn parse_args() -> Result<Args, pico_args::Error> {
|
||||
let mut pargs = pico_args::Arguments::from_env();
|
||||
|
||||
let url = pargs.opt_value_from_str("--url")?;
|
||||
|
||||
let args = Args {
|
||||
url: pargs.value_from_str("--url")?,
|
||||
url: match url {
|
||||
Some(val) => val,
|
||||
None => String::from("https://www.lfs.sachsen.de/restplatzboerse-5152.html"),
|
||||
},
|
||||
rss_file: pargs.value_from_str("--rss-file")?,
|
||||
};
|
||||
|
||||
@ -42,9 +44,8 @@ fn main() {
|
||||
let last_places: Arc<Mutex<Vec<RemainingPlace>>> = Arc::new(Mutex::new(Vec::new()));
|
||||
let mut scheduler = Scheduler::new();
|
||||
|
||||
scheduler
|
||||
.every(30.minutes())
|
||||
.run(move || match get_current_places(&args.url) {
|
||||
let check_for_places = move || {
|
||||
match get_current_places(&args.url) {
|
||||
Ok(places) => {
|
||||
if should_feed_be_updated(&places, &last_places) {
|
||||
update_rss_file(places, &args.url, &args.rss_file);
|
||||
@ -56,101 +57,17 @@ fn main() {
|
||||
Err(error) => {
|
||||
println!("Error: {}", error);
|
||||
}
|
||||
});
|
||||
};
|
||||
};
|
||||
|
||||
// now
|
||||
check_for_places();
|
||||
|
||||
// and every 30 min
|
||||
scheduler.every(30.minutes()).run(check_for_places);
|
||||
|
||||
loop {
|
||||
scheduler.run_pending();
|
||||
thread::sleep(Duration::from_millis(10));
|
||||
}
|
||||
}
|
||||
|
||||
fn get_current_places(url: &str) -> Result<Vec<RemainingPlace>, reqwest::Error> {
|
||||
let body = reqwest::blocking::get(url)?.text()?;
|
||||
|
||||
let start = body.find("<tbody").unwrap();
|
||||
let end = body.find("</tbody>").unwrap();
|
||||
|
||||
let table = &body[start..=(end + 7)];
|
||||
|
||||
let mut places: Vec<RemainingPlace> = Vec::new();
|
||||
let mut iter = table.lines();
|
||||
|
||||
while let Some(line) = iter.next() {
|
||||
if line.contains("<tr>") {
|
||||
let id = parse_node(iter.next().unwrap());
|
||||
let description = parse_node(iter.next().unwrap());
|
||||
let date = parse_node(iter.next().unwrap());
|
||||
let free = parse_node(iter.next().unwrap()).parse().unwrap();
|
||||
|
||||
let place = RemainingPlace {
|
||||
id,
|
||||
description,
|
||||
date,
|
||||
free,
|
||||
};
|
||||
places.push(place);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(places)
|
||||
}
|
||||
|
||||
fn update_rss_file(places: Vec<RemainingPlace>, url: &str, rss_file: &str) {
|
||||
let mut channel = ChannelBuilder::default()
|
||||
.title(String::from("LFS Restplatzbörse"))
|
||||
.link(url.to_string())
|
||||
.description(String::from(
|
||||
"Ein RSS Feed der Restplatzbörse der Landesfeuerwehrschule Sachsen. Nicht offiziell.",
|
||||
))
|
||||
.language(Some("de-DE".to_string()))
|
||||
.build();
|
||||
|
||||
let title = format!("Restplatzbörse Update - {}", OffsetDateTime::now_local().unwrap().format(time::macros::format_description!("[year]-[month]-[day] [hour]:[minute]")).unwrap());
|
||||
let content = places
|
||||
.iter()
|
||||
.map(|place| {
|
||||
format!(
|
||||
"{} - {} - {} - {} Plätze</br>",
|
||||
place.id, place.description, place.date, place.free
|
||||
)
|
||||
})
|
||||
.collect::<Vec<String>>()
|
||||
.join("\n");
|
||||
|
||||
let item = ItemBuilder::default()
|
||||
.title(Some(title))
|
||||
.content(Some(content))
|
||||
.build();
|
||||
|
||||
channel.set_items(vec![item]);
|
||||
|
||||
let output = File::create(rss_file).unwrap();
|
||||
channel.pretty_write_to(output, ' ' as u8, 2).unwrap();
|
||||
}
|
||||
|
||||
fn should_feed_be_updated(
|
||||
new_places: &Vec<RemainingPlace>,
|
||||
last_places: &Mutex<Vec<RemainingPlace>>,
|
||||
) -> bool {
|
||||
let mut last_places = last_places.lock().unwrap();
|
||||
|
||||
let are_the_same_places = new_places.len() == last_places.len()
|
||||
&& new_places
|
||||
.iter()
|
||||
.zip(last_places.iter())
|
||||
.all(|(one, two)| one == two);
|
||||
|
||||
if !are_the_same_places {
|
||||
last_places.clear();
|
||||
last_places.append(&mut new_places.clone());
|
||||
}
|
||||
|
||||
!are_the_same_places
|
||||
}
|
||||
|
||||
/// Extracts the text between the first `>` and the first `</` of a single
/// HTML node line, e.g. `<td>42</td>` -> `42`.
///
/// Returns an empty string instead of panicking when either marker is
/// missing or they appear in an impossible order, so one malformed row in
/// the scraped page cannot crash the scheduler thread.
fn parse_node(input: &str) -> String {
    match (input.find('>'), input.find("</")) {
        (Some(start), Some(end)) if start + 1 <= end => input[(start + 1)..end].to_string(),
        _ => String::new(),
    }
}
|
||||
|
53
src/remaining_place.rs
Normal file
53
src/remaining_place.rs
Normal file
@ -0,0 +1,53 @@
|
||||
use crate::html_parser::{parse_inner_node, replace_html_codes};
|
||||
|
||||
/// A single row scraped from the Restplatzbörse table; fields correspond to
/// the four table cells in document order.
#[derive(Clone, PartialEq, Eq)]
pub struct RemainingPlace {
    /// Course/offer identifier (first cell).
    pub id: String,
    /// Course description (second cell).
    pub description: String,
    /// Date text exactly as shown on the page (third cell).
    pub date: String,
    /// Number of free places (fourth cell); set to 0 when the cell does not
    /// parse as a number.
    pub free: usize,
}
|
||||
|
||||
pub fn get_current_places(url: &str) -> Result<Vec<RemainingPlace>, reqwest::Error> {
|
||||
let body = reqwest::blocking::get(url)?.text()?;
|
||||
|
||||
let start = body.find("<tbody").unwrap();
|
||||
let end = body.find("</tbody>").unwrap();
|
||||
|
||||
let table = &body[start..=(end + 7)];
|
||||
|
||||
let mut places: Vec<RemainingPlace> = Vec::new();
|
||||
let mut lines: Vec<String> = Vec::new();
|
||||
|
||||
let mut line = table.replace("\n", "").replace("\r", "");
|
||||
|
||||
while let Some(begin) = line.find("<td") {
|
||||
match line.find("</td>") {
|
||||
Some(end) => {
|
||||
let inner_node = &line[(begin + 5)..end];
|
||||
let content = parse_inner_node(inner_node);
|
||||
let escaped_content = replace_html_codes(&content);
|
||||
|
||||
lines.push(escaped_content);
|
||||
line.replace_range(begin..=end + 5, "");
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
lines.chunks(4).for_each(|chunk| {
|
||||
let new_remaining_place = RemainingPlace {
|
||||
id: chunk[0].clone(),
|
||||
description: chunk[1].clone(),
|
||||
date: chunk[2].clone(),
|
||||
free: match chunk[3].parse() {
|
||||
Ok(value) => value,
|
||||
Err(_) => 0,
|
||||
},
|
||||
};
|
||||
|
||||
places.push(new_remaining_place);
|
||||
});
|
||||
|
||||
Ok(places)
|
||||
}
|
Loading…
Reference in New Issue
Block a user