refactor code and make scraping more robust
This commit is contained in:
parent
425ff11b5b
commit
6e3e3e6321
68
src/feed.rs
Normal file
68
src/feed.rs
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
use std::{fs::File, sync::Mutex};
|
||||||
|
|
||||||
|
use rss::{ChannelBuilder, ItemBuilder};
|
||||||
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
|
use crate::remaining_place::RemainingPlace;
|
||||||
|
|
||||||
|
pub fn update_rss_file(places: Vec<RemainingPlace>, url: &str, rss_file: &str) {
|
||||||
|
let mut channel = ChannelBuilder::default()
|
||||||
|
.title(String::from("LFS Restplatzbörse"))
|
||||||
|
.link(url.to_string())
|
||||||
|
.description(String::from(
|
||||||
|
"Ein RSS Feed der Restplatzbörse der Landesfeuerwehrschule Sachsen. Nicht offiziell.",
|
||||||
|
))
|
||||||
|
.language(Some("de-DE".to_string()))
|
||||||
|
.build();
|
||||||
|
|
||||||
|
let title = format!(
|
||||||
|
"Restplatzbörse Update - {}",
|
||||||
|
OffsetDateTime::now_local()
|
||||||
|
.unwrap()
|
||||||
|
.format(time::macros::format_description!(
|
||||||
|
"[year]-[month]-[day] [hour]:[minute]"
|
||||||
|
))
|
||||||
|
.unwrap()
|
||||||
|
);
|
||||||
|
|
||||||
|
let content = places
|
||||||
|
.iter()
|
||||||
|
.map(|place| {
|
||||||
|
format!(
|
||||||
|
"{} - {} - {} - {} Plätze</br>",
|
||||||
|
place.id, place.description, place.date, place.free
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect::<Vec<String>>()
|
||||||
|
.join("\n");
|
||||||
|
|
||||||
|
let item = ItemBuilder::default()
|
||||||
|
.title(Some(title))
|
||||||
|
.content(Some(content))
|
||||||
|
.build();
|
||||||
|
|
||||||
|
channel.set_items(vec![item]);
|
||||||
|
|
||||||
|
let output = File::create(rss_file).unwrap();
|
||||||
|
channel.pretty_write_to(output, ' ' as u8, 2).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn should_feed_be_updated(
|
||||||
|
new_places: &Vec<RemainingPlace>,
|
||||||
|
last_places: &Mutex<Vec<RemainingPlace>>,
|
||||||
|
) -> bool {
|
||||||
|
let mut last_places = last_places.lock().unwrap();
|
||||||
|
|
||||||
|
let are_the_same_places = new_places.len() == last_places.len()
|
||||||
|
&& new_places
|
||||||
|
.iter()
|
||||||
|
.zip(last_places.iter())
|
||||||
|
.all(|(one, two)| one == two);
|
||||||
|
|
||||||
|
if !are_the_same_places {
|
||||||
|
last_places.clear();
|
||||||
|
last_places.append(&mut new_places.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
!are_the_same_places
|
||||||
|
}
|
31
src/html_parser.rs
Normal file
31
src/html_parser.rs
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
/// Extracts the innermost text of an HTML fragment: everything up to the
/// first `>` and from the last `<` onwards is stripped, recursively, until no
/// angle brackets remain.
///
/// Returns an empty string when no text can be extracted (empty input, or a
/// window that does not fall on valid char boundaries).
pub fn parse_inner_node(inner_node: &str) -> String {
    // Guard: the old code computed `len() - 1` unconditionally and panicked
    // on empty input (usize underflow).
    if inner_node.is_empty() {
        return String::new();
    }

    // Default window is the whole string; narrow it if tags are found.
    let mut start_index = 0;
    let mut end_index = inner_node.len() - 1;

    if let Some(start) = inner_node.find('>') {
        start_index = start + 1;
    }

    if let Some(end) = inner_node.rfind('<') {
        // `end` may be 0 (fragment starts with `<`); checked_sub avoids the
        // usize underflow the old `end - 1` had.
        if let Some(new_end) = end.checked_sub(1) {
            if new_end >= start_index {
                end_index = new_end;
            }
        }
    }

    // `get` instead of direct slicing: an out-of-range window or a cut
    // through a multi-byte character (umlauts are common in this content)
    // must not panic.
    let inner_inner_node = match inner_node.get(start_index..=end_index) {
        Some(slice) => slice,
        None => return String::new(),
    };

    // Recurse only while the window actually shrinks; otherwise a fragment
    // like "<x" (bracket but no matching pair) would recurse forever.
    if inner_inner_node.len() < inner_node.len()
        && (inner_inner_node.contains('<') || inner_inner_node.contains('>'))
    {
        return parse_inner_node(inner_inner_node);
    }

    inner_inner_node.to_string()
}
|
||||||
|
|
||||||
|
/// Replaces HTML character references with their plain-text equivalents.
///
/// Handles the non-breaking space both as the `&nbsp;` entity and as the raw
/// U+00A0 character, mapping each to a regular space.
/// NOTE(review): the scraped source renders the first literal as an invisible
/// character — presumably a collapsed `&nbsp;`/U+00A0; confirm against the
/// original file.
pub fn replace_html_codes(content: &str) -> String {
    content.replace("&nbsp;", " ").replace('\u{a0}', " ")
}
|
129
src/main.rs
129
src/main.rs
@ -1,18 +1,15 @@
|
|||||||
use clokwerk::{Scheduler, TimeUnits};
|
use clokwerk::{Scheduler, TimeUnits};
|
||||||
use rss::{ChannelBuilder, ItemBuilder};
|
use feed::should_feed_be_updated;
|
||||||
use std::fs::File;
|
use remaining_place::{get_current_places, RemainingPlace};
|
||||||
use std::sync::{Arc, Mutex};
|
use std::sync::{Arc, Mutex};
|
||||||
use std::thread;
|
use std::thread;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use time::OffsetDateTime;
|
|
||||||
|
|
||||||
#[derive(Clone, PartialEq, Eq)]
|
use crate::feed::update_rss_file;
|
||||||
struct RemainingPlace {
|
|
||||||
id: String,
|
mod feed;
|
||||||
description: String,
|
mod html_parser;
|
||||||
date: String,
|
mod remaining_place;
|
||||||
free: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Args {
|
struct Args {
|
||||||
url: String,
|
url: String,
|
||||||
@ -22,8 +19,13 @@ struct Args {
|
|||||||
fn parse_args() -> Result<Args, pico_args::Error> {
|
fn parse_args() -> Result<Args, pico_args::Error> {
|
||||||
let mut pargs = pico_args::Arguments::from_env();
|
let mut pargs = pico_args::Arguments::from_env();
|
||||||
|
|
||||||
|
let url = pargs.opt_value_from_str("--url")?;
|
||||||
|
|
||||||
let args = Args {
|
let args = Args {
|
||||||
url: pargs.value_from_str("--url")?,
|
url: match url {
|
||||||
|
Some(val) => val,
|
||||||
|
None => String::from("https://www.lfs.sachsen.de/restplatzboerse-5152.html"),
|
||||||
|
},
|
||||||
rss_file: pargs.value_from_str("--rss-file")?,
|
rss_file: pargs.value_from_str("--rss-file")?,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -42,9 +44,8 @@ fn main() {
|
|||||||
let last_places: Arc<Mutex<Vec<RemainingPlace>>> = Arc::new(Mutex::new(Vec::new()));
|
let last_places: Arc<Mutex<Vec<RemainingPlace>>> = Arc::new(Mutex::new(Vec::new()));
|
||||||
let mut scheduler = Scheduler::new();
|
let mut scheduler = Scheduler::new();
|
||||||
|
|
||||||
scheduler
|
let check_for_places = move || {
|
||||||
.every(30.minutes())
|
match get_current_places(&args.url) {
|
||||||
.run(move || match get_current_places(&args.url) {
|
|
||||||
Ok(places) => {
|
Ok(places) => {
|
||||||
if should_feed_be_updated(&places, &last_places) {
|
if should_feed_be_updated(&places, &last_places) {
|
||||||
update_rss_file(places, &args.url, &args.rss_file);
|
update_rss_file(places, &args.url, &args.rss_file);
|
||||||
@ -56,101 +57,17 @@ fn main() {
|
|||||||
Err(error) => {
|
Err(error) => {
|
||||||
println!("Error: {}", error);
|
println!("Error: {}", error);
|
||||||
}
|
}
|
||||||
});
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
// now
|
||||||
|
check_for_places();
|
||||||
|
|
||||||
|
// and every 30 min
|
||||||
|
scheduler.every(30.minutes()).run(check_for_places);
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
scheduler.run_pending();
|
scheduler.run_pending();
|
||||||
thread::sleep(Duration::from_millis(10));
|
thread::sleep(Duration::from_millis(10));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_current_places(url: &str) -> Result<Vec<RemainingPlace>, reqwest::Error> {
|
|
||||||
let body = reqwest::blocking::get(url)?.text()?;
|
|
||||||
|
|
||||||
let start = body.find("<tbody").unwrap();
|
|
||||||
let end = body.find("</tbody>").unwrap();
|
|
||||||
|
|
||||||
let table = &body[start..=(end + 7)];
|
|
||||||
|
|
||||||
let mut places: Vec<RemainingPlace> = Vec::new();
|
|
||||||
let mut iter = table.lines();
|
|
||||||
|
|
||||||
while let Some(line) = iter.next() {
|
|
||||||
if line.contains("<tr>") {
|
|
||||||
let id = parse_node(iter.next().unwrap());
|
|
||||||
let description = parse_node(iter.next().unwrap());
|
|
||||||
let date = parse_node(iter.next().unwrap());
|
|
||||||
let free = parse_node(iter.next().unwrap()).parse().unwrap();
|
|
||||||
|
|
||||||
let place = RemainingPlace {
|
|
||||||
id,
|
|
||||||
description,
|
|
||||||
date,
|
|
||||||
free,
|
|
||||||
};
|
|
||||||
places.push(place);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(places)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn update_rss_file(places: Vec<RemainingPlace>, url: &str, rss_file: &str) {
|
|
||||||
let mut channel = ChannelBuilder::default()
|
|
||||||
.title(String::from("LFS Restplatzbörse"))
|
|
||||||
.link(url.to_string())
|
|
||||||
.description(String::from(
|
|
||||||
"Ein RSS Feed der Restplatzbörse der Landesfeuerwehrschule Sachsen. Nicht offiziell.",
|
|
||||||
))
|
|
||||||
.language(Some("de-DE".to_string()))
|
|
||||||
.build();
|
|
||||||
|
|
||||||
let title = format!("Restplatzbörse Update - {}", OffsetDateTime::now_local().unwrap().format(time::macros::format_description!("[year]-[month]-[day] [hour]:[minute]")).unwrap());
|
|
||||||
let content = places
|
|
||||||
.iter()
|
|
||||||
.map(|place| {
|
|
||||||
format!(
|
|
||||||
"{} - {} - {} - {} Plätze</br>",
|
|
||||||
place.id, place.description, place.date, place.free
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.collect::<Vec<String>>()
|
|
||||||
.join("\n");
|
|
||||||
|
|
||||||
let item = ItemBuilder::default()
|
|
||||||
.title(Some(title))
|
|
||||||
.content(Some(content))
|
|
||||||
.build();
|
|
||||||
|
|
||||||
channel.set_items(vec![item]);
|
|
||||||
|
|
||||||
let output = File::create(rss_file).unwrap();
|
|
||||||
channel.pretty_write_to(output, ' ' as u8, 2).unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
fn should_feed_be_updated(
|
|
||||||
new_places: &Vec<RemainingPlace>,
|
|
||||||
last_places: &Mutex<Vec<RemainingPlace>>,
|
|
||||||
) -> bool {
|
|
||||||
let mut last_places = last_places.lock().unwrap();
|
|
||||||
|
|
||||||
let are_the_same_places = new_places.len() == last_places.len()
|
|
||||||
&& new_places
|
|
||||||
.iter()
|
|
||||||
.zip(last_places.iter())
|
|
||||||
.all(|(one, two)| one == two);
|
|
||||||
|
|
||||||
if !are_the_same_places {
|
|
||||||
last_places.clear();
|
|
||||||
last_places.append(&mut new_places.clone());
|
|
||||||
}
|
|
||||||
|
|
||||||
!are_the_same_places
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_node(input: &str) -> String {
|
|
||||||
let start = input.find(">").unwrap();
|
|
||||||
let end = input.find("</").unwrap();
|
|
||||||
|
|
||||||
input[(start + 1)..end].to_string()
|
|
||||||
}
|
|
||||||
|
53
src/remaining_place.rs
Normal file
53
src/remaining_place.rs
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
use crate::html_parser::{parse_inner_node, replace_html_codes};
|
||||||
|
|
||||||
|
/// One course offering scraped from the LFS remaining-places table.
// Debug added: public data types should be printable for diagnostics.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemainingPlace {
    // Course identifier as shown in the table (kept as raw text).
    pub id: String,
    // Human-readable course description.
    pub description: String,
    // Date (or date range) of the course, kept as the raw table text.
    pub date: String,
    // Number of free places; 0 when the cell could not be parsed.
    pub free: usize,
}
|
||||||
|
|
||||||
|
pub fn get_current_places(url: &str) -> Result<Vec<RemainingPlace>, reqwest::Error> {
|
||||||
|
let body = reqwest::blocking::get(url)?.text()?;
|
||||||
|
|
||||||
|
let start = body.find("<tbody").unwrap();
|
||||||
|
let end = body.find("</tbody>").unwrap();
|
||||||
|
|
||||||
|
let table = &body[start..=(end + 7)];
|
||||||
|
|
||||||
|
let mut places: Vec<RemainingPlace> = Vec::new();
|
||||||
|
let mut lines: Vec<String> = Vec::new();
|
||||||
|
|
||||||
|
let mut line = table.replace("\n", "").replace("\r", "");
|
||||||
|
|
||||||
|
while let Some(begin) = line.find("<td") {
|
||||||
|
match line.find("</td>") {
|
||||||
|
Some(end) => {
|
||||||
|
let inner_node = &line[(begin + 5)..end];
|
||||||
|
let content = parse_inner_node(inner_node);
|
||||||
|
let escaped_content = replace_html_codes(&content);
|
||||||
|
|
||||||
|
lines.push(escaped_content);
|
||||||
|
line.replace_range(begin..=end + 5, "");
|
||||||
|
}
|
||||||
|
None => break,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lines.chunks(4).for_each(|chunk| {
|
||||||
|
let new_remaining_place = RemainingPlace {
|
||||||
|
id: chunk[0].clone(),
|
||||||
|
description: chunk[1].clone(),
|
||||||
|
date: chunk[2].clone(),
|
||||||
|
free: match chunk[3].parse() {
|
||||||
|
Ok(value) => value,
|
||||||
|
Err(_) => 0,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
places.push(new_remaining_place);
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(places)
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user