event-scraper/src/main.rs

273 lines
8.7 KiB
Rust

#[macro_use]
extern crate clap;
use reqwest;
use scraper::{Html, Selector, ElementRef};
use serde_json::{Value, self};
use itertools::izip;
use chrono::{NaiveDateTime, DateTime, Local, Utc, offset::TimeZone, Datelike};
use std::collections::HashMap;
use std::path::Path;
use std::fs::File;
use std::io::{self, Write};
use clap::{Arg, App, ArgMatches};
/// Output directory used when `--outdir` is not given on the command line.
const OUT_DIR_DEFAULT: &str = "out";

/// Number of months fetched when `--fetch` is absent or cannot be read as a `u32`.
const FETCH_MONTHS_DEFAULT: u32 = 3;

/// Endpoint that `get_events_html` posts its month query to.
const OTAKU_URL: &str = "https://otakulounge.com/wp-admin/admin-ajax.php";
/// Run-time settings of the scraper, filled in from the command line.
struct Config {
    /// Directory that receives the calendar files and `index.html`.
    out_dir: String,
    /// First month to fetch, stored as `(year, month)`.
    ym_start: (u32, u32),
    /// How many consecutive months to fetch, starting at `ym_start`.
    fetch_months: u32,
}
impl Config {
    /// Builds a `Config` from already-parsed command-line matches.
    ///
    /// * `outdir` falls back to `OUT_DIR_DEFAULT` when absent.
    /// * `fetch` falls back to `FETCH_MONTHS_DEFAULT` when `value_t!` reports an error.
    /// * `date` is read as `YYYY-MM` (the first day of that month at midnight is
    ///   appended before parsing); when absent, the current local time is used.
    ///
    /// # Panics
    ///
    /// Panics with "Unable to parse date" when `date` is present but does not
    /// match the expected format.
    fn from_arg_matches<'a>(matches: ArgMatches<'a>) -> Self {
        let first_day = match matches.value_of("date") {
            Some(year_month) => {
                // Pad the year-month with a day and a time so it parses as a full timestamp.
                let stamp = format!("{}-01 00:00", year_month);
                Local
                    .datetime_from_str(&stamp, "%Y-%m-%d %H:%M")
                    .expect("Unable to parse date")
            }
            None => Local::now(),
        };

        Config {
            out_dir: matches.value_of("outdir").unwrap_or(OUT_DIR_DEFAULT).to_string(),
            ym_start: (first_day.year() as u32, first_day.month()),
            fetch_months: value_t!(matches, "fetch", u32).unwrap_or(FETCH_MONTHS_DEFAULT),
        }
    }
}
fn parse_args<'a>() -> ArgMatches<'a> {
App::new("Otaku Event Scraper")
.version("0.1")
.about("Scrap events from otakulounge.com and create corresponding iCal files")
.arg(Arg::with_name("outdir")
.short("o")
.long("outdir")
.value_name("DIR")
.help(&format!{"Set output directory for *.isc files, default to `{}`", OUT_DIR_DEFAULT})
.takes_value(true))
.arg(Arg::with_name("fetch")
.short("f")
.long("fetch")
.value_name("MONTHS")
.help("Number of months to fetch the events from, default to 3")
.takes_value(true)
.min_values(1)
.max_values(12))
.arg(Arg::with_name("date")
.short("d")
.long("date")
.value_name("DATE")
.help("Start date to fetch the events, default for today. Format YYYY-MM")
.takes_value(true))
.get_matches()
}
/// Parses the process command line and turns the result into a `Config`.
fn get_config() -> Config {
    let matches = parse_args();
    Config::from_arg_matches(matches)
}
/// Requests the monthly calendar view for `(year, month)` and returns its HTML.
///
/// The form is posted to `OTAKU_URL`; progress and the HTTP status are printed
/// to stdout. The response is decoded as JSON and the string stored under
/// `events_side` is returned. When that field is missing or is not a string,
/// an empty string is returned, so the caller sees a month without events
/// instead of a panic.
///
/// # Errors
///
/// Returns the `reqwest::Error` raised while sending the request or while
/// decoding the response body as JSON.
fn get_events_html((year, month): (u32, u32)) -> Result<String, reqwest::Error> {
    println!("querying otakulounge.com for month {}-{}", year, month);

    let year_str = year.to_string();
    let month_str = month.to_string();
    let form = [
        ("action", "mec_monthly_view_load_month"),
        ("mec_year", year_str.as_str()),
        ("mec_month", month_str.as_str()),
        ("atts[label]", ""),
        ("atts[category]", ""),
        ("atts[location]", ""),
        ("atts[organizer]", ""),
        ("atts[tag]", ""),
        ("atts[author]", ""),
        ("atts[skin]", "full_calendar"),
        ("atts[show_past_events]", "1"),
        ("atts[sf_status]", "0"),
        ("atts[id]", "1019"),
        ("atts[s]", ""),
        ("atts[append_js_codes]", "1"),
        ("atts[sed_method]", "0"),
        ("atts[image_popup]", "0"),
        ("apply_sf_date", "0"),
    ];

    let mut res = reqwest::Client::new()
        .post(OTAKU_URL)
        .form(&form)
        .send()?;
    println!("status: {}", res.status());

    let body: Value = res.json()?;
    // Indexing a `Value` with a missing key yields `Null`, for which `as_str` is `None`.
    let events_html = body["events_side"].as_str().unwrap_or_default().to_string();
    Ok(events_html)
}
/// Returns the `(year, month)` that lies `n` months after the given one.
///
/// `month` is one-based (1 = January); the result's month is one-based too.
fn add_to_month((year, month): (u32, u32), n: u32) -> (u32, u32) {
    // Work on a zero-based month index so division and remainder line up with year boundaries.
    let zero_based = month + n - 1;
    let years_carried = zero_based / 12;
    let month_of_year = zero_based % 12 + 1;
    (year + years_carried, month_of_year)
}
/// Concatenates, without any separator, every text chunk yielded by `el.text()`.
fn get_inner_text(el: ElementRef) -> String {
    el.text().collect::<String>()
}
/// Extracts a plain-text description from an embedded JSON-LD `<script>` element.
///
/// Steps:
/// 1. take the element's text, drop every line feed and parse it as JSON;
/// 2. read the `description` string of that JSON document;
/// 3. parse it as an HTML fragment and take its text, then parse that text as
///    HTML again (presumably because the markup is stored entity-escaped —
///    confirm against a live response);
/// 4. join the text of every `<span>` with a literal backslash-`n`, the escaped
///    newline used inside the generated calendar entries.
///
/// A document without a string `description` yields an empty description
/// instead of panicking.
///
/// # Errors
///
/// Returns the `serde_json::Error` raised when the script content is not valid JSON.
fn get_description_from_bad_json_ld(el: ElementRef) -> Result<String, serde_json::Error> {
    let inner_text = get_inner_text(el);
    let json_event: Value = serde_json::from_str(&inner_text.replace('\n', ""))?;

    let serialized_description = match json_event["description"].as_str() {
        Some(text) => text,
        None => return Ok(String::new()),
    };

    let html_src = get_inner_text(Html::parse_fragment(serialized_description).root_element());
    let html_description = Html::parse_fragment(&html_src);
    // A constant, well-formed selector: parsing it cannot fail at run time.
    let span_selector = Selector::parse("span").unwrap();

    Ok(html_description
        .select(&span_selector)
        .map(get_inner_text)
        .collect::<Vec<_>>()
        .join("\\n"))
}
/// Interprets `date` (`YYYYMMDD`) and `time` (12-hour clock, e.g. `7:00 pm`) as a
/// local date-time and converts it to UTC.
///
/// # Panics
///
/// Panics when the combined text does not match `%Y%m%d %l:%M %P`, or when the
/// local time zone does not map the value to exactly one instant.
fn parse_to_utc(date: &str, time: &str) -> DateTime<Utc> {
    let stamp = format!("{} {}", date, time);
    let naive = NaiveDateTime::parse_from_str(&stamp, "%Y%m%d %l:%M %P").unwrap();
    Local
        .from_local_datetime(&naive)
        .single()
        .unwrap()
        .with_timezone(&Utc)
}
/// Renders one `VEVENT` entry.
///
/// `time` is the scraped time text; it is split by `split_time` into a start and
/// an end, and both are combined with `date` and converted to UTC. `DTSTAMP` is
/// the moment of generation, and the UID is built from `date` and `id`. The
/// title and the time text are also printed to stdout.
fn create_event(title: &str, description: &str, date: &str, time: String, id: u8) -> String {
    // One format shared by the three date-time properties of the entry.
    const STAMP_FORMAT: &str = "%Y%m%dT%H%M%SZ";

    println!("{:#?} -> {:#?}", title, time);

    let (start_text, end_text) = split_time(&time);
    let begins = parse_to_utc(date, start_text);
    let ends = parse_to_utc(date, end_text);
    let generated = Utc::now();

    format!(r#"BEGIN:VEVENT
DTSTART:{}
DTEND:{}
DTSTAMP:{}
UID:{}
SUMMARY:{}
DESCRIPTION:{}
LOCATION:Otaku Manga Lounge
END:VEVENT"#,
        begins.format(STAMP_FORMAT),
        ends.format(STAMP_FORMAT),
        generated.format(STAMP_FORMAT),
        format!("day{}event{}@otaku-event.bksp.space", date, id),
        title,
        description
    )
}
/// Splits a scraped time label such as `" 7:00 pm - 10:00 pm"` into `(start, end)`.
///
/// The text is cut at the first `-` and both halves are trimmed, so the result
/// does not depend on how much whitespace surrounds the values. When there is
/// no `-`, or nothing follows it, the end defaults to `"10:00 pm"`.
///
/// Never panics: empty or very short input simply yields an empty start.
fn split_time(time: &str) -> (&str, &str) {
    const DEFAULT_END: &str = "10:00 pm";

    match time.find('-') {
        Some(dash_idx) => {
            // `-` is a single byte, so both slice bounds fall on character boundaries.
            let start = time[..dash_idx].trim();
            let end = time[dash_idx + 1..].trim();
            (start, if end.is_empty() { DEFAULT_END } else { end })
        }
        None => (time.trim(), DEFAULT_END),
    }
}
/// Writes `events` into a calendar file called `<name>.isc` inside `folder` and
/// returns the file name (without the folder).
///
/// * `folder` is created first when it does not exist yet.
/// * The extension is appended to the full name, so a name that contains dots
///   (for example `"Cours niv. 1"`) keeps every part of it and cannot collide
///   with a sibling that differs only after the last dot.
/// * `/` and `\` in `name` are replaced by `_`, so a scraped title cannot point
///   outside `folder`.
///
/// The file holds the `VCALENDAR` header, one line-feed-terminated block per
/// event and the closing `END:VCALENDAR`. Its path is printed to stdout.
///
/// # Errors
///
/// Returns any I/O error raised while creating the folder or writing the file.
fn write_cal(folder: &str, name: &str, events: &Vec<String>) -> io::Result<String> {
    let safe_name: String = name
        .chars()
        .map(|c| if c == '/' || c == '\\' { '_' } else { c })
        .collect();
    // NOTE(review): iCalendar files conventionally end in `.ics`; `.isc` is kept
    // because the command-line help and already published links use it.
    let file_name = format!("{}.isc", safe_name);

    std::fs::create_dir_all(folder)?;
    let path = Path::new(folder).join(&file_name);
    println!("writing {:?}", path);

    let mut file = File::create(&path)?;
    file.write_all(b"BEGIN:VCALENDAR\nVERSION:2.0\nPRODID:-//bksp.space - EventScraper//\n")?;
    for event in events {
        write!(file, "{}\n", event)?;
    }
    file.write_all(b"END:VCALENDAR")?;

    Ok(file_name)
}
/// Writes `index.html` into `folder`: a page with one link per `(url, name)` pair.
///
/// The page declares UTF-8 as its character set. Each `url` is percent-encoded
/// and each `name` is HTML-escaped before being written, because both may come
/// from scraped event titles.
///
/// # Errors
///
/// Returns any I/O error raised while creating or writing the file.
fn write_index(folder: &str, events: Vec<(String, String)>) -> io::Result<()> {
    let mut index = File::create(Path::new(folder).join("index.html"))?;
    index.write_all(br#"
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>Otaku Event Calendars</title>
</head>
<body>
<h1>Otaku Event Calendars</h1>
<ul>
"#)?;
    for (url, name) in events {
        write!(
            index,
            r#"<li><a href="{}">{}</a></li>"#,
            percent_encode_segment(&url),
            escape_html(&name)
        )?;
    }
    index.write_all(br#"
</ul>
</body>
</html>
"#)?;
    Ok(())
}

/// Percent-encodes every byte of `segment` that is not an RFC 3986 unreserved character.
fn percent_encode_segment(segment: &str) -> String {
    let mut encoded = String::with_capacity(segment.len());
    for byte in segment.bytes() {
        match byte {
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~' => {
                encoded.push(byte as char)
            }
            other => encoded.push_str(&format!("%{:02X}", other)),
        }
    }
    encoded
}

/// Replaces the characters that are significant in HTML text and attribute values.
fn escape_html(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for c in text.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Entry point: fetches the configured months, turns every scraped event into a
/// calendar entry and writes the calendars plus an `index.html` that links them.
///
/// Events whose title starts with `Cours` are treated as courses: they go into
/// the combined `all_courses` calendar and into one calendar per course title.
/// Every other event goes into `activities`.
///
/// A month whose request fails is reported on stderr and skipped.
///
/// # Errors
///
/// Returns the first I/O error raised while writing a calendar or the index.
fn main() -> io::Result<()> {
    let config = get_config();

    // Constant selectors: parsing them cannot fail at run time.
    let days_selector = Selector::parse(".mec-calendar-events-sec").unwrap();
    let title_selector = Selector::parse("h4.mec-event-title").unwrap();
    let time_selector = Selector::parse(".mec-event-time").unwrap();
    let json_ld_selector = Selector::parse(r#"script[type="application/ld+json"]"#).unwrap();

    let mut activities: Vec<String> = Vec::new();
    let mut all_courses: Vec<String> = Vec::new();
    let mut courses: HashMap<String, Vec<String>> = HashMap::new();

    for n in 0..config.fetch_months {
        let ym = add_to_month(config.ym_start, n);
        let body = match get_events_html(ym) {
            Ok(body) => body,
            Err(err) => {
                // Best effort: one failing month must not hide the others.
                eprintln!("skipping month {}-{}: {}", ym.0, ym.1, err);
                continue;
            }
        };
        let fragment = Html::parse_fragment(&body);

        for day in fragment.select(&days_selector) {
            let date = day.value().attr("data-mec-cell").unwrap_or_default();
            let titles = day.select(&title_selector).map(get_inner_text);
            let times = day.select(&time_selector).map(get_inner_text);
            let descriptions = day
                .select(&json_ld_selector)
                .map(get_description_from_bad_json_ld);

            // Position of the event within its day; it makes the UID unique.
            let mut idx = 0u8;
            for (title, time, description) in izip!(titles, times, descriptions) {
                let event = create_event(&title, &description.unwrap_or_default(), date, time, idx);
                if title.starts_with("Cours") {
                    all_courses.push(event.clone());
                    courses.entry(title).or_insert_with(Vec::new).push(event);
                } else {
                    activities.push(event);
                }
                idx += 1;
            }
        }
    }

    let mut event_names: Vec<(String, String)> = Vec::new();
    event_names.push((
        write_cal(&config.out_dir, "activities", &activities)?,
        "Activities".into(),
    ));
    // The combined course calendar is written from `all_courses`, not from `activities`.
    event_names.push((
        write_cal(&config.out_dir, "all_courses", &all_courses)?,
        "All courses".into(),
    ));
    for (title, events) in &courses {
        event_names.push((write_cal(&config.out_dir, title, events)?, title.clone()));
    }

    write_index(&config.out_dir, event_names)
}