From a7c9748fcccef4b52748a1afa5f8f9c10d663cec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Lobo?= <30907944+joaodiaslobo@users.noreply.github.com>
Date: Sun, 8 Sep 2024 19:06:04 +0100
Subject: [PATCH] feat: make scraper script OS-agnostic for consistent execution (#224)

---
 scraper/README.md                               | 10 +++++-----
 scraper/main.py                                 |  8 ++++----
 scraper/modules/README.md                       | 10 +++++-----
 scraper/modules/schedule_scraper.py             |  2 +-
 scraper/modules/subjects_scraper.py             | 11 +++++++----
 scraper/modules/subjects_short_names_scraper.py |  5 +++--
 6 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/scraper/README.md b/scraper/README.md
index 8c7e588a..48f9c44b 100644
--- a/scraper/README.md
+++ b/scraper/README.md
@@ -35,7 +35,7 @@ sudo pacman -S geckodriver firefox # Arch
 
 | package     | usage                                                                                                                                      |
 | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
-| requests    | To download previous commits files from our GitHub page and scrape subjects short names                                                      |
+| requests    | To download previously committed files from our GitHub page and scrape the subjects' short names                                             |
 | unidecode   | To create short names to subjects (that weren't scraped), removing accents from chars. Ex.: Álgebra Linear para a Engenharia -> ÁLE -> ALE   |
 | selenium    | Used to scrape the webpage. On this case is impossible use libraries like `beautifulsoup` due the web stack used by UMinho                   |
 | geckodriver | A selenium dependency to interact with browsers                                                                                              |
@@ -51,17 +51,17 @@ $ python scraper/main.py
 
 ##### Subjects Short Names
 
-[Calendarium](https://calendario.cesium.di.uminho.pt/) use some short names to easily identify some subjects. This names were chosen on previous versions of `filters.json`. The scrap can be done combining the files `data/filter.json` and `data/shifts.json` from a specific commit (when this files were a manual scrap) from [Calendarium Github Page](https://github.com/cesium/calendarium).
+[Calendarium](https://calendario.cesium.di.uminho.pt/) uses some short names to easily identify some subjects. These names were chosen in previous versions of `filters.json`. The scrape can be done by combining the files `data/filter.json` and `data/shifts.json` from a specific commit (when these files were a manual scrape) from the [Calendarium GitHub page](https://github.com/cesium/calendarium).
 
-If not founded, `scraper/subjects_short_names.json` will be generated by the schedule scraper. Read more at [subjects short names](./modules/README.md#subjects_short_names).
+If not found, `scraper/subjects_short_names.json` will be generated by the schedule scraper. Read more at [subjects short names](./modules/README.md#subjects_short_names).
 
 ###### You can add manually names to this list
 
 ##### Subject IDs and Filter Ids
 
-[Calendarium](https://calendario.cesium.di.uminho.pt/) use a subject ID and a filterID. On UMinho Courses pages, a list of all subjects, ordered first by year/semesters and next by alphabetic order, and the subject IDs are given. This is everything we need to complete `shifts.json` and generate a basic `filters.json` to Calendarium.
+[Calendarium](https://calendario.cesium.di.uminho.pt/) uses a subject ID and a filter ID. The UMinho course pages give a list of all subjects, ordered first by year/semester and then alphabetically, together with the subject IDs. This is everything we need to complete `shifts.json` and generate a basic `filters.json` for Calendarium.
 
-If not founded, `scraper/subjects.json` will be generated by the schedule scraper. Read more at [subjects scraper documentation](./modules/README.md#subject-id-and-a-filter-id-scraper).
+If not found, `scraper/subjects.json` will be generated by the schedule scraper. Read more at [subjects scraper documentation](./modules/README.md#subject-id-and-a-filter-id-scraper).
 
 ###### You can add manually subjects to this list
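An aside on the `unidecode` row in the table above: the short-name derivation it describes can be sketched roughly as follows. This is a minimal illustration under assumed rules (take each significant word's initial, then strip accents); it is not the scraper's actual implementation, and the stop-word list is invented:

```python
# Rough sketch of the short-name idea: take the initial of each significant
# word, then strip accents with unidecode. The stop-word list is assumed.
from unidecode import unidecode

STOP_WORDS = {"para", "a", "de", "do", "da", "e", "em"}

def short_name(subject: str) -> str:
    initials = "".join(
        word[0] for word in subject.split() if word.lower() not in STOP_WORDS
    )
    return unidecode(initials.upper())

print(short_name("Álgebra Linear para a Engenharia"))  # ÁLE -> prints "ALE"
```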
diff --git a/scraper/main.py b/scraper/main.py
index ee5a0790..617f82bc 100644
--- a/scraper/main.py
+++ b/scraper/main.py
@@ -2,7 +2,7 @@
 
 from selenium import webdriver
 
-from os import chdir
+from os import chdir, path
 import json
 
 from modules.subjects_scraper import subjects_scraper
@@ -11,7 +11,7 @@
 
 # To prevent paths problems, the code need be executed from project root
-chdir(__file__.replace("scraper/main.py", ""))
+chdir(path.abspath(path.join(path.dirname(path.abspath(__file__)), "..")))
 
 print("Welcome to UMinho Schedule Scraper!")
@@ -33,14 +33,14 @@
 shifts += course_scraper(driver, "Mestrado em Engenharia Informática", subject_codes)
 
-with open("data/shifts.json", "w") as outfile:
+with open(path.join("data", "shifts.json"), "w", encoding="utf-8") as outfile:
     json.dump(shifts, outfile, indent=2, ensure_ascii=False)
 
 print(f"\nDone. Scraped {len(shifts)} shifts from the schedules!")
 print(f"Check them at data/shifts.json\n")
 
 filters = create_filters(shifts, subjects)
-with open("data/filters.json", "w") as outfile:
+with open(path.join("data", "filters.json"), "w", encoding="utf-8") as outfile:
     json.dump(filters, outfile, indent=2, ensure_ascii=False)
 
 print(f"\nDone. Stored {len(filters)} filters!")
diff --git a/scraper/modules/README.md b/scraper/modules/README.md
index 31fb9c81..673fa7b3 100644
--- a/scraper/modules/README.md
+++ b/scraper/modules/README.md
@@ -2,15 +2,15 @@
 
 ##### (subjects_short_names_scraper.py)
 
-[Calendarium](https://calendario.cesium.di.uminho.pt/) use some short names to easily identify some subjects. This names were chosen on previous versions of `filters.json`.
+[Calendarium](https://calendario.cesium.di.uminho.pt/) uses some short names to easily identify some subjects. These names were chosen in previous versions of `filters.json`.
 
 ### Scraping this values
 
-The scrap can be done combining the files `data/filter.json` and `data/shifts.json` from a specific commit (when this files were a manual scrap) from [Calendarium Github Page](https://github.com/cesium/calendarium).
+The scrape can be done by combining the files `data/filter.json` and `data/shifts.json` from a specific commit (when these files were a manual scrape) from the [Calendarium GitHub page](https://github.com/cesium/calendarium).
 
 #### Adding manual values
 
-If for some reason you want add some subjects (a new one) to this scrap, you can edit the dictionary `manual_subject_names` at `scraper/modules/subjects_short_names_scraper.py` file. Follow the next schema:
+If for some reason you want to add a new subject to this scrape, you can edit the `manual_subject_names` dictionary in the `scraper/modules/subjects_short_names_scraper.py` file, following this schema:
 
 ```python
 manual_subject_names = {
@@ -23,7 +23,7 @@ manual_subject_names = {
 
 #### Output
 
-If not founded, `scraper/subjects_short_names.json` will be generated by the schedule scraper.
+If not found, `scraper/subjects_short_names.json` will be generated by the schedule scraper.
 
 ## Subject ID and a Filter ID Scraper
@@ -35,7 +35,7 @@ If not founded, `scraper/subjects_short_names.json` will be generated by the sch
 filterId = f"{university_year}{university_semester}{subject_code}"
 ```
 
-Where the `subject code` is the position of the subject in an alphabetic ordered list. For example:
+Where the `subject code` is the position of the subject in an alphabetically ordered list. For example:
 
 ```python
 # 1st year & 1st semester subjects:
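To make the `filterId` rule just above concrete, here is a hypothetical worked example; the subject names and the 1-based position rule are assumptions for illustration, not data taken from the scraper:

```python
# Hypothetical filterId computation: year and semester digits followed by
# the subject's position (assumed 1-based) in the ordered subject list.
university_year = 1
university_semester = 1

# Illustrative subject list, already in alphabetical order:
subjects = ["Cálculo", "Programação Funcional", "Tópicos de Matemática"]

for subject_code, subject in enumerate(subjects, start=1):
    filterId = f"{university_year}{university_semester}{subject_code}"
    print(f"{subject} -> {filterId}")  # Cálculo -> 111, and so on
```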
diff --git a/scraper/modules/schedule_scraper.py b/scraper/modules/schedule_scraper.py
index c87762e1..a1b45bff 100644
--- a/scraper/modules/schedule_scraper.py
+++ b/scraper/modules/schedule_scraper.py
@@ -12,7 +12,7 @@ def schedule_scraper(driver: WebDriver, subject_codes: list[dict[str, int]]):
     Parameters
     ----------
     driver : WebDriver
-        The selenium driver. Need have the schedule ready
+        The selenium driver. Needs to have the schedule ready
     subject_codes : list[dict[str, int]]
         Every subject has its subject ID and filter ID. This IDs are stored on a list of dicts with the format:
diff --git a/scraper/modules/subjects_scraper.py b/scraper/modules/subjects_scraper.py
index ec08c46b..7d63cf5d 100644
--- a/scraper/modules/subjects_scraper.py
+++ b/scraper/modules/subjects_scraper.py
@@ -12,6 +12,7 @@
 from time import sleep
 from unidecode import unidecode
 from collections import Counter
+from os import path
 
 
 def subjects_scraper(driver: WebDriver):
@@ -35,13 +36,15 @@ def subjects_scraper(driver: WebDriver):
     }]
     """
 
+    subjects_short_names_path = path.join("scraper", "subjects_short_names.json")
+
     # To compatibility with old version of Calendarium, we use the subjects short names available at GitHub
     try:
         subjects_short_names = json.load(
-            open('scraper/subjects_short_names.json'))
+            open(subjects_short_names_path, encoding="utf-8"))
     except FileNotFoundError:
         get_subjects_short_names_scraper()
-        subjects_short_names = json.load(open('scraper/subjects_short_names.json'))
+        subjects_short_names = json.load(open(subjects_short_names_path, encoding="utf-8"))
 
     # This function will store the return at a file. If the file already exists, we can skip this function
     try:
@@ -87,7 +90,7 @@
 
     # =====================
     # Store the subjects
-    with open("scraper/subjects.json", "w") as outfile:
+    with open(path.join("scraper", "subjects.json"), "w", encoding="utf-8") as outfile:
         json.dump(subjects, outfile, indent=2, ensure_ascii=False)
 
     print(f"\nDone. Scraped {len(subjects)} subjects from the UMinho page!")
@@ -269,7 +272,7 @@ def scraper(driver: WebDriver, course_name: str, short_names, master: bool = Fal
 
 
 def get_subject_codes_from_file():
-    subjects_file = open("scraper/subjects.json", "r")
+    subjects_file = open(path.join("scraper", "subjects.json"), "r", encoding="utf-8")
     subjects = json.load(subjects_file)
 
     subject_codes = {}
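Both JSON caches touched above (`subjects_short_names.json` and `subjects.json`) follow the same load-or-regenerate shape. A minimal sketch of that pattern, using a hypothetical `load_or_regenerate` helper that does not exist in the codebase:

```python
import json


def load_or_regenerate(json_path, regenerate):
    """Load a cached JSON file; if it is missing, rebuild it once and retry."""
    try:
        with open(json_path, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        regenerate()  # expected to write json_path to disk
        with open(json_path, encoding="utf-8") as f:
            return json.load(f)
```

In `subjects_scraper.py` this would read as `load_or_regenerate(subjects_short_names_path, get_subjects_short_names_scraper)`.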
diff --git a/scraper/modules/subjects_short_names_scraper.py b/scraper/modules/subjects_short_names_scraper.py
index bcdf323e..b0d93231 100644
--- a/scraper/modules/subjects_short_names_scraper.py
+++ b/scraper/modules/subjects_short_names_scraper.py
@@ -2,6 +2,7 @@
 
 import json
 from requests import get
+from os import path
 
 
 manual_subject_names = {
@@ -94,7 +95,7 @@ def get_subjects_short_names_scraper():
 
     names = {}
 
-    print("Not founded info on `shifts.json` about:")
+    print("Couldn't find info in `shifts.json` about:")
 
     for subject in filters:
         filter_id = subject["id"]
@@ -121,7 +122,7 @@
     for subject in manual_subject_names.values():
         print("\t" + subject['name'])
 
-    with open("scraper/subjects_short_names.json", "w") as outfile:
+    with open(path.join("scraper", "subjects_short_names.json"), "w", encoding="utf-8") as outfile:
         json.dump(names, outfile, indent=2, ensure_ascii=False)
 
     print(f"\nDone. Stored {len(names)} names!")
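A closing note on the recurring `encoding="utf-8"` additions in this patch: the default text encoding of `open()` is platform-dependent (commonly cp1252 on Windows), so pinning it is what makes the JSON output byte-identical across operating systems. A small demonstration with made-up data:

```python
import json

# Made-up entry; subject names carry Portuguese accents.
names = {"112": {"name": "Álgebra Linear", "short_name": "AL"}}

# ensure_ascii=False writes "Álgebra" verbatim instead of "\u00c1lgebra",
# and the explicit encoding guarantees those characters are UTF-8 bytes
# regardless of the OS locale.
with open("subjects_short_names_demo.json", "w", encoding="utf-8") as outfile:
    json.dump(names, outfile, indent=2, ensure_ascii=False)
```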