feat: make scraper script OS-agnostic for consistent execution (#224)
joaodiaslobo committed Sep 8, 2024
1 parent 23e5404 commit a7c9748
Showing 6 changed files with 25 additions and 21 deletions.
10 changes: 5 additions & 5 deletions scraper/README.md
@@ -35,7 +35,7 @@ sudo pacman -S geckodriver firefox # Arch

| package | usage |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| requests | To download previous commits files from our GitHub page and scrape subjects short names |
| requests | To download previously committed files from our GitHub page and scrape subjects' short names |
| unidecode | To create short names for subjects (that weren't scraped) by removing accents from characters. E.g.: Álgebra Linear para a Engenharia -> ÁLE -> ALE |
| selenium | Used to scrape the webpage. In this case it is impossible to use libraries like `beautifulsoup` due to the web stack used by UMinho |
| geckodriver | A selenium dependency to interact with browsers |
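
For illustration, a minimal sketch of how such a short name could be derived with `unidecode`, assuming the abbreviation is built from the initials of the capitalized words (the `make_short_name` helper is hypothetical, not part of the scraper):

```python
from unidecode import unidecode

def make_short_name(subject: str) -> str:
    # Keep the initials of the capitalized words:
    # "Álgebra Linear para a Engenharia" -> "ÁLE"
    initials = "".join(word[0] for word in subject.split() if word[0].isupper())
    # Strip accents so the result is plain ASCII: "ÁLE" -> "ALE"
    return unidecode(initials)

print(make_short_name("Álgebra Linear para a Engenharia"))  # ALE
```
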
@@ -51,17 +51,17 @@ $ python scraper/main.py

##### Subjects Short Names

[Calendarium](https://calendario.cesium.di.uminho.pt/) use some short names to easily identify some subjects. This names were chosen on previous versions of `filters.json`. The scrap can be done combining the files `data/filter.json` and `data/shifts.json` from a specific commit (when this files were a manual scrap) from [Calendarium Github Page](https://github.com/cesium/calendarium).
[Calendarium](https://calendario.cesium.di.uminho.pt/) uses some short names to easily identify some subjects. These names were chosen in previous versions of `filters.json`. The scrape can be done by combining the files `data/filters.json` and `data/shifts.json` from a specific commit (when these files were a manual scrape) from the [Calendarium GitHub page](https://github.com/cesium/calendarium).

If not founded, `scraper/subjects_short_names.json` will be generated by the schedule scraper. Read more at [subjects short names](./modules/README.md#subjects_short_names).
If not found, `scraper/subjects_short_names.json` will be generated by the schedule scraper. Read more at [subjects short names](./modules/README.md#subjects_short_names).

###### You can manually add names to this list

##### Subject IDs and Filter IDs

[Calendarium](https://calendario.cesium.di.uminho.pt/) use a subject ID and a filterID. On UMinho Courses pages, a list of all subjects, ordered first by year/semesters and next by alphabetic order, and the subject IDs are given. This is everything we need to complete `shifts.json` and generate a basic `filters.json` to Calendarium.
[Calendarium](https://calendario.cesium.di.uminho.pt/) uses a subject ID and a filter ID. On the UMinho course pages, a list of all subjects, ordered first by year/semester and then alphabetically, is given along with the subject IDs. This is everything we need to complete `shifts.json` and generate a basic `filters.json` for Calendarium.
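
As documented in the [modules README](./modules/README.md#subject-id-and-a-filter-id-scraper), the filter ID concatenates the year, the semester and the subject code; a minimal worked example with illustrative values (only the format string comes from that README):

```python
university_year = 1      # 1st year
university_semester = 1  # 1st semester
subject_code = 2         # position of the subject in the alphabetically ordered list

# Format taken from the modules README
filterId = f"{university_year}{university_semester}{subject_code}"
print(filterId)  # 112
```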

If not founded, `scraper/subjects.json` will be generated by the schedule scraper. Read more at [subjects scraper documentation](./modules/README.md#subject-id-and-a-filter-id-scraper).
If not found, `scraper/subjects.json` will be generated by the schedule scraper. Read more at [subjects scraper documentation](./modules/README.md#subject-id-and-a-filter-id-scraper).

###### You can manually add subjects to this list

8 changes: 4 additions & 4 deletions scraper/main.py
@@ -2,7 +2,7 @@

from selenium import webdriver

from os import chdir
from os import chdir, path
import json

from modules.subjects_scraper import subjects_scraper
@@ -11,7 +11,7 @@


# To prevent path problems, the code needs to be executed from the project root
chdir(__file__.replace("scraper/main.py", ""))
chdir(path.abspath(path.join(path.dirname(path.abspath(__file__)), "..")))

print("Welcome to UMinho Schedule Scraper!")

@@ -33,14 +33,14 @@
shifts += course_scraper(driver,
"Mestrado em Engenharia Informática", subject_codes)

with open("data/shifts.json", "w") as outfile:
with open(path.join("data", "shifts.json"), "w", encoding="utf-8") as outfile:
json.dump(shifts, outfile, indent=2, ensure_ascii=False)

print(f"\nDone. Scraped {len(shifts)} shifts from the schedules!")
print(f"Check them at data/shifts.json\n")

filters = create_filters(shifts, subjects)
with open("data/filters.json", "w") as outfile:
with open(path.join("data", "filters.json"), "w", encoding="utf-8") as outfile:
json.dump(filters, outfile, indent=2, ensure_ascii=False)

print(f"\nDone. Stored {len(filters)} filters!")
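
The old `__file__.replace("scraper/main.py", "")` only matched when the interpreter reported the path with forward slashes, so it broke on Windows; the `os.path` version above resolves the project root regardless of the platform's separator. A minimal equivalent sketch using `pathlib` (an alternative, not what this commit uses):

```python
from os import chdir
from pathlib import Path

# Resolve this file's directory and move one level up to the project root,
# independent of the OS path separator.
chdir(Path(__file__).resolve().parent.parent)
```
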
10 changes: 5 additions & 5 deletions scraper/modules/README.md
@@ -2,15 +2,15 @@

##### (subjects_short_names_scraper.py)

[Calendarium](https://calendario.cesium.di.uminho.pt/) use some short names to easily identify some subjects. This names were chosen on previous versions of `filters.json`.
[Calendarium](https://calendario.cesium.di.uminho.pt/) uses some short names to easily identify some subjects. These names were chosen in previous versions of `filters.json`.

### Scraping these values

The scrap can be done combining the files `data/filter.json` and `data/shifts.json` from a specific commit (when this files were a manual scrap) from [Calendarium Github Page](https://github.com/cesium/calendarium).
The scrape can be done by combining the files `data/filters.json` and `data/shifts.json` from a specific commit (when these files were a manual scrape) from the [Calendarium GitHub page](https://github.com/cesium/calendarium).
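
As a rough sketch of that step, the two files could be fetched from a pinned commit through `raw.githubusercontent.com` (the commit hash below is a placeholder, and this is not necessarily the exact request the scraper performs):

```python
from requests import get

REPO = "https://raw.githubusercontent.com/cesium/calendarium"
COMMIT = "<commit-hash>"  # placeholder: a commit where the files were still maintained by hand

# Download both JSON files as they existed at that commit
filters = get(f"{REPO}/{COMMIT}/data/filters.json").json()
shifts = get(f"{REPO}/{COMMIT}/data/shifts.json").json()
```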

#### Adding manual values

If for some reason you want add some subjects (a new one) to this scrap, you can edit the dictionary `manual_subject_names` at `scraper/modules/subjects_short_names_scraper.py` file. Follow the next schema:
If for some reason you want to add a subject (a new one) to this scrape, you can edit the `manual_subject_names` dictionary in the `scraper/modules/subjects_short_names_scraper.py` file, following this schema:

```python
manual_subject_names = {
@@ -23,7 +23,7 @@ manual_subject_names = {

#### Output

If not founded, `scraper/subjects_short_names.json` will be generated by the schedule scraper.
If not found, `scraper/subjects_short_names.json` will be generated by the schedule scraper.

## Subject ID and a Filter ID Scraper

@@ -35,7 +35,7 @@ If not founded, `scraper/subjects_short_names.json` will be generated by the sch
filterId = f"{university_year}{university_semester}{subject_code}"
```

Where the `subject code` is the position of the subject in an alphabetic ordered list. For example:
Where the `subject code` is the position of the subject in an alphabetically ordered list. For example:

```python
# 1st year & 1st semester subjects:
2 changes: 1 addition & 1 deletion scraper/modules/schedule_scraper.py
@@ -12,7 +12,7 @@ def schedule_scraper(driver: WebDriver, subject_codes: list[dict[str, int]]):
Parameters
----------
driver : WebDriver
The selenium driver. Need have the schedule ready
The selenium driver. Needs to have the schedule ready
subject_codes : list[dict[str, int]]
Every subject has its subject ID and filter ID. These IDs are stored in a list of dicts with the format:
11 changes: 7 additions & 4 deletions scraper/modules/subjects_scraper.py
@@ -12,6 +12,7 @@
from time import sleep
from unidecode import unidecode
from collections import Counter
from os import path


def subjects_scraper(driver: WebDriver):
@@ -35,13 +36,15 @@ def subjects_scraper(driver: WebDriver):
}]
"""

subjects_short_names_path = path.join("scraper", "subjects_short_names.json")

# For compatibility with old versions of Calendarium, we use the subjects' short names available on GitHub
try:
subjects_short_names = json.load(
open('scraper/subjects_short_names.json'))
open(subjects_short_names_path, encoding="utf-8"))
except FileNotFoundError:
get_subjects_short_names_scraper()
subjects_short_names = json.load(open('scraper/subjects_short_names.json'))
subjects_short_names = json.load(open(subjects_short_names_path, encoding="utf-8"))

# This function will store its result in a file. If the file already exists, we can skip it
try:
@@ -87,7 +90,7 @@ def subjects_scraper(driver: WebDriver):
# =====================

# Store the subjects
with open("scraper/subjects.json", "w") as outfile:
with open(path.join("scraper", "subjects.json"), "w", encoding="utf-8") as outfile:
json.dump(subjects, outfile, indent=2, ensure_ascii=False)

print(f"\nDone. Scraped {len(subjects)} subjects from the UMinho page!")
@@ -269,7 +272,7 @@ def scraper(driver: WebDriver, course_name: str, short_names, master: bool = Fal


def get_subject_codes_from_file():
subjects_file = open("scraper/subjects.json", "r")
subjects_file = open(path.join("scraper", "subjects.json"), "r", encoding="utf-8")

subjects = json.load(subjects_file)
subject_codes = {}
5 changes: 3 additions & 2 deletions scraper/modules/subjects_short_names_scraper.py
@@ -2,6 +2,7 @@

import json
from requests import get
from os import path

manual_subject_names = {

@@ -94,7 +95,7 @@ def get_subjects_short_names_scraper():

names = {}

print("Not founded info on `shifts.json` about:")
print("Couldn't find info on `shifts.json` about:")

for subject in filters:
filter_id = subject["id"]
@@ -121,7 +122,7 @@
for subject in manual_subject_names.values():
print("\t" + subject['name'])

with open("scraper/subjects_short_names.json", "w") as outfile:
with open(path.join("scraper", "subjects_short_names.json"), "w", encoding="utf-8") as outfile:
json.dump(names, outfile, indent=2, ensure_ascii=False)

print(f"\nDone. Stored {len(names)} names!")
Expand Down
