1 mesiac pred · 72fcd39e7e
--- a/README.md
+++ b/README.md
@@ -6,7 +6,19 @@ Because the original repo only contains frontend code and data updates were slow
 
				 
			
 
				 ## Basic Usage
			
 
				 
			
 
				-After installing the [python3](https://www.python.org/) dependencies in `requirements.txt`, execute `parse_class_pdf.py` to parse a class timetable pdf. It reads from the filename constant `CLASS_PDF_INPUT_FILE` defined in `config/constants.py` and outputs in `CLASSES_JSON_OUTPUT_FILE`. By default, these files are `klassen.pdf` and `classes.json`.
			
 
				+After installing the [python3](https://www.python.org/) dependencies in `requirements.txt`, execute `parse_class_pdf.py` to parse a class timetable PDF.
			
 
				+
			
 
				+```bash
			
 
				+./parse_class_pdf.py [-h] [-l LECTURERS] [-i INPUT] [-o OUTPUT] [lecturers_pos]
			
 
				+```
			
 
				+
			
 
				+### Arguments
			
 
				+
			
 
				+- `-i`, `--input`: Path to the input PDF file. Defaults to `klassen.pdf`.
			
 
				+- `-o`, `--output`: Path to the output JSON file. Defaults to `classes.json`.
			
 
				+- `-l`, `--lecturers` (or the optional positional argument `lecturers_pos`): Path to the `lecturers.json` file. If provided, it is used to validate lecturer shorthands during parsing. If both forms are given, `-l`/`--lecturers` takes precedence.
			
 
				+
			
 
				+The default values for input and output files are defined in `config/constants.py`.
			
 
				 
			
 
				 ## Project Roadmap
			
 
				 
			
--- a/parse/__init__.py
+++ b/parse/__init__.py
@@ -1,3 +1,7 @@
 
				 from .table_extraction import extract_data_from_class_pdf
			
 
				-from .parse_modules import get_modules_json, get_modules_for_class_json
			
 
				+from .parse_modules import (
			
 
				+    get_modules_json,
			
 
				+    get_modules_for_class_json,
			
 
				+    deduplicate_modules,
			
 
				+)
			
 
				 from .models import ClassPdfExtractionPageData, ClassJsonModule
			
--- a/parse/parse_modules.py
+++ b/parse/parse_modules.py
@@ -16,13 +16,16 @@ from .models import (
 
				 
			
 
				 
			
 
				 def get_modules_for_class_json(
			
 
				-    modules: list[RawExtractedModule], class_name: str, degree_program: DegreeProgram
			
 
				+    modules: list[RawExtractedModule],
			
 
				+    class_name: str,
			
 
				+    degree_program: DegreeProgram,
			
 
				+    valid_lecturer_shorthands: list[str] | None = None,
			
 
				 ) -> list[ClassJsonModule]:
			
 
				     output_modules: list[ClassJsonModule] = []
			
 
				 
			
 
				     for input_module in modules:
			
 
				         parsed_data: ParsedModuleCellTextData = parse_module_cell_text(
			
 
				-            input_module.text, class_name, degree_program
			
 
				+            input_module.text, class_name, degree_program, valid_lecturer_shorthands
			
 
				         )
			
 
				 
			
 
				         output_modules.append(
			
@@ -51,6 +54,20 @@ def get_modules_for_class_json(
 
				     return output_modules
			
 
				 
			
 
				 
			
 
				+def deduplicate_modules(modules: list[ClassJsonModule]) -> list[ClassJsonModule]:
			
 
				+    """De-duplicate modules by ``id``; pages of duplicates are merged (unique, sorted) into the first occurrence."""
			
 
				+    unique_modules_map: dict[str, ClassJsonModule] = {}
			
 
				+    for module in modules:
			
 
				+        if module.id in unique_modules_map:
			
 
				+            existing_module = unique_modules_map[module.id]
			
 
				+            existing_module.pages = sorted(  # updates the first-seen module in place
			
 
				+                set(existing_module.pages + module.pages)
			
 
				+            )
			
 
				+        else:
			
 
				+            unique_modules_map[module.id] = module
			
 
				+    return list(unique_modules_map.values())
			
 
				+
			
 
				+
			
 
				 def get_modules_json(modules: List[ClassJsonModule]) -> str:
			
 
				     """
			
 
				     Serializes a list of ClassJsonModule objects into a formatted JSON string.
			
@@ -72,7 +89,10 @@ def parse_mixed_degree_programs(
 
				 
			
 
				 
			
 
				 def parse_module_cell_text(
			
 
				-    text: str, class_name: str, degree_program: DegreeProgram
			
 
				+    text: str,
			
 
				+    class_name: str,
			
 
				+    degree_program: DegreeProgram,
			
 
				+    valid_lecturer_shorthands: list[str] | None = None,
			
 
				 ) -> ParsedModuleCellTextData:
			
 
				     lines = text.split("\n")
			
 
				     logging.debug("Parsing module cell text: \n%s", text)
			
@@ -94,16 +114,29 @@ def parse_module_cell_text(
 
				         rooms=rooms,
			
 
				         part_of_other_classes=[],
			
 
				         teaching_type=teaching_type,
			
 
				-        lecturer_shortnames=get_lecturer_shortnames(lines[1]),
			
 
				+        lecturer_shortnames=get_lecturer_shortnames(
			
 
				+            lines[1], valid_lecturer_shorthands
			
 
				+        ),
			
 
				     )
			
 
				 
			
 
				 
			
 
				-def get_lecturer_shortnames(second_line: str) -> list[str]:
			
 
				+def get_lecturer_shortnames(
			
 
				+    second_line: str, valid_lecturer_shorthands: list[str] | None = None
			
 
				+) -> list[str]:
			
 
				     lecturer_shorthands: list[str] = []
			
 
				     words = second_line.split(" ")
			
 
				-    for word in words:
			
 
				-        if len(word) == LECTURER_SHORTHAND_SIZE:
			
 
				-            lecturer_shorthands.append(word)
			
 
				+    if valid_lecturer_shorthands is None:  # no allow-list: keep every word of shorthand length
			
 
				+        for word in words:
			
 
				+            if len(word) == LECTURER_SHORTHAND_SIZE:
			
 
				+                lecturer_shorthands.append(word)
			
 
				+    else:  # allow-list: keep exact matches, or shorthand-length words that prefix a known shorthand
			
 
				+        for word in words:
			
 
				+            if word in valid_lecturer_shorthands or any(
			
 
				+                len(word) == LECTURER_SHORTHAND_SIZE and shorthand.startswith(word)
			
 
				+                for shorthand in valid_lecturer_shorthands
			
 
				+            ):
			
 
				+                lecturer_shorthands.append(word)
			
 
				+
			
 
				     return lecturer_shorthands
			
 
				 
			
 
				 
			
--- a/parse/table_extraction.py
+++ b/parse/table_extraction.py
@@ -104,7 +104,9 @@ def get_modules_from_weekday(
 
				     return modules
			
 
				 
			
 
				 
			
 
				-def extract_data_from_class_pdf(input_filename: str) -> list[ClassPdfExtractionPageData]:
			
 
				+def extract_data_from_class_pdf(
			
 
				+    input_filename: str, lecturers_file=None
			
 
				+) -> list[ClassPdfExtractionPageData]:
			
 
				     """
			
 
				     Extracts all data from class timetable pdf's
			
 
				     """
			
--- a/parse_class_pdf.py
+++ b/parse_class_pdf.py
@@ -1,11 +1,14 @@
 
				 #!/usr/bin/env python3
			
 
				 
			
 
				 import logging
			
 
				+import argparse
			
 
				+import json
			
 
				 
			
 
				 from parse import (
			
 
				     extract_data_from_class_pdf,
			
 
				     get_modules_for_class_json,
			
 
				     get_modules_json,
			
 
				+    deduplicate_modules,
			
 
				     ClassPdfExtractionPageData,
			
 
				     ClassJsonModule,
			
 
				 )
			
@@ -13,10 +16,58 @@ from parse import (
 
				 from config import CLASS_PDF_INPUT_FILE, CLASSES_JSON_OUTPUT_FILE
			
 
				 
			
 
				 
			
 
				+def get_valid_lecturers(file_path: str) -> list[str]:
			
 
				+    """
			
 
				+    Reads the lecturers JSON file and returns the string `short` value of each entry; logs an error and returns an empty list if the file cannot be read or parsed.
			
 
				+    """
			
 
				+    valid_lecturers: list[str] = []
			
 
				+    try:
			
 
				+        with open(file_path, "r", encoding="utf-8") as f:
			
 
				+            data = json.load(f)
			
 
				+            if isinstance(data, list):
			
 
				+                for entry in data:
			
 
				+                    if isinstance(entry, dict) and isinstance(entry.get("short"), str):
			
 
				+                        valid_lecturers.append(entry["short"])
			
 
				+        logging.info(
			
 
				+            "Loaded %d valid lecturers from %s", len(valid_lecturers), file_path
			
 
				+        )
			
 
				+    except (OSError, ValueError) as e:
			
 
				+        logging.error("Failed to load valid lecturers from '%s': %s", file_path, e)
			
 
				+    return valid_lecturers
			
 
				+
			
 
				+
			
 
				 def main() -> None:
			
 
				+    parser = argparse.ArgumentParser(description="Parse class PDF to JSON.")
			
 
				+    parser.add_argument(
			
 
				+        "-l", "--lecturers", help="Path to the lecturers.json file", default=None
			
 
				+    )
			
 
				+    parser.add_argument(
			
 
				+        "-i", "--input", help="Path to the input PDF file", default=CLASS_PDF_INPUT_FILE
			
 
				+    )
			
 
				+    parser.add_argument(
			
 
				+        "-o",
			
 
				+        "--output",
			
 
				+        help="Path to the output JSON file",
			
 
				+        default=CLASSES_JSON_OUTPUT_FILE,
			
 
				+    )
			
 
				+    parser.add_argument(
			
 
				+        "lecturers_pos",
			
 
				+        nargs="?",
			
 
				+        help="Path to the lecturers.json file (positional)",
			
 
				+        default=None,
			
 
				+    )
			
 
				+
			
 
				+    args = parser.parse_args()
			
 
				+    lecturers_file = args.lecturers or args.lecturers_pos
			
 
				+
			
 
				     logging.basicConfig(level=logging.DEBUG)
			
 
				+
			
 
				+    valid_lecturer_shorthands: list[str] | None = None
			
 
				+    if lecturers_file:
			
 
				+        valid_lecturer_shorthands = get_valid_lecturers(lecturers_file)
			
 
				+
			
 
				     extraction_data: list[ClassPdfExtractionPageData] = extract_data_from_class_pdf(
			
 
				-        CLASS_PDF_INPUT_FILE
			
 
				+        args.input
			
 
				     )
			
 
				     parsed_modules: list[ClassJsonModule] = [
			
 
				         module
			
@@ -25,12 +76,14 @@ def main() -> None:
 
				             data.raw_extracted_modules,
			
 
				             data.page_metadata.class_name,
			
 
				             data.page_metadata.degree_program,
			
 
				+            valid_lecturer_shorthands,
			
 
				         )
			
 
				     ]
			
 
				-    json: str = get_modules_json(parsed_modules)
			
 
				+    parsed_modules = deduplicate_modules(parsed_modules)
			
 
				+    json_output: str = get_modules_json(parsed_modules)
			
 
				 
			
 
				-    with open(CLASSES_JSON_OUTPUT_FILE, "w", encoding="utf-8") as f:
			
 
				-        f.write(json)
			
 
				+    with open(args.output, "w", encoding="utf-8") as f:
			
 
				+        f.write(json_output)
			
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":