Prechádzať zdrojové kódy

validate lecturer shorthands via provided lecturers.json + add argparse + de-dupe modules (fixes #1)

Noah Vogt 1 mesiac pred
rodič
commit
72fcd39e7e
5 zmenil súbory, kde vykonal 119 pridanie a 15 odobranie
  1. 13 1
      README.md
  2. 5 1
      parse/__init__.py
  3. 41 8
      parse/parse_modules.py
  4. 3 1
      parse/table_extraction.py
  5. 57 4
      parse_class_pdf.py

+ 13 - 1
README.md

@@ -6,7 +6,19 @@ Because the original repo only contains frontend code and data updates were slow
 
 ## Basic Usage
 
-After installing the [python3](https://www.python.org/) dependencies in `requirements.txt`, execute `parse_class_pdf.py` to parse a class timetable pdf. It reads from the filename constant `CLASS_PDF_INPUT_FILE` defined in `config/constants.py` and outputs in `CLASSES_JSON_OUTPUT_FILE`. By default, these files are `klassen.pdf` and `classes.json`.
+After installing the [python3](https://www.python.org/) dependencies in `requirements.txt`, execute `parse_class_pdf.py` to parse a class timetable PDF.
+
+```bash
+./parse_class_pdf.py [-h] [-l LECTURERS] [-i INPUT] [-o OUTPUT] [lecturers_pos]
+```
+
+### Arguments
+
+- `-i`, `--input`: Path to the input PDF file. Defaults to `klassen.pdf`.
+- `-o`, `--output`: Path to the output JSON file. Defaults to `classes.json`.
+- `-l`, `--lecturers` (or the positional `lecturers_pos`): Path to the `lecturers.json` file. If provided, it is used to validate lecturer shorthands during parsing. If both are given, `--lecturers` takes precedence.
+
+The default values for input and output files are defined in `config/constants.py`.
 
 ## Project Roadmap
 

+ 5 - 1
parse/__init__.py

@@ -1,3 +1,7 @@
 from .table_extraction import extract_data_from_class_pdf
-from .parse_modules import get_modules_json, get_modules_for_class_json
+from .parse_modules import (
+    get_modules_json,
+    get_modules_for_class_json,
+    deduplicate_modules,
+)
 from .models import ClassPdfExtractionPageData, ClassJsonModule

+ 41 - 8
parse/parse_modules.py

@@ -16,13 +16,16 @@ from .models import (
 
 
 def get_modules_for_class_json(
-    modules: list[RawExtractedModule], class_name: str, degree_program: DegreeProgram
+    modules: list[RawExtractedModule],
+    class_name: str,
+    degree_program: DegreeProgram,
+    valid_lecturer_shorthands: list[str] | None = None,
 ) -> list[ClassJsonModule]:
     output_modules: list[ClassJsonModule] = []
 
     for input_module in modules:
         parsed_data: ParsedModuleCellTextData = parse_module_cell_text(
-            input_module.text, class_name, degree_program
+            input_module.text, class_name, degree_program, valid_lecturer_shorthands
         )
 
         output_modules.append(
@@ -51,6 +54,20 @@ def get_modules_for_class_json(
     return output_modules
 
 
+def deduplicate_modules(modules: list[ClassJsonModule]) -> list[ClassJsonModule]:
+    """
+    De-duplicate modules based on their id field.
+
+    The first module seen for each id is kept, and the result preserves that
+    first-seen order. When a later module shares an id, only its pages are
+    merged into the kept module's pages (duplicates removed, sorted); its
+    other fields are ignored. The kept module object is updated in place.
+    """
+    unique_modules_map: dict[str, ClassJsonModule] = {}
+    for module in modules:
+        existing_module = unique_modules_map.get(module.id)
+        if existing_module is None:
+            unique_modules_map[module.id] = module
+        else:
+            # sorted() accepts any iterable, so no intermediate list is needed
+            existing_module.pages = sorted(
+                set(existing_module.pages + module.pages)
+            )
+    return list(unique_modules_map.values())
+
+
 def get_modules_json(modules: List[ClassJsonModule]) -> str:
     """
     Serializes a list of ClassJsonModule objects into a formatted JSON string.
@@ -72,7 +89,10 @@ def parse_mixed_degree_programs(
 
 
 def parse_module_cell_text(
-    text: str, class_name: str, degree_program: DegreeProgram
+    text: str,
+    class_name: str,
+    degree_program: DegreeProgram,
+    valid_lecturer_shorthands: list[str] | None = None,
 ) -> ParsedModuleCellTextData:
     lines = text.split("\n")
     logging.debug("Parsing module cell text: \n%s", text)
@@ -94,16 +114,29 @@ def parse_module_cell_text(
         rooms=rooms,
         part_of_other_classes=[],
         teaching_type=teaching_type,
-        lecturer_shortnames=get_lecturer_shortnames(lines[1]),
+        lecturer_shortnames=get_lecturer_shortnames(
+            lines[1], valid_lecturer_shorthands
+        ),
     )
 
 
-def get_lecturer_shortnames(second_line: str) -> list[str]:
+def get_lecturer_shortnames(
+    second_line: str, valid_lecturer_shorthands: list[str] | None = None
+) -> list[str]:
+    """
+    Collect the lecturer shorthands found in the second line of a module cell.
+
+    The line is split on single spaces. Without a validation list, every word
+    of length LECTURER_SHORTHAND_SIZE is returned. With one, a word is kept if
+    it is a known shorthand, or if it has length LECTURER_SHORTHAND_SIZE and
+    is a prefix of at least one known shorthand.
+    """
     lecturer_shorthands: list[str] = []
     words = second_line.split(" ")
-    for word in words:
-        if len(word) == LECTURER_SHORTHAND_SIZE:
-            lecturer_shorthands.append(word)
+    if valid_lecturer_shorthands is None:
+        for word in words:
+            if len(word) == LECTURER_SHORTHAND_SIZE:
+                lecturer_shorthands.append(word)
+    else:
+        for word in words:
+            # any() is required here: a bare generator expression is always
+            # truthy, which would accept every word regardless of the list
+            if word in valid_lecturer_shorthands or (
+                len(word) == LECTURER_SHORTHAND_SIZE
+                and any(
+                    shorthand.startswith(word)
+                    for shorthand in valid_lecturer_shorthands
+                )
+            ):
+                lecturer_shorthands.append(word)
+
     return lecturer_shorthands
 
 

+ 3 - 1
parse/table_extraction.py

@@ -104,7 +104,9 @@ def get_modules_from_weekday(
     return modules
 
 
-def extract_data_from_class_pdf(input_filename: str) -> list[ClassPdfExtractionPageData]:
+def extract_data_from_class_pdf(
+    input_filename: str, lecturers_file=None
+) -> list[ClassPdfExtractionPageData]:
     """
     Extracts all data from class timetable pdf's
     """

+ 57 - 4
parse_class_pdf.py

@@ -1,11 +1,14 @@
 #!/usr/bin/env python3
 
 import logging
+import argparse
+import json
 
 from parse import (
     extract_data_from_class_pdf,
     get_modules_for_class_json,
     get_modules_json,
+    deduplicate_modules,
     ClassPdfExtractionPageData,
     ClassJsonModule,
 )
@@ -13,10 +16,58 @@ from parse import (
 from config import CLASS_PDF_INPUT_FILE, CLASSES_JSON_OUTPUT_FILE
 
 
+def get_valid_lecturers(file_path: str) -> list[str]:
+    """
+    Reads the lecturers JSON file and extracts a list of valid lecturer shorthands.
+
+    The file is expected to hold a JSON list of objects; the string value of
+    each object's "short" key is collected, and anything else is skipped.
+    If the file cannot be read or decoded, the error is logged and an empty
+    list is returned.
+    """
+    try:
+        with open(file_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+    except (OSError, ValueError) as e:
+        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError
+        logging.error("Failed to load valid lecturers from '%s': %s", file_path, e)
+        return []
+
+    entries = data if isinstance(data, list) else []
+    valid_lecturers: list[str] = [
+        entry["short"]
+        for entry in entries
+        if isinstance(entry, dict) and isinstance(entry.get("short"), str)
+    ]
+    logging.info("Loaded %d valid lecturers from %s", len(valid_lecturers), file_path)
+    return valid_lecturers
+
+
 def main() -> None:
+    parser = argparse.ArgumentParser(description="Parse class PDF to JSON.")
+    parser.add_argument(
+        "-l", "--lecturers", help="Path to the lecturers.json file", default=None
+    )
+    parser.add_argument(
+        "-i", "--input", help="Path to the input PDF file", default=CLASS_PDF_INPUT_FILE
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Path to the output JSON file",
+        default=CLASSES_JSON_OUTPUT_FILE,
+    )
+    parser.add_argument(
+        "lecturers_pos",
+        nargs="?",
+        help="Path to the lecturers.json file (positional)",
+        default=None,
+    )
+
+    args = parser.parse_args()
+    lecturers_file = args.lecturers or args.lecturers_pos
+
     logging.basicConfig(level=logging.DEBUG)
+
+    valid_lecturer_shorthands: list[str] | None = None
+    if lecturers_file:
+        valid_lecturer_shorthands = get_valid_lecturers(lecturers_file)
+
     extraction_data: list[ClassPdfExtractionPageData] = extract_data_from_class_pdf(
-        CLASS_PDF_INPUT_FILE
+        args.input
     )
     parsed_modules: list[ClassJsonModule] = [
         module
@@ -25,12 +76,14 @@ def main() -> None:
             data.raw_extracted_modules,
             data.page_metadata.class_name,
             data.page_metadata.degree_program,
+            valid_lecturer_shorthands,
         )
     ]
-    json: str = get_modules_json(parsed_modules)
+    parsed_modules = deduplicate_modules(parsed_modules)
+    json_output: str = get_modules_json(parsed_modules)
 
-    with open(CLASSES_JSON_OUTPUT_FILE, "w", encoding="utf-8") as f:
-        f.write(json)
+    with open(args.output, "w", encoding="utf-8") as f:
+        f.write(json_output)
 
 
 if __name__ == "__main__":