3 месяцев назад · d743e16db0
--- a/README.md
+++ b/README.md
@@ -33,8 +33,8 @@ Execute the following to parse a class timetable PDF into the `classes.json` fil
 
															 - `-i`, `--input`: Path to the input PDF file. Defaults to `klassen.pdf`.
														
 
															 - `-o`, `--output`: Path to the output JSON file. Defaults to `classes.json`.
														
 
															 - `-l`, `--lecturers`: Path to the `lecturers.json` file. If provided, it is used to validate lecturer shorthands during parsing.
														
 
															-- `--save-intermediate`: Path to save the intermediate extraction data (pickle format) and exit. Useful for skipping the slow extraction stage in subsequent runs.
														
 
															-- `--load-intermediate`: Path to load the intermediate extraction data from (pickle format) and skip extraction.
														
 
															+- `--save-intermediate`: Path to save the intermediate extraction data (JSON format) and exit. Useful for skipping the slow extraction stage in subsequent runs.
														
 
															+- `--load-intermediate`: Path to load the intermediate extraction data from (JSON format) and skip extraction.
														
 
															 The default values for input and output files are defined in `config/constants.py`.
														
--- a/parse/above_table_text.py
+++ b/parse/above_table_text.py
@@ -20,14 +20,21 @@ def parse_above_table_text(
 
															         raise RuntimeError("Invalid Number of Lines.")
														
 
															     semester_type: SemesterType = get_semester_value(lines[0])
														
 
															-    semester: Semester = Semester(get_semester_year(lines[0]), semester_type)
														
 
															+    semester: Semester = Semester(
														
 
															+        yyyy=get_semester_year(lines[0]), semester_type=semester_type
														
 
															+    )
														
 
															     class_name: str = get_class_name(lines[2])
														
 
															     degree_program: DegreeProgram = get_degree_program(
														
 
															         lines[2], class_name, previous_page_metadata
														
 
															     )
														
 
															     export_timestamp: ExportTimestamp = get_export_timestamp(lines[1])
														
 
															-    return PageMetadata(semester, export_timestamp, class_name, degree_program)
														
 
															+    return PageMetadata(
														
 
															+        semester=semester,
														
 
															+        export_timestamp=export_timestamp,
														
 
															+        class_name=class_name,
														
 
															+        degree_program=degree_program,
														
 
															+    )
														
 
															 def get_export_timestamp(second_line: str) -> ExportTimestamp:
														
@@ -51,7 +58,10 @@ def get_export_timestamp(second_line: str) -> ExportTimestamp:
 
															     else:
														
 
															         raise RuntimeError("No Time found")
														
 
															-    return ExportTimestamp(Date(date_yyyy, date_mm, date_dd), Time(time_hh, time_mm))
														
 
															+    return ExportTimestamp(
														
 
															+        date=Date(yyyy=date_yyyy, mm=date_mm, dd=date_dd),
														
 
															+        time=Time(hh=time_hh, mm=time_mm),
														
 
															+    )
														
 
															 def get_class_name(third_line: str) -> str:
														
--- a/parse/geometry.py
+++ b/parse/geometry.py
@@ -27,7 +27,9 @@ def is_line_at_bottom(
 
															     if not is_vertical_match(line.y, area.y2, tolerance):
														
 
															         return False
														
 
															-    return has_horizontal_overlap(XLevel(line.x1, line.x2), XLevel(area.x1, area.x2))
														
 
															+    return has_horizontal_overlap(
														
 
															+        XLevel(x1=line.x1, x2=line.x2), XLevel(x1=area.x1, x2=area.x2)
														
 
															+    )
														
 
															 def is_area_below(area1: Area, area2: Area, tolerance: float = TOLERANCE) -> bool:
														
@@ -39,7 +41,7 @@ def is_area_below(area1: Area, area2: Area, tolerance: float = TOLERANCE) -> boo
 
															         return False
														
 
															     return has_horizontal_overlap(
														
 
															-        XLevel(area1.x1, area1.x2), XLevel(area2.x1, area2.x2)
														
 
															+        XLevel(x1=area1.x1, x2=area1.x2), XLevel(x1=area2.x1, x2=area2.x2)
														
 
															     )
														
--- a/parse/models.py
+++ b/parse/models.py
@@ -1,31 +1,26 @@
 
															 from enum import Enum, unique
														
 
															-from dataclasses import dataclass
														
 
															-from typing import Annotated
														
 
															+from typing import Annotated, Any
														
 
															-from pydantic import BaseModel, PlainSerializer, Field, ConfigDict
														
 
															+from pydantic import BaseModel, PlainSerializer, Field, ConfigDict, BeforeValidator
														
 
															-@dataclass
														
 
															-class XLevel:
														
 
															+class XLevel(BaseModel):
														
 
															     x1: float
														
 
															     x2: float
														
 
															-@dataclass
														
 
															-class YLevel:
														
 
															+class YLevel(BaseModel):
														
 
															     y1: float
														
 
															     y2: float
														
 
															-@dataclass
														
 
															-class HorizontalLine:
														
 
															+class HorizontalLine(BaseModel):
														
 
															     x1: float
														
 
															     x2: float
														
 
															     y: float
														
 
															-@dataclass
														
 
															-class Area:
														
 
															+class Area(BaseModel):
														
 
															     x1: float
														
 
															     y1: float
														
 
															     x2: float
														
@@ -53,27 +48,24 @@ class SemesterType(Enum):
 
															     FS = "Frühlingssemester"
														
 
															-@dataclass(frozen=True)
														
 
															-class Semester:
														
 
															+class Semester(BaseModel):
														
 
															+    model_config = ConfigDict(frozen=True)
														
 
															     yyyy: int
														
 
															     semester_type: SemesterType
														
 
															-@dataclass
														
 
															-class Date:
														
 
															+class Date(BaseModel):
														
 
															     yyyy: int
														
 
															     mm: int
														
 
															     dd: int
														
 
															-@dataclass
														
 
															-class Time:
														
 
															+class Time(BaseModel):
														
 
															     hh: int
														
 
															     mm: int
														
 
															-@dataclass
														
 
															-class ExportTimestamp:
														
 
															+class ExportTimestamp(BaseModel):
														
 
															     date: Date
														
 
															     time: Time
														
@@ -96,22 +88,20 @@ class DegreeProgram(Enum):
 
															     AGNOSTIC = "SG-???"
														
 
															-@dataclass
														
 
															-class PageMetadata:
														
 
															+class PageMetadata(BaseModel):
														
 
															     semester: Semester
														
 
															     export_timestamp: ExportTimestamp
														
 
															     class_name: str
														
 
															     degree_program: DegreeProgram
														
 
															-@dataclass
														
 
															-class UnmergedTimeEntries:
														
 
															+class UnmergedTimeEntries(BaseModel):
														
 
															     cells: list[Area]
														
 
															     horizontal_lines: list[HorizontalLine]
														
 
															-@dataclass(frozen=True)
														
 
															-class TimeSlot:
														
 
															+class TimeSlot(BaseModel):
														
 
															+    model_config = ConfigDict(frozen=True)
														
 
															     start_time: str
														
 
															     end_time: str
														
@@ -124,9 +114,18 @@ class TimeSlot:
 
															         return hours * 3600 + minutes * 60
														
 
															-@dataclass
														
 
															-class RawExtractedModule:
														
 
															-    weekday: Weekday
														
 
															+def to_tuple_if_list(v: Any) -> Any:
														
 
															+    if isinstance(v, list):
														
 
															+        return tuple(v)
														
 
															+    return v
														
 
															+
														
 
															+
														
 
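															+# JSON has no tuple type, so a Weekday value (presumably a tuple) dumped to the intermediate JSON file is read back as a list; convert it back to a tuple before pydantic validates it against the custom Weekday Enum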
														
 
															+TolerantWeekday = Annotated[Weekday, BeforeValidator(to_tuple_if_list)]
														
 
															+
														
 
															+
														
 
															+class RawExtractedModule(BaseModel):
														
 
															+    weekday: TolerantWeekday
														
 
															     start_seconds: int
														
 
															     end_seconds: int
														
 
															     text: str
														
@@ -141,8 +140,7 @@ class TeachingType(Enum):
 
															     BLOCK = "blockmodule"
														
 
															-@dataclass
														
 
															-class RawLecturer:
														
 
															+class RawLecturer(BaseModel):
														
 
															     """
														
 
															     Basic representation of an extracted lecturer from a pdf that needs to be parsed.
														
 
															     """
														
@@ -166,8 +164,7 @@ class Lecturer(BaseModel):
 
															 CustomWeekday = Annotated[Weekday, PlainSerializer(lambda v: v.index, return_type=int)]
														
 
															-@dataclass
														
 
															-class ParsedModuleCellTextData:
														
 
															+class ParsedModuleCellTextData(BaseModel):
														
 
															     module_shorthand: str
														
 
															     degree_program: DegreeProgram
														
 
															     class_name: str
														
@@ -193,13 +190,11 @@ class ClassJsonModule(BaseModel):
 
															     lecturer_shorthands: list[str] = Field(..., alias="teachers")
														
 
															-@dataclass
														
 
															-class ClassPdfExtractionPageData:
														
 
															+class ClassPdfExtractionPageData(BaseModel):
														
 
															     raw_extracted_modules: list[RawExtractedModule]
														
 
															     page_metadata: PageMetadata
														
 
															-@dataclass
														
 
															-class StartsWithMatch:
														
 
															+class StartsWithMatch(BaseModel):
														
 
															     shorthand_found: str
														
 
															     num_of_matches: int
														
--- a/parse/parse_modules.py
+++ b/parse/parse_modules.py
@@ -185,7 +185,9 @@ def matches_startswith(
 
															         if shorthand.startswith(word):
														
 
															             shorthand_with_start = shorthand
														
 
															             num_of_startwith_matches += 1
														
 
															-    return StartsWithMatch(shorthand_with_start, num_of_startwith_matches)
														
 
															+    return StartsWithMatch(
														
 
															+        shorthand_found=shorthand_with_start, num_of_matches=num_of_startwith_matches
														
 
															+    )
														
 
															 def get_module_shorthand(
														
--- a/parse/table_extraction.py
+++ b/parse/table_extraction.py
@@ -24,7 +24,8 @@ from .geometry import (
 
															 from .img import is_mostly_white_area
														
 
															 allowed_time_slots: list[TimeSlot] = [
														
 
															-    TimeSlot(*timeslot_tuple) for timeslot_tuple in ALLOWED_TIMESLOTS
														
 
															+    TimeSlot(start_time=timeslot_tuple[0], end_time=timeslot_tuple[1])
														
 
															+    for timeslot_tuple in ALLOWED_TIMESLOTS
														
 
															 ]
														
@@ -101,7 +102,9 @@ def get_modules_from_weekday(
 
															                     found_matching_next_cell_index
														
 
															                 )
														
 
															                 logging.debug("matched cell area: %s", matched_area)
														
 
															-                area = Area(area.x1, area.y1, matched_area.x2, matched_area.y2)
														
 
															+                area = Area(
														
 
															+                    x1=area.x1, y1=area.y1, x2=matched_area.x2, y2=matched_area.y2
														
 
															+                )
														
 
															         text = page.crop((area.x1, area.y1, area.x2, area.y2)).extract_text()
														
 
															         timeslot = get_timeslot_for_area(area, timeslot_y_levels)
														
@@ -109,7 +112,13 @@ def get_modules_from_weekday(
 
															             raise RuntimeError("Could not match TimeSlot to Cell Area")
														
 
															         end_seconds = timeslot.end_seconds()
														
 
															         modules.append(
														
 
															-            RawExtractedModule(weekday, start_seconds, end_seconds, text, page_number)
														
 
															+            RawExtractedModule(
														
 
															+                weekday=weekday,
														
 
															+                start_seconds=start_seconds,
														
 
															+                end_seconds=end_seconds,
														
 
															+                text=text,
														
 
															+                source_page_number=page_number,
														
 
															+            )
														
 
															         )
														
 
															     return modules
														
@@ -128,7 +137,7 @@ def extract_data_from_class_pdf(
 
															             weekday_areas: dict[Weekday, Area] = {}
														
 
															             timeslot_y_levels: dict[TimeSlot, YLevel] = {}
														
 
															             for day in Weekday:
														
 
															-                weekday_areas[day] = Area(0, 0, 0, 0)
														
 
															+                weekday_areas[day] = Area(x1=0, y1=0, x2=0, y2=0)
														
 
															             found_tables = page.find_tables(CLASS_PDF_TABLE_SETTINGS)
														
 
															             logging.debug(
														
@@ -179,7 +188,7 @@ def extract_data_from_class_pdf(
 
															                             if weekday_enum:
														
 
															                                 logging.debug("Weekday %s found", cell_text)
														
 
															                                 weekday_areas[weekday_enum] = Area(
														
 
															-                                    cell[0], cell[3], cell[2], 0
														
 
															+                                    x1=cell[0], y1=cell[3], x2=cell[2], y2=0
														
 
															                                 )
														
 
															                 else:
														
 
															                     logging.debug("row: %d, col: %d", row_index, 0)
														
@@ -200,22 +209,26 @@ def extract_data_from_class_pdf(
 
															                             )
														
 
															                         else:
														
 
															                             # assumes this is the last timeslot ever
														
 
															-                            if target_timeslot == TimeSlot("20:30", "21:15"):
														
 
															+                            if target_timeslot == TimeSlot(
														
 
															+                                start_time="20:30", end_time="21:15"
														
 
															+                            ):
														
 
															                                 for weekday in Weekday:
														
 
															                                     new_area = Area(
														
 
															-                                        weekday_areas[weekday].x1,
														
 
															-                                        weekday_areas[weekday].y1,
														
 
															-                                        weekday_areas[weekday].x2,
														
 
															-                                        cell[3],
														
 
															+                                        x1=weekday_areas[weekday].x1,
														
 
															+                                        y1=weekday_areas[weekday].y1,
														
 
															+                                        x2=weekday_areas[weekday].x2,
														
 
															+                                        y2=cell[3],
														
 
															                                     )
														
 
															                                     weekday_areas[weekday] = new_area
														
 
															                             timeslot_y_levels[target_timeslot] = YLevel(
														
 
															-                                cell[1], cell[3]
														
 
															+                                y1=cell[1], y2=cell[3]
														
 
															                             )
														
 
															                             expected_timeslot_index += 1
														
 
															             for weekday in Weekday:
														
 
															-                unmerged_time_entries_by_weekday[weekday] = UnmergedTimeEntries([], [])
														
 
															+                unmerged_time_entries_by_weekday[weekday] = UnmergedTimeEntries(
														
 
															+                    cells=[], horizontal_lines=[]
														
 
															+                )
														
 
															                 target_area = weekday_areas[weekday]
														
 
															                 logging.debug("target_area: %s", target_area)
														
@@ -234,7 +247,9 @@ def extract_data_from_class_pdf(
 
															                         ):
														
 
															                             cell_dimensions = cell[0], cell[1], cell[2], cell[3]
														
 
															                             unmerged_time_entries_by_weekday[weekday].cells.append(
														
 
															-                                Area(*cell_dimensions)
														
 
															+                                Area(
														
 
															+                                    x1=cell[0], y1=cell[1], x2=cell[2], y2=cell[3]
														
 
															+                                )
														
 
															                             )
														
 
															                             logging.debug("%s cell found", weekday)
														
@@ -254,7 +269,7 @@ def extract_data_from_class_pdf(
 
															                         unmerged_time_entries_by_weekday[
														
 
															                             weekday
														
 
															                         ].horizontal_lines.append(
														
 
															-                            HorizontalLine(line_x1, line_x2, line_bottom)
														
 
															+                            HorizontalLine(x1=line_x1, x2=line_x2, y=line_bottom)
														
 
															                         )
														
 
															             all_modules: list[RawExtractedModule] = []
														
@@ -273,7 +288,9 @@ def extract_data_from_class_pdf(
 
															             )
														
 
															             previous_page_metadata.append(page_metadata)
														
 
															             extraction_data.append(
														
 
															-                ClassPdfExtractionPageData(all_modules, page_metadata)
														
 
															+                ClassPdfExtractionPageData(
														
 
															+                    raw_extracted_modules=all_modules, page_metadata=page_metadata
														
 
															+                )
														
 
															             )
														
 
															         return extraction_data
														
--- a/parse_class_pdf.py
+++ b/parse_class_pdf.py
@@ -2,8 +2,8 @@
 
															 import logging
														
 
															 from argparse import ArgumentParser
														
 
															-import pickle
														
 
															 import json
														
 
															+from pydantic import TypeAdapter
														
 
															 from parse import (
														
 
															     extract_data_from_class_pdf,
														
@@ -24,7 +24,7 @@ def get_valid_lecturers(file_path: str) -> list[str]:
 
															     """
														
 
															     valid_lecturers: list[str] = []
														
 
															     try:
														
 
															-        print(f"READING: '{file_path}'")
														
 
															+        logging.warning("reading lecturers file: '%s'", file_path)
														
 
															         with open(file_path, "r", encoding="utf-8") as f:
														
 
															             data = json.load(f)
														
 
															             if isinstance(data, list):
														
@@ -55,19 +55,19 @@ def main() -> None:
 
															     )
														
 
															     parser.add_argument(
														
 
															         "--save-intermediate",
														
 
															-        help="Path to save the intermediate extraction data (pickle format) and exit",
														
 
															+        help="Path to save the intermediate extraction data (JSON format) and exit",
														
 
															         default=None,
														
 
															     )
														
 
															     parser.add_argument(
														
 
															         "--load-intermediate",
														
 
															-        help="Path to load the intermediate extraction data from (pickle format) and skip extraction",
														
 
															+        help="Path to load the intermediate extraction data from (JSON format) and skip extraction",
														
 
															         default=None,
														
 
															     )
														
 
															     args = parser.parse_args()
														
 
															     lecturers_file = args.lecturers
														
 
															-    logging.basicConfig(level=logging.DEBUG)
														
 
															+    logging.basicConfig(level=logging.INFO)
														
 
															     valid_lecturer_shorthands: list[str] | None = None
														
 
															     if lecturers_file:
														
@@ -77,14 +77,20 @@ def main() -> None:
 
															     if args.load_intermediate:
														
 
															         logging.info("Loading intermediate data from %s", args.load_intermediate)
														
 
															-        with open(args.load_intermediate, "rb") as f:
														
 
															-            extraction_data = pickle.load(f)
														
 
															+        with open(args.load_intermediate, "r", encoding="utf-8") as f:
														
 
															+            extraction_data = TypeAdapter(
														
 
															+                list[ClassPdfExtractionPageData]
														
 
															+            ).validate_json(f.read())
														
 
															     else:
														
 
															         extraction_data = extract_data_from_class_pdf(args.input)
														
 
															         if args.save_intermediate:
														
 
															             logging.info("Saving intermediate data to %s", args.save_intermediate)
														
 
															-            with open(args.save_intermediate, "wb") as f:
														
 
															-                pickle.dump(extraction_data, f)
														
 
															+            with open(args.save_intermediate, "w", encoding="utf-8") as f:
														
 
															+                f.write(
														
 
															+                    TypeAdapter(list[ClassPdfExtractionPageData])
														
 
															+                    .dump_json(extraction_data)
														
 
															+                    .decode("utf-8")
														
 
															+                )
														
 
															             return
														
 
															     parsed_modules: list[ClassJsonModule] = [