فهرست منبع

now use json for the intermediate save after pdf extraction

Noah Vogt 2 ماه پیش
والد
کامیت
d743e16db0
7 فایل تغییر یافته به همراه 100 افزوده شده و 68 حذف شده
  1. 2 2
      README.md
  2. 13 3
      parse/above_table_text.py
  3. 4 2
      parse/geometry.py
  4. 31 36
      parse/models.py
  5. 3 1
      parse/parse_modules.py
  6. 32 15
      parse/table_extraction.py
  7. 15 9
      parse_class_pdf.py

+ 2 - 2
README.md

@@ -33,8 +33,8 @@ Execute the following to parse a class timetable PDF into the `classes.json` fil
 - `-i`, `--input`: Path to the input PDF file. Defaults to `klassen.pdf`.
 - `-o`, `--output`: Path to the output JSON file. Defaults to `classes.json`.
 - `-l`, `--lecturers`: Path to the `lecturers.json` file. If provided, it is used to validate lecturer shorthands during parsing.
-- `--save-intermediate`: Path to save the intermediate extraction data (pickle format) and exit. Useful for skipping the slow extraction stage in subsequent runs.
-- `--load-intermediate`: Path to load the intermediate extraction data from (pickle format) and skip extraction.
+- `--save-intermediate`: Path to save the intermediate extraction data (JSON format) and exit. Useful for skipping the slow extraction stage in subsequent runs.
+- `--load-intermediate`: Path to load the intermediate extraction data from (JSON format) and skip extraction.
 
 The default values for input and output files are defined in `config/constants.py`.
 

+ 13 - 3
parse/above_table_text.py

@@ -20,14 +20,21 @@ def parse_above_table_text(
         raise RuntimeError("Invalid Number of Lines.")
 
     semester_type: SemesterType = get_semester_value(lines[0])
-    semester: Semester = Semester(get_semester_year(lines[0]), semester_type)
+    semester: Semester = Semester(
+        yyyy=get_semester_year(lines[0]), semester_type=semester_type
+    )
     class_name: str = get_class_name(lines[2])
     degree_program: DegreeProgram = get_degree_program(
         lines[2], class_name, previous_page_metadata
     )
     export_timestamp: ExportTimestamp = get_export_timestamp(lines[1])
 
-    return PageMetadata(semester, export_timestamp, class_name, degree_program)
+    return PageMetadata(
+        semester=semester,
+        export_timestamp=export_timestamp,
+        class_name=class_name,
+        degree_program=degree_program,
+    )
 
 
 def get_export_timestamp(second_line: str) -> ExportTimestamp:
@@ -51,7 +58,10 @@ def get_export_timestamp(second_line: str) -> ExportTimestamp:
     else:
         raise RuntimeError("No Time found")
 
-    return ExportTimestamp(Date(date_yyyy, date_mm, date_dd), Time(time_hh, time_mm))
+    return ExportTimestamp(
+        date=Date(yyyy=date_yyyy, mm=date_mm, dd=date_dd),
+        time=Time(hh=time_hh, mm=time_mm),
+    )
 
 
 def get_class_name(third_line: str) -> str:

+ 4 - 2
parse/geometry.py

@@ -27,7 +27,9 @@ def is_line_at_bottom(
     if not is_vertical_match(line.y, area.y2, tolerance):
         return False
 
-    return has_horizontal_overlap(XLevel(line.x1, line.x2), XLevel(area.x1, area.x2))
+    return has_horizontal_overlap(
+        XLevel(x1=line.x1, x2=line.x2), XLevel(x1=area.x1, x2=area.x2)
+    )
 
 
 def is_area_below(area1: Area, area2: Area, tolerance: float = TOLERANCE) -> bool:
@@ -39,7 +41,7 @@ def is_area_below(area1: Area, area2: Area, tolerance: float = TOLERANCE) -> boo
         return False
 
     return has_horizontal_overlap(
-        XLevel(area1.x1, area1.x2), XLevel(area2.x1, area2.x2)
+        XLevel(x1=area1.x1, x2=area1.x2), XLevel(x1=area2.x1, x2=area2.x2)
     )
 
 

+ 31 - 36
parse/models.py

@@ -1,31 +1,26 @@
 from enum import Enum, unique
-from dataclasses import dataclass
-from typing import Annotated
+from typing import Annotated, Any
 
-from pydantic import BaseModel, PlainSerializer, Field, ConfigDict
+from pydantic import BaseModel, PlainSerializer, Field, ConfigDict, BeforeValidator
 
 
-@dataclass
-class XLevel:
+class XLevel(BaseModel):
     x1: float
     x2: float
 
 
-@dataclass
-class YLevel:
+class YLevel(BaseModel):
     y1: float
     y2: float
 
 
-@dataclass
-class HorizontalLine:
+class HorizontalLine(BaseModel):
     x1: float
     x2: float
     y: float
 
 
-@dataclass
-class Area:
+class Area(BaseModel):
     x1: float
     y1: float
     x2: float
@@ -53,27 +48,24 @@ class SemesterType(Enum):
     FS = "Frühlingssemester"
 
 
-@dataclass(frozen=True)
-class Semester:
+class Semester(BaseModel):
+    model_config = ConfigDict(frozen=True)
     yyyy: int
     semester_type: SemesterType
 
 
-@dataclass
-class Date:
+class Date(BaseModel):
     yyyy: int
     mm: int
     dd: int
 
 
-@dataclass
-class Time:
+class Time(BaseModel):
     hh: int
     mm: int
 
 
-@dataclass
-class ExportTimestamp:
+class ExportTimestamp(BaseModel):
     date: Date
     time: Time
 
@@ -96,22 +88,20 @@ class DegreeProgram(Enum):
     AGNOSTIC = "SG-???"
 
 
-@dataclass
-class PageMetadata:
+class PageMetadata(BaseModel):
     semester: Semester
     export_timestamp: ExportTimestamp
     class_name: str
     degree_program: DegreeProgram
 
 
-@dataclass
-class UnmergedTimeEntries:
+class UnmergedTimeEntries(BaseModel):
     cells: list[Area]
     horizontal_lines: list[HorizontalLine]
 
 
-@dataclass(frozen=True)
-class TimeSlot:
+class TimeSlot(BaseModel):
+    model_config = ConfigDict(frozen=True)
     start_time: str
     end_time: str
 
@@ -124,9 +114,18 @@ class TimeSlot:
         return hours * 3600 + minutes * 60
 
 
-@dataclass
-class RawExtractedModule:
-    weekday: Weekday
+def to_tuple_if_list(v: Any) -> Any:
+    if isinstance(v, list):
+        return tuple(v)
+    return v
+
+
+# needed for pydantic to correctly parse the custom Weekday Enum
+TolerantWeekday = Annotated[Weekday, BeforeValidator(to_tuple_if_list)]
+
+
+class RawExtractedModule(BaseModel):
+    weekday: TolerantWeekday
     start_seconds: int
     end_seconds: int
     text: str
@@ -141,8 +140,7 @@ class TeachingType(Enum):
     BLOCK = "blockmodule"
 
 
-@dataclass
-class RawLecturer:
+class RawLecturer(BaseModel):
     """
     Basic representation of an extracted lecturer from a pdf that needs to be parsed.
     """
@@ -166,8 +164,7 @@ class Lecturer(BaseModel):
 CustomWeekday = Annotated[Weekday, PlainSerializer(lambda v: v.index, return_type=int)]
 
 
-@dataclass
-class ParsedModuleCellTextData:
+class ParsedModuleCellTextData(BaseModel):
     module_shorthand: str
     degree_program: DegreeProgram
     class_name: str
@@ -193,13 +190,11 @@ class ClassJsonModule(BaseModel):
     lecturer_shorthands: list[str] = Field(..., alias="teachers")
 
 
-@dataclass
-class ClassPdfExtractionPageData:
+class ClassPdfExtractionPageData(BaseModel):
     raw_extracted_modules: list[RawExtractedModule]
     page_metadata: PageMetadata
 
 
-@dataclass
-class StartsWithMatch:
+class StartsWithMatch(BaseModel):
     shorthand_found: str
     num_of_matches: int

+ 3 - 1
parse/parse_modules.py

@@ -185,7 +185,9 @@ def matches_startswith(
         if shorthand.startswith(word):
             shorthand_with_start = shorthand
             num_of_startwith_matches += 1
-    return StartsWithMatch(shorthand_with_start, num_of_startwith_matches)
+    return StartsWithMatch(
+        shorthand_found=shorthand_with_start, num_of_matches=num_of_startwith_matches
+    )
 
 
 def get_module_shorthand(

+ 32 - 15
parse/table_extraction.py

@@ -24,7 +24,8 @@ from .geometry import (
 from .img import is_mostly_white_area
 
 allowed_time_slots: list[TimeSlot] = [
-    TimeSlot(*timeslot_tuple) for timeslot_tuple in ALLOWED_TIMESLOTS
+    TimeSlot(start_time=timeslot_tuple[0], end_time=timeslot_tuple[1])
+    for timeslot_tuple in ALLOWED_TIMESLOTS
 ]
 
 
@@ -101,7 +102,9 @@ def get_modules_from_weekday(
                     found_matching_next_cell_index
                 )
                 logging.debug("matched cell area: %s", matched_area)
-                area = Area(area.x1, area.y1, matched_area.x2, matched_area.y2)
+                area = Area(
+                    x1=area.x1, y1=area.y1, x2=matched_area.x2, y2=matched_area.y2
+                )
 
         text = page.crop((area.x1, area.y1, area.x2, area.y2)).extract_text()
         timeslot = get_timeslot_for_area(area, timeslot_y_levels)
@@ -109,7 +112,13 @@ def get_modules_from_weekday(
             raise RuntimeError("Could not match TimeSlot to Cell Area")
         end_seconds = timeslot.end_seconds()
         modules.append(
-            RawExtractedModule(weekday, start_seconds, end_seconds, text, page_number)
+            RawExtractedModule(
+                weekday=weekday,
+                start_seconds=start_seconds,
+                end_seconds=end_seconds,
+                text=text,
+                source_page_number=page_number,
+            )
         )
     return modules
 
@@ -128,7 +137,7 @@ def extract_data_from_class_pdf(
             weekday_areas: dict[Weekday, Area] = {}
             timeslot_y_levels: dict[TimeSlot, YLevel] = {}
             for day in Weekday:
-                weekday_areas[day] = Area(0, 0, 0, 0)
+                weekday_areas[day] = Area(x1=0, y1=0, x2=0, y2=0)
 
             found_tables = page.find_tables(CLASS_PDF_TABLE_SETTINGS)
             logging.debug(
@@ -179,7 +188,7 @@ def extract_data_from_class_pdf(
                             if weekday_enum:
                                 logging.debug("Weekday %s found", cell_text)
                                 weekday_areas[weekday_enum] = Area(
-                                    cell[0], cell[3], cell[2], 0
+                                    x1=cell[0], y1=cell[3], x2=cell[2], y2=0
                                 )
                 else:
                     logging.debug("row: %d, col: %d", row_index, 0)
@@ -200,22 +209,26 @@ def extract_data_from_class_pdf(
                             )
                         else:
                             # assumes this is the last timeslot ever
-                            if target_timeslot == TimeSlot("20:30", "21:15"):
+                            if target_timeslot == TimeSlot(
+                                start_time="20:30", end_time="21:15"
+                            ):
                                 for weekday in Weekday:
                                     new_area = Area(
-                                        weekday_areas[weekday].x1,
-                                        weekday_areas[weekday].y1,
-                                        weekday_areas[weekday].x2,
-                                        cell[3],
+                                        x1=weekday_areas[weekday].x1,
+                                        y1=weekday_areas[weekday].y1,
+                                        x2=weekday_areas[weekday].x2,
+                                        y2=cell[3],
                                     )
                                     weekday_areas[weekday] = new_area
                             timeslot_y_levels[target_timeslot] = YLevel(
-                                cell[1], cell[3]
+                                y1=cell[1], y2=cell[3]
                             )
                             expected_timeslot_index += 1
 
             for weekday in Weekday:
-                unmerged_time_entries_by_weekday[weekday] = UnmergedTimeEntries([], [])
+                unmerged_time_entries_by_weekday[weekday] = UnmergedTimeEntries(
+                    cells=[], horizontal_lines=[]
+                )
                 target_area = weekday_areas[weekday]
                 logging.debug("target_area: %s", target_area)
 
@@ -234,7 +247,9 @@ def extract_data_from_class_pdf(
                         ):
                             cell_dimensions = cell[0], cell[1], cell[2], cell[3]
                             unmerged_time_entries_by_weekday[weekday].cells.append(
-                                Area(*cell_dimensions)
+                                Area(
+                                    x1=cell[0], y1=cell[1], x2=cell[2], y2=cell[3]
+                                )
                             )
                             logging.debug("%s cell found", weekday)
 
@@ -254,7 +269,7 @@ def extract_data_from_class_pdf(
                         unmerged_time_entries_by_weekday[
                             weekday
                         ].horizontal_lines.append(
-                            HorizontalLine(line_x1, line_x2, line_bottom)
+                            HorizontalLine(x1=line_x1, x2=line_x2, y=line_bottom)
                         )
 
             all_modules: list[RawExtractedModule] = []
@@ -273,7 +288,9 @@ def extract_data_from_class_pdf(
             )
             previous_page_metadata.append(page_metadata)
             extraction_data.append(
-                ClassPdfExtractionPageData(all_modules, page_metadata)
+                ClassPdfExtractionPageData(
+                    raw_extracted_modules=all_modules, page_metadata=page_metadata
+                )
             )
         return extraction_data
 

+ 15 - 9
parse_class_pdf.py

@@ -2,8 +2,8 @@
 
 import logging
 from argparse import ArgumentParser
-import pickle
 import json
+from pydantic import TypeAdapter
 
 from parse import (
     extract_data_from_class_pdf,
@@ -24,7 +24,7 @@ def get_valid_lecturers(file_path: str) -> list[str]:
     """
     valid_lecturers: list[str] = []
     try:
-        print(f"READING: '{file_path}'")
+        logging.warning("reading lecturers file: '%s'", file_path)
         with open(file_path, "r", encoding="utf-8") as f:
             data = json.load(f)
             if isinstance(data, list):
@@ -55,19 +55,19 @@ def main() -> None:
     )
     parser.add_argument(
         "--save-intermediate",
-        help="Path to save the intermediate extraction data (pickle format) and exit",
+        help="Path to save the intermediate extraction data (JSON format) and exit",
         default=None,
     )
     parser.add_argument(
         "--load-intermediate",
-        help="Path to load the intermediate extraction data from (pickle format) and skip extraction",
+        help="Path to load the intermediate extraction data from (JSON format) and skip extraction",
         default=None,
     )
 
     args = parser.parse_args()
     lecturers_file = args.lecturers
 
-    logging.basicConfig(level=logging.DEBUG)
+    logging.basicConfig(level=logging.INFO)
 
     valid_lecturer_shorthands: list[str] | None = None
     if lecturers_file:
@@ -77,14 +77,20 @@ def main() -> None:
 
     if args.load_intermediate:
         logging.info("Loading intermediate data from %s", args.load_intermediate)
-        with open(args.load_intermediate, "rb") as f:
-            extraction_data = pickle.load(f)
+        with open(args.load_intermediate, "r", encoding="utf-8") as f:
+            extraction_data = TypeAdapter(
+                list[ClassPdfExtractionPageData]
+            ).validate_json(f.read())
     else:
         extraction_data = extract_data_from_class_pdf(args.input)
         if args.save_intermediate:
             logging.info("Saving intermediate data to %s", args.save_intermediate)
-            with open(args.save_intermediate, "wb") as f:
-                pickle.dump(extraction_data, f)
+            with open(args.save_intermediate, "w", encoding="utf-8") as f:
+                f.write(
+                    TypeAdapter(list[ClassPdfExtractionPageData])
+                    .dump_json(extraction_data)
+                    .decode("utf-8")
+                )
             return
 
     parsed_modules: list[ClassJsonModule] = [