فهرست منبع

now use json for the intermediate save after pdf extraction

Noah Vogt 2 ماه پیش
والد
کامیت
d743e16db0
7 فایل تغییر یافته به همراه 100 افزوده شده و 68 حذف شده
  1. 2 2
      README.md
  2. 13 3
      parse/above_table_text.py
  3. 4 2
      parse/geometry.py
  4. 31 36
      parse/models.py
  5. 3 1
      parse/parse_modules.py
  6. 32 15
      parse/table_extraction.py
  7. 15 9
      parse_class_pdf.py

+ 2 - 2
README.md

@@ -33,8 +33,8 @@ Execute the following to parse a class timetable PDF into the `classes.json` fil
 - `-i`, `--input`: Path to the input PDF file. Defaults to `klassen.pdf`.
 - `-o`, `--output`: Path to the output JSON file. Defaults to `classes.json`.
 - `-l`, `--lecturers`: Path to the `lecturers.json` file. If provided, it is used to validate lecturer shorthands during parsing.
-- `--save-intermediate`: Path to save the intermediate extraction data (pickle format) and exit. Useful for skipping the slow extraction stage in subsequent runs.
-- `--load-intermediate`: Path to load the intermediate extraction data from (pickle format) and skip extraction.
+- `--save-intermediate`: Path to save the intermediate extraction data (JSON format) and exit. Useful for skipping the slow extraction stage in subsequent runs.
+- `--load-intermediate`: Path to load the intermediate extraction data from (JSON format) and skip extraction.
 
 The default values for input and output files are defined in `config/constants.py`.
 

+ 13 - 3
parse/above_table_text.py

@@ -20,14 +20,21 @@ def parse_above_table_text(
         raise RuntimeError("Invalid Number of Lines.")
 
     semester_type: SemesterType = get_semester_value(lines[0])
-    semester: Semester = Semester(get_semester_year(lines[0]), semester_type)
+    semester: Semester = Semester(
+        yyyy=get_semester_year(lines[0]), semester_type=semester_type
+    )
     class_name: str = get_class_name(lines[2])
     degree_program: DegreeProgram = get_degree_program(
         lines[2], class_name, previous_page_metadata
     )
     export_timestamp: ExportTimestamp = get_export_timestamp(lines[1])
 
-    return PageMetadata(semester, export_timestamp, class_name, degree_program)
+    return PageMetadata(
+        semester=semester,
+        export_timestamp=export_timestamp,
+        class_name=class_name,
+        degree_program=degree_program,
+    )
 
 
 def get_export_timestamp(second_line: str) -> ExportTimestamp:
@@ -51,7 +58,10 @@ def get_export_timestamp(second_line: str) -> ExportTimestamp:
     else:
         raise RuntimeError("No Time found")
 
-    return ExportTimestamp(Date(date_yyyy, date_mm, date_dd), Time(time_hh, time_mm))
+    return ExportTimestamp(
+        date=Date(yyyy=date_yyyy, mm=date_mm, dd=date_dd),
+        time=Time(hh=time_hh, mm=time_mm),
+    )
 
 
 def get_class_name(third_line: str) -> str:

+ 4 - 2
parse/geometry.py

@@ -27,7 +27,9 @@ def is_line_at_bottom(
     if not is_vertical_match(line.y, area.y2, tolerance):
         return False
 
-    return has_horizontal_overlap(XLevel(line.x1, line.x2), XLevel(area.x1, area.x2))
+    return has_horizontal_overlap(
+        XLevel(x1=line.x1, x2=line.x2), XLevel(x1=area.x1, x2=area.x2)
+    )
 
 
 def is_area_below(area1: Area, area2: Area, tolerance: float = TOLERANCE) -> bool:
@@ -39,7 +41,7 @@ def is_area_below(area1: Area, area2: Area, tolerance: float = TOLERANCE) -> boo
         return False
 
     return has_horizontal_overlap(
-        XLevel(area1.x1, area1.x2), XLevel(area2.x1, area2.x2)
+        XLevel(x1=area1.x1, x2=area1.x2), XLevel(x1=area2.x1, x2=area2.x2)
     )
 
 

+ 31 - 36
parse/models.py

@@ -1,31 +1,26 @@
 from enum import Enum, unique
-from dataclasses import dataclass
-from typing import Annotated
+from typing import Annotated, Any
 
-from pydantic import BaseModel, PlainSerializer, Field, ConfigDict
+from pydantic import BaseModel, PlainSerializer, Field, ConfigDict, BeforeValidator
 
 
-@dataclass
-class XLevel:
+class XLevel(BaseModel):
     x1: float
     x2: float
 
 
-@dataclass
-class YLevel:
+class YLevel(BaseModel):
     y1: float
     y2: float
 
 
-@dataclass
-class HorizontalLine:
+class HorizontalLine(BaseModel):
     x1: float
     x2: float
     y: float
 
 
-@dataclass
-class Area:
+class Area(BaseModel):
     x1: float
     y1: float
     x2: float
@@ -53,27 +48,24 @@ class SemesterType(Enum):
     FS = "Frühlingssemester"
 
 
-@dataclass(frozen=True)
-class Semester:
+class Semester(BaseModel):
+    model_config = ConfigDict(frozen=True)
     yyyy: int
     semester_type: SemesterType
 
 
-@dataclass
-class Date:
+class Date(BaseModel):
     yyyy: int
     mm: int
     dd: int
 
 
-@dataclass
-class Time:
+class Time(BaseModel):
     hh: int
     mm: int
 
 
-@dataclass
-class ExportTimestamp:
+class ExportTimestamp(BaseModel):
     date: Date
     time: Time
 
@@ -96,22 +88,20 @@ class DegreeProgram(Enum):
     AGNOSTIC = "SG-???"
 
 
-@dataclass
-class PageMetadata:
+class PageMetadata(BaseModel):
     semester: Semester
     export_timestamp: ExportTimestamp
     class_name: str
     degree_program: DegreeProgram
 
 
-@dataclass
-class UnmergedTimeEntries:
+class UnmergedTimeEntries(BaseModel):
     cells: list[Area]
     horizontal_lines: list[HorizontalLine]
 
 
-@dataclass(frozen=True)
-class TimeSlot:
+class TimeSlot(BaseModel):
+    model_config = ConfigDict(frozen=True)
     start_time: str
     end_time: str
 
@@ -124,9 +114,18 @@ class TimeSlot:
         return hours * 3600 + minutes * 60
 
 
-@dataclass
-class RawExtractedModule:
-    weekday: Weekday
+def to_tuple_if_list(v: Any) -> Any:
+    if isinstance(v, list):
+        return tuple(v)
+    return v
+
+
+# needed for pydantic to correctly parse the custom Weekday Enum
+TolerantWeekday = Annotated[Weekday, BeforeValidator(to_tuple_if_list)]
+
+
+class RawExtractedModule(BaseModel):
+    weekday: TolerantWeekday
     start_seconds: int
     end_seconds: int
     text: str
@@ -141,8 +140,7 @@ class TeachingType(Enum):
     BLOCK = "blockmodule"
 
 
-@dataclass
-class RawLecturer:
+class RawLecturer(BaseModel):
     """
     Basic representation of an extracted lecturer from a pdf that needs to be parsed.
     """
@@ -166,8 +164,7 @@ class Lecturer(BaseModel):
 CustomWeekday = Annotated[Weekday, PlainSerializer(lambda v: v.index, return_type=int)]
 
 
-@dataclass
-class ParsedModuleCellTextData:
+class ParsedModuleCellTextData(BaseModel):
     module_shorthand: str
     degree_program: DegreeProgram
     class_name: str
@@ -193,13 +190,11 @@ class ClassJsonModule(BaseModel):
     lecturer_shorthands: list[str] = Field(..., alias="teachers")
 
 
-@dataclass
-class ClassPdfExtractionPageData:
+class ClassPdfExtractionPageData(BaseModel):
     raw_extracted_modules: list[RawExtractedModule]
     page_metadata: PageMetadata
 
 
-@dataclass
-class StartsWithMatch:
+class StartsWithMatch(BaseModel):
     shorthand_found: str
     num_of_matches: int

+ 3 - 1
parse/parse_modules.py

@@ -185,7 +185,9 @@ def matches_startswith(
         if shorthand.startswith(word):
             shorthand_with_start = shorthand
             num_of_startwith_matches += 1
-    return StartsWithMatch(shorthand_with_start, num_of_startwith_matches)
+    return StartsWithMatch(
+        shorthand_found=shorthand_with_start, num_of_matches=num_of_startwith_matches
+    )
 
 
 def get_module_shorthand(

+ 32 - 15
parse/table_extraction.py

@@ -24,7 +24,8 @@ from .geometry import (
 from .img import is_mostly_white_area
 
 allowed_time_slots: list[TimeSlot] = [
-    TimeSlot(*timeslot_tuple) for timeslot_tuple in ALLOWED_TIMESLOTS
+    TimeSlot(start_time=timeslot_tuple[0], end_time=timeslot_tuple[1])
+    for timeslot_tuple in ALLOWED_TIMESLOTS
 ]
 
 
@@ -101,7 +102,9 @@ def get_modules_from_weekday(
                     found_matching_next_cell_index
                 )
                 logging.debug("matched cell area: %s", matched_area)
-                area = Area(area.x1, area.y1, matched_area.x2, matched_area.y2)
+                area = Area(
+                    x1=area.x1, y1=area.y1, x2=matched_area.x2, y2=matched_area.y2
+                )
 
         text = page.crop((area.x1, area.y1, area.x2, area.y2)).extract_text()
         timeslot = get_timeslot_for_area(area, timeslot_y_levels)
@@ -109,7 +112,13 @@ def get_modules_from_weekday(
             raise RuntimeError("Could not match TimeSlot to Cell Area")
         end_seconds = timeslot.end_seconds()
         modules.append(
-            RawExtractedModule(weekday, start_seconds, end_seconds, text, page_number)
+            RawExtractedModule(
+                weekday=weekday,
+                start_seconds=start_seconds,
+                end_seconds=end_seconds,
+                text=text,
+                source_page_number=page_number,
+            )
         )
     return modules
 
@@ -128,7 +137,7 @@ def extract_data_from_class_pdf(
             weekday_areas: dict[Weekday, Area] = {}
             timeslot_y_levels: dict[TimeSlot, YLevel] = {}
             for day in Weekday:
-                weekday_areas[day] = Area(0, 0, 0, 0)
+                weekday_areas[day] = Area(x1=0, y1=0, x2=0, y2=0)
 
             found_tables = page.find_tables(CLASS_PDF_TABLE_SETTINGS)
             logging.debug(
@@ -179,7 +188,7 @@ def extract_data_from_class_pdf(
                             if weekday_enum:
                                 logging.debug("Weekday %s found", cell_text)
                                 weekday_areas[weekday_enum] = Area(
-                                    cell[0], cell[3], cell[2], 0
+                                    x1=cell[0], y1=cell[3], x2=cell[2], y2=0
                                 )
                 else:
                     logging.debug("row: %d, col: %d", row_index, 0)
@@ -200,22 +209,26 @@ def extract_data_from_class_pdf(
                             )
                         else:
                             # assumes this is the last timeslot ever
-                            if target_timeslot == TimeSlot("20:30", "21:15"):
+                            if target_timeslot == TimeSlot(
+                                start_time="20:30", end_time="21:15"
+                            ):
                                 for weekday in Weekday:
                                     new_area = Area(
-                                        weekday_areas[weekday].x1,
-                                        weekday_areas[weekday].y1,
-                                        weekday_areas[weekday].x2,
-                                        cell[3],
+                                        x1=weekday_areas[weekday].x1,
+                                        y1=weekday_areas[weekday].y1,
+                                        x2=weekday_areas[weekday].x2,
+                                        y2=cell[3],
                                     )
                                     weekday_areas[weekday] = new_area
                             timeslot_y_levels[target_timeslot] = YLevel(
-                                cell[1], cell[3]
+                                y1=cell[1], y2=cell[3]
                             )
                             expected_timeslot_index += 1
 
             for weekday in Weekday:
-                unmerged_time_entries_by_weekday[weekday] = UnmergedTimeEntries([], [])
+                unmerged_time_entries_by_weekday[weekday] = UnmergedTimeEntries(
+                    cells=[], horizontal_lines=[]
+                )
                 target_area = weekday_areas[weekday]
                 logging.debug("target_area: %s", target_area)
 
@@ -234,7 +247,9 @@ def extract_data_from_class_pdf(
                         ):
                             cell_dimensions = cell[0], cell[1], cell[2], cell[3]
                             unmerged_time_entries_by_weekday[weekday].cells.append(
-                                Area(*cell_dimensions)
+                                Area(
+                                    x1=cell[0], y1=cell[1], x2=cell[2], y2=cell[3]
+                                )
                             )
                             logging.debug("%s cell found", weekday)
 
@@ -254,7 +269,7 @@ def extract_data_from_class_pdf(
                         unmerged_time_entries_by_weekday[
                             weekday
                         ].horizontal_lines.append(
-                            HorizontalLine(line_x1, line_x2, line_bottom)
+                            HorizontalLine(x1=line_x1, x2=line_x2, y=line_bottom)
                         )
 
             all_modules: list[RawExtractedModule] = []
@@ -273,7 +288,9 @@ def extract_data_from_class_pdf(
             )
             previous_page_metadata.append(page_metadata)
             extraction_data.append(
-                ClassPdfExtractionPageData(all_modules, page_metadata)
+                ClassPdfExtractionPageData(
+                    raw_extracted_modules=all_modules, page_metadata=page_metadata
+                )
             )
         return extraction_data
 

+ 15 - 9
parse_class_pdf.py

@@ -2,8 +2,8 @@
 
 import logging
 from argparse import ArgumentParser
-import pickle
 import json
+from pydantic import TypeAdapter
 
 from parse import (
     extract_data_from_class_pdf,
@@ -24,7 +24,7 @@ def get_valid_lecturers(file_path: str) -> list[str]:
     """
     valid_lecturers: list[str] = []
     try:
-        print(f"READING: '{file_path}'")
+        logging.warning("reading lecturers file: '%s'", file_path)
         with open(file_path, "r", encoding="utf-8") as f:
             data = json.load(f)
             if isinstance(data, list):
@@ -55,19 +55,19 @@ def main() -> None:
     )
     parser.add_argument(
         "--save-intermediate",
-        help="Path to save the intermediate extraction data (pickle format) and exit",
+        help="Path to save the intermediate extraction data (JSON format) and exit",
         default=None,
     )
     parser.add_argument(
         "--load-intermediate",
-        help="Path to load the intermediate extraction data from (pickle format) and skip extraction",
+        help="Path to load the intermediate extraction data from (JSON format) and skip extraction",
         default=None,
     )
 
     args = parser.parse_args()
     lecturers_file = args.lecturers
 
-    logging.basicConfig(level=logging.DEBUG)
+    logging.basicConfig(level=logging.INFO)
 
     valid_lecturer_shorthands: list[str] | None = None
     if lecturers_file:
@@ -77,14 +77,20 @@ def main() -> None:
 
     if args.load_intermediate:
         logging.info("Loading intermediate data from %s", args.load_intermediate)
-        with open(args.load_intermediate, "rb") as f:
-            extraction_data = pickle.load(f)
+        with open(args.load_intermediate, "r", encoding="utf-8") as f:
+            extraction_data = TypeAdapter(
+                list[ClassPdfExtractionPageData]
+            ).validate_json(f.read())
     else:
         extraction_data = extract_data_from_class_pdf(args.input)
         if args.save_intermediate:
             logging.info("Saving intermediate data to %s", args.save_intermediate)
-            with open(args.save_intermediate, "wb") as f:
-                pickle.dump(extraction_data, f)
+            with open(args.save_intermediate, "w", encoding="utf-8") as f:
+                f.write(
+                    TypeAdapter(list[ClassPdfExtractionPageData])
+                    .dump_json(extraction_data)
+                    .decode("utf-8")
+                )
             return
 
     parsed_modules: list[ClassJsonModule] = [