3 ヶ月前 · caf95f95b2
--- a/generate_classes_json.py
+++ b/generate_classes_json.py
@@ -80,7 +80,7 @@ def main() -> None:
 
				     args = parser.parse_args()
			
 
				     lecturers_file = args.lecturers
			
 
				 
			
 
				-    logging.basicConfig(level=logging.DEBUG)
			
 
				+    logging.basicConfig(level=logging.INFO)
			
 
				 
			
 
				     valid_lecturer_shorthands: list[str] | None = None
			
 
				     if lecturers_file:
			
--- a/parse/models.py
+++ b/parse/models.py
@@ -1,7 +1,10 @@
 
				 from enum import Enum, unique
			
 
				 from typing import Annotated, Any
			
 
				+from dataclasses import dataclass
			
 
				 
			
 
				 from pydantic import BaseModel, PlainSerializer, Field, ConfigDict, BeforeValidator
			
 
				+from pdfplumber.page import Page
			
 
				+from pdfplumber.table import Table
			
 
				 
			
 
				 
			
 
				 class XLevel(BaseModel):
			
@@ -194,6 +197,7 @@ class ClassPdfExtractionPageData(BaseModel):
 
				     raw_extracted_modules: list[RawExtractedModule]
			
 
				     page_metadata: PageMetadata
			
 
				 
			
 
				+
			
 
				 class RawClassPdfExtractionPageData(BaseModel):
			
 
				     raw_extracted_modules: list[RawExtractedModule]
			
 
				     above_table_text: str
			
@@ -202,3 +206,13 @@ class RawClassPdfExtractionPageData(BaseModel):
 
# Record that pairs a shorthand string with a match count.
# NOTE(review): the name suggests this is the result of a starts-with (prefix)
# lookup -- confirm against the code that constructs it.
class StartsWithMatch(BaseModel):
    # the shorthand string this record refers to
    shorthand_found: str
    # number of matches counted for that shorthand
    num_of_matches: int
			
 
				+
			
 
				+
			
 
@dataclass
class TimeSlotYLevelsCollectionData:
    """
    Bundles the per-row inputs that are needed while collecting the `YLevel`
    of a single timetable row.

    `weekday_areas` is the caller's dict and is updated in place by the code
    that consumes this object.
    """

    # index of the processed row inside `table.rows`
    row_index: int
    # pdf page the table belongs to; the cell text is cropped from it
    page: Page
    # table whose rows are being processed
    table: Table
    # index into the allowed time slots that the row is expected to match
    expected_timeslot_index: int
    # time slot whose row determines the bottom edge (`y2`) of the weekday areas
    last_timeslot: TimeSlot
    # column area per weekday; `y2` gets set once `last_timeslot` is matched
    weekday_areas: dict[Weekday, Area]
			
--- a/parse/parse_modules.py
+++ b/parse/parse_modules.py
@@ -268,4 +268,3 @@ def get_classes(extraction_data: list[ClassPdfExtractionPageData]) -> list[str]:
 
				     Get the classes from the class page's metadata.
			
 
				     """
			
 
				     return [page_data.page_metadata.class_name for page_data in extraction_data]
			
 
				-
			
--- a/parse/table_extraction.py
+++ b/parse/table_extraction.py
@@ -22,6 +22,7 @@ from .models import (
 
				     ClassPdfExtractionPageData,
			
 
				     RawClassPdfExtractionPageData,
			
 
				     PageMetadata,
			
 
				+    TimeSlotYLevelsCollectionData,
			
 
				 )
			
 
				 from .above_table_text import parse_above_table_text
			
 
				 from .geometry import (
			
@@ -38,6 +39,16 @@ allowed_time_slots: list[TimeSlot] = [
 
				 ]
			
 
				 
			
 
				 
			
 
				+def find_next_cell_below_index(current_area: Area, cells: list[Area]) -> int:
			
 
				+    """
			
 
				+    Returns the index of the first cell directly below current_area, or -1 if none.
			
 
				+    """
			
 
				+    for index, cell in enumerate(cells):
			
 
				+        if is_area_below(cell, current_area):
			
 
				+            return index
			
 
				+    return -1
			
 
				+
			
 
				+
			
 
				 def get_weekday_from_text(text: str) -> Weekday | None:
			
 
				     """
			
 
				     Helper function that tries to get a Weekday from a string.
			
@@ -49,6 +60,56 @@ def get_weekday_from_text(text: str) -> Weekday | None:
 
				     return None
			
 
				 
			
 
				 
			
 
				+def merge_vertically_spanning_cells(
			
 
				+    initial_area: Area,
			
 
				+    remaining_cells: list[Area],
			
 
				+    horizontal_lines: list[HorizontalLine],
			
 
				+    highest_y: float,
			
 
				+    weekday: Weekday,
			
 
				+) -> Area:
			
 
				+    """
			
 
				+    Merges vertically adjacent cells until a bottom boundary (line or page end) is found.
			
 
				+    Mutates remaining_cells by removing used cells.
			
 
				+    Returns the final merged area.
			
 
				+    """
			
 
				+    current_area = initial_area
			
 
				+
			
 
				+    while True:
			
 
				+        logging.debug(
			
 
				+            "Searching for bottom boundary of area: %s on %s", current_area, weekday
			
 
				+        )
			
 
				+
			
 
				+        # case 1: horizontal line at the bottom of current area?
			
 
				+        if any(
			
 
				+            is_line_at_bottom(current_area, line, tolerance=20)
			
 
				+            for line in horizontal_lines
			
 
				+        ):
			
 
				+            logging.debug("Bottom boundary found: horizontal line")
			
 
				+            return current_area
			
 
				+
			
 
				+        # case 2: reached the bottom of the timetable?
			
 
				+        if is_vertical_match(current_area.y2, highest_y):
			
 
				+            logging.debug("Bottom boundary found: highest y level")
			
 
				+            return current_area
			
 
				+
			
 
				+        # case 3: find and merge with the next cell below
			
 
				+        next_cell_index = find_next_cell_below_index(current_area, remaining_cells)
			
 
				+        if next_cell_index == -1:
			
 
				+            raise RuntimeError(
			
 
				+                f"No bottom boundary or next cell found for module on {weekday}"
			
 
				+            )
			
 
				+
			
 
				+        next_cell = remaining_cells.pop(next_cell_index)
			
 
				+        logging.debug("Vertically merging with cell below: %s", next_cell)
			
 
				+
			
 
				+        current_area = Area(
			
 
				+            x1=current_area.x1,
			
 
				+            y1=current_area.y1,
			
 
				+            x2=next_cell.x2,  # use the wider x2 in case of a slight misalignment
			
 
				+            y2=next_cell.y2,
			
 
				+        )
			
 
				+
			
 
				+
			
 
				 def get_modules_from_weekday(
			
 
				     weekday: Weekday,
			
 
				     unmerged_time_entries: UnmergedTimeEntries,
			
@@ -57,71 +118,71 @@ def get_modules_from_weekday(
 
				     page_number: int,
			
 
				 ) -> list[RawExtractedModule]:
			
 
				     """
			
 
				-    Extracts the modules (raw text and start/end) of a weekday on a single pdf page
			
 
				+    Extracts the modules (raw text and start/end) of a weekday on a single pdf page.
			
 
				     """
			
 
				-    highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
			
 
				-    modules = []
			
 
				-    while len(unmerged_time_entries.cells) > 0:
			
 
				-        area = unmerged_time_entries.cells.pop(0)
			
 
				-        if is_mostly_white_area(page, area):
			
 
				+    cells = unmerged_time_entries.cells[:]
			
 
				+    horizontal_lines = unmerged_time_entries.horizontal_lines
			
 
				+
			
 
				+    highest_y: float = get_highest_y_level(timeslot_y_levels, page_number)
			
 
				+    modules: list[RawExtractedModule] = []
			
 
				+    while cells:
			
 
				+        initial_area = cells.pop(0)
			
 
				+
			
 
				+        if is_mostly_white_area(page, initial_area):
			
 
				             logging.debug("mostly white cell skipped")
			
 
				             continue
			
 
				-        timeslot = get_timeslot_for_area(area, timeslot_y_levels)
			
 
				-        if timeslot is None:
			
 
				-            raise RuntimeError("Could not match TimeSlot to Cell Area")
			
 
				-        start_seconds = timeslot.start_seconds()
			
 
				-        line_at_bottom_found = False
			
 
				-        while not line_at_bottom_found:
			
 
				-            logging.debug("searching for line at bottom of: %s", area)
			
 
				-            logging.debug("line candidates:")
			
 
				-            for line in unmerged_time_entries.horizontal_lines:
			
 
				-                logging.debug("testing horizontal line: %s", line)
			
 
				-                if is_line_at_bottom(area, line, tolerance=20):
			
 
				-                    line_at_bottom_found = True
			
 
				-                    logging.debug("candidate line found")
			
 
				-                    break
			
 
				-            else:
			
 
				-
			
 
				-                if is_vertical_match(area.y2, highest_y_level):
			
 
				-                    logging.debug("highest y level matched")
			
 
				-                    break
			
 
				-                found_matching_next_cell_index = -1
			
 
				-                for index, potential_cell_below in enumerate(
			
 
				-                    unmerged_time_entries.cells
			
 
				-                ):
			
 
				-                    if is_area_below(potential_cell_below, area):
			
 
				-                        found_matching_next_cell_index = index
			
 
				-                        break
			
 
				-                else:
			
 
				-                    raise RuntimeError(
			
 
				-                        f"No matching cell below found to merge with on {weekday}"
			
 
				-                    )
			
 
				-                logging.debug("vertically merging cells for %s", weekday)
			
 
				-                matched_area = unmerged_time_entries.cells.pop(
			
 
				-                    found_matching_next_cell_index
			
 
				-                )
			
 
				-                logging.debug("matched cell area: %s", matched_area)
			
 
				-                area = Area(
			
 
				-                    x1=area.x1, y1=area.y1, x2=matched_area.x2, y2=matched_area.y2
			
 
				-                )
			
 
				 
			
 
				-        text = page.crop((area.x1, area.y1, area.x2, area.y2)).extract_text()
			
 
				-        timeslot = get_timeslot_for_area(area, timeslot_y_levels)
			
 
				-        if timeslot is None:
			
 
				-            raise RuntimeError("Could not match TimeSlot to Cell Area")
			
 
				-        end_seconds = timeslot.end_seconds()
			
 
				+        merged_area: Area = merge_vertically_spanning_cells(
			
 
				+            initial_area, cells, horizontal_lines, highest_y, weekday
			
 
				+        )
			
 
				+
			
 
				+        start_timeslot = get_timeslot_for_area(initial_area, timeslot_y_levels)
			
 
				+        if start_timeslot is None:
			
 
				+            raise RuntimeError(
			
 
				+                f"Could not determine start timeslot for module on {weekday}"
			
 
				+            )
			
 
				+
			
 
				+        end_timeslot = get_timeslot_for_area(merged_area, timeslot_y_levels)
			
 
				+        if end_timeslot is None:
			
 
				+            raise RuntimeError(
			
 
				+                f"Could not determine end timeslot for merged module on {weekday}"
			
 
				+            )
			
 
				+
			
 
				+        text: str = (
			
 
				+            page.crop(
			
 
				+                (merged_area.x1, merged_area.y1, merged_area.x2, merged_area.y2)
			
 
				+            ).extract_text()
			
 
				+            or ""  # do not raise error when extraction returns None for now
			
 
				+        )
			
 
				+
			
 
				         modules.append(
			
 
				             RawExtractedModule(
			
 
				                 weekday=weekday,
			
 
				-                start_seconds=start_seconds,
			
 
				-                end_seconds=end_seconds,
			
 
				+                start_seconds=start_timeslot.start_seconds(),
			
 
				+                end_seconds=end_timeslot.end_seconds(),
			
 
				                 text=text,
			
 
				                 source_page_number=page_number,
			
 
				             )
			
 
				         )
			
 
				+
			
 
				     return modules
			
 
				 
			
 
				 
			
 
				+def get_highest_y_level(timeslot_y_levels, page_number) -> float:
			
 
				+    """
			
 
				+    Gets the highest `YLevel` of all `TimeSlot`'s.
			
 
				+
			
 
				+    Raises:
			
 
				+        RuntimeError: If no the highest allowed `TimeSlot` was not mapped to a `YLevel`
			
 
				+    """
			
 
				+    try:
			
 
				+        highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
			
 
				+    except KeyError as e:
			
 
				+        logging.debug("timeslot_y_levels on page %d %s", page_number, timeslot_y_levels)
			
 
				+        raise RuntimeError("Could not get YLevel for latest TimeSlot") from e
			
 
				+    return highest_y_level
			
 
				+
			
 
				+
			
 
				 def get_usable_table_index(found_tables: list) -> int:
			
 
				     """
			
 
				     Identifies the index of the timetable on the page based on dimensions.
			
@@ -170,73 +231,16 @@ def process_page(
 
				     """
			
 
				     with pdfplumber.open(input_filename) as pdf:
			
 
				         page = pdf.pages[page_index]
			
 
				-        weekday_areas: dict[Weekday, Area] = {}
			
 
				         timeslot_y_levels: dict[TimeSlot, YLevel] = {}
			
 
				         unmerged_time_entries_by_weekday: dict[Weekday, UnmergedTimeEntries] = {}
			
 
				-
			
 
				-        for day in Weekday:
			
 
				-            weekday_areas[day] = Area(x1=0, y1=0, x2=0, y2=0)
			
 
				+        weekday_areas: dict[Weekday, Area] = init_weekday_areas()
			
 
				 
			
 
				         table: Table = select_main_table(page, page_index)
			
 
				-        text_above_table = get_above_table_text(page, table_y1=table.bbox[1])
			
 
				-
			
 
				-        empty_start_found = False
			
 
				+        text_above_table: str = get_above_table_text(page, table_y1=table.bbox[1])
			
 
				 
			
 
				-        # get weekday and timeslot areas
			
 
				-        expected_timeslot_index = 0
			
 
				-        for row_index, row in enumerate(table.rows):
			
 
				-            if row_index == 0:
			
 
				-                for column_index, cell in enumerate(row.cells):
			
 
				-                    logging.debug("row: %d, col: %d", row_index, column_index)
			
 
				-                    logging.debug(cell)
			
 
				-                    if cell is None:
			
 
				-                        logging.debug("None Table Cell Found")
			
 
				-                    else:
			
 
				-                        cell_text = page.crop(
			
 
				-                            (cell[0], cell[1], cell[2], cell[3])
			
 
				-                        ).extract_text()
			
 
				-                        if not empty_start_found and len(cell_text) == 0:
			
 
				-                            logging.debug("empty start found")
			
 
				-                            empty_start_found = True
			
 
				-
			
 
				-                        weekday_enum = get_weekday_from_text(cell_text)
			
 
				-                        if weekday_enum:
			
 
				-                            logging.debug("Weekday %s found", cell_text)
			
 
				-                            weekday_areas[weekday_enum] = Area(
			
 
				-                                x1=cell[0], y1=cell[3], x2=cell[2], y2=0
			
 
				-                            )
			
 
				-            else:
			
 
				-                logging.debug("row: %d, col: %d", row_index, 0)
			
 
				-                cell = row.cells[0]
			
 
				-                if cell is None:
			
 
				-                    logging.warning("Unexpected None Table Cell Found")
			
 
				-                else:
			
 
				-                    cell_text = page.crop(
			
 
				-                        (cell[0], cell[1], cell[2], cell[3])
			
 
				-                    ).extract_text()
			
 
				-                    target_timeslot = allowed_time_slots[expected_timeslot_index]
			
 
				-                    if not (
			
 
				-                        target_timeslot.start_time in cell_text
			
 
				-                        and target_timeslot.end_time in cell_text
			
 
				-                    ):
			
 
				-                        logging.warning("Unexpected Timeslot found: '%s'", cell_text)
			
 
				-                    else:
			
 
				-                        # assumes this is the last timeslot ever
			
 
				-                        if target_timeslot == TimeSlot(
			
 
				-                            start_time="20:30", end_time="21:15"
			
 
				-                        ):
			
 
				-                            for weekday in Weekday:
			
 
				-                                new_area = Area(
			
 
				-                                    x1=weekday_areas[weekday].x1,
			
 
				-                                    y1=weekday_areas[weekday].y1,
			
 
				-                                    x2=weekday_areas[weekday].x2,
			
 
				-                                    y2=cell[3],
			
 
				-                                )
			
 
				-                                weekday_areas[weekday] = new_area
			
 
				-                        timeslot_y_levels[target_timeslot] = YLevel(
			
 
				-                            y1=cell[1], y2=cell[3]
			
 
				-                        )
			
 
				-                        expected_timeslot_index += 1
			
 
				+        collect_weekday_areas_and_timeslot_y_levels(
			
 
				+            weekday_areas, timeslot_y_levels, page, table
			
 
				+        )
			
 
				 
			
 
				         collected_unmerged_time_entries_by_weekday(
			
 
				             unmerged_time_entries_by_weekday, weekday_areas, table, page
			
@@ -257,6 +261,122 @@ def process_page(
 
				             raw_extracted_modules=all_modules, above_table_text=text_above_table
			
 
				         )
			
 
				 
			
 
				+
			
 
				+def collect_weekday_areas_and_timeslot_y_levels(
			
 
				+    weekday_areas: dict[Weekday, Area],
			
 
				+    timeslot_y_levels: dict[TimeSlot, YLevel],
			
 
				+    page: Page,
			
 
				+    table: Table,
			
 
				+) -> None:
			
 
				+    """
			
 
				+    Populates the passed weekday_areas and timeslot_y_levels dicts with the right
			
 
				+    `Area`'s by `Weekday` and `YLevel` by TimeSlot respectively, via side effects.
			
 
				+    """
			
 
				+    expected_timeslot_index = 0
			
 
				+    for row_index, row in enumerate(table.rows):
			
 
				+        if row_index == 0:
			
 
				+            collect_weekday_areas(weekday_areas, page, row, row_index)
			
 
				+        else:
			
 
				+            expected_timeslot_index: int = collect_timeslot_y_levels_of_row(
			
 
				+                timeslot_y_levels,
			
 
				+                TimeSlotYLevelsCollectionData(
			
 
				+                    row_index=row_index,
			
 
				+                    expected_timeslot_index=expected_timeslot_index,
			
 
				+                    last_timeslot=get_last_timeslot(allowed_time_slots),
			
 
				+                    page=page,
			
 
				+                    table=table,
			
 
				+                    weekday_areas=weekday_areas,
			
 
				+                ),
			
 
				+            )
			
 
				+
			
 
				+
			
 
				+def collect_timeslot_y_levels_of_row(
			
 
				+    timeslot_y_levels: dict[TimeSlot, YLevel],
			
 
				+    collection_data: TimeSlotYLevelsCollectionData,
			
 
				+) -> int:
			
 
				+    """
			
 
				+    Populates the passed and timeslot_y_levels dicts with the right
			
 
				+    `YLevel`'s by `TimeSlot` via side effects.
			
 
				+
			
 
				+    Returns:
			
 
				+        int for the current expected `TimeSlot` index
			
 
				+    """
			
 
				+    logging.debug("row: %d, col: %d", collection_data.row_index, 0)
			
 
				+    row = collection_data.table.rows[collection_data.row_index]
			
 
				+    cell = row.cells[0]
			
 
				+    if cell is None:
			
 
				+        logging.warning("None Table cell found, not collecting YLevel of Row")
			
 
				+        return collection_data.expected_timeslot_index
			
 
				+    cell_text = collection_data.page.crop(
			
 
				+        (cell[0], cell[1], cell[2], cell[3])
			
 
				+    ).extract_text()
			
 
				+    target_timeslot = allowed_time_slots[collection_data.expected_timeslot_index]
			
 
				+    if not (
			
 
				+        target_timeslot.start_time in cell_text
			
 
				+        and target_timeslot.end_time in cell_text
			
 
				+    ):
			
 
				+        logging.warning("Unexpected TimeSlot found: '%s'", cell_text)
			
 
				+        return collection_data.expected_timeslot_index
			
 
				+    if target_timeslot == collection_data.last_timeslot:
			
 
				+        for weekday in Weekday:
			
 
				+            new_area = Area(
			
 
				+                x1=collection_data.weekday_areas[weekday].x1,
			
 
				+                y1=collection_data.weekday_areas[weekday].y1,
			
 
				+                x2=collection_data.weekday_areas[weekday].x2,
			
 
				+                y2=cell[3],
			
 
				+            )
			
 
				+            collection_data.weekday_areas[weekday] = new_area
			
 
				+    timeslot_y_levels[target_timeslot] = YLevel(y1=cell[1], y2=cell[3])
			
 
				+    return collection_data.expected_timeslot_index + 1
			
 
				+
			
 
				+
			
 
				+def collect_weekday_areas(weekday_areas, page, row, row_index) -> None:
			
 
				+    """
			
 
				+    Populates the passed weekday_areas dict with the right
			
 
				+    `Area`'s by `Weekday` via side effects.
			
 
				+    """
			
 
				+    empty_start_found = False
			
 
				+    for column_index, cell in enumerate(row.cells):
			
 
				+        logging.debug("row: %d, col: %d", row_index, column_index)
			
 
				+        logging.debug(cell)
			
 
				+        if cell is None:
			
 
				+            logging.debug("None Table Cell Found")
			
 
				+        else:
			
 
				+            cell_text = page.crop((cell[0], cell[1], cell[2], cell[3])).extract_text()
			
 
				+            if not empty_start_found and len(cell_text) == 0:
			
 
				+                logging.debug("empty start found")
			
 
				+                empty_start_found = True
			
 
				+
			
 
				+            weekday_enum: Weekday | None = get_weekday_from_text(cell_text)
			
 
				+            if weekday_enum:
			
 
				+                logging.debug("Weekday %s found", cell_text)
			
 
				+                weekday_areas[weekday_enum] = Area(
			
 
				+                    x1=cell[0], y1=cell[3], x2=cell[2], y2=0
			
 
				+                )
			
 
				+
			
 
				+
			
 
				+def get_last_timeslot(time_slots: list[TimeSlot]) -> TimeSlot:
			
 
				+    """
			
 
				+    Get the last timeslot a weekday can have.
			
 
				+    """
			
 
				+    if len(time_slots) == 0:
			
 
				+        raise RuntimeError("Cannot get the latest timeslot from an empty list")
			
 
				+    last_timeslot = time_slots[-1]
			
 
				+    logging.debug("last timeslot found: %s", last_timeslot)
			
 
				+
			
 
				+    return last_timeslot
			
 
				+
			
 
				+
			
 
				+def init_weekday_areas() -> dict[Weekday, Area]:
			
 
				+    """
			
 
				+    Initializes the weekday areas with zero-valued `Area`'s for each `Weekday`
			
 
				+    """
			
 
				+    weekday_areas: dict[Weekday, Area] = {}
			
 
				+    for day in Weekday:
			
 
				+        weekday_areas[day] = Area(x1=0, y1=0, x2=0, y2=0)
			
 
				+    return weekday_areas
			
 
				+
			
 
				+
			
 
				 def select_main_table(page: Page, page_index: int) -> Table:
			
 
				     """
			
 
				     Selects the main table on the PDF Page. This should be the timetable.
			
@@ -279,7 +399,7 @@ def collected_unmerged_time_entries_by_weekday(
 
				 ) -> None:
			
 
				     """
			
 
				     Populates the passed unmerged_time_entries_by_weekday dict with the
			
 
				-    `UnmergedTimeEntries` by `Weekday`.
			
 
				+    `UnmergedTimeEntries` by `Weekday` via side effects.
			
 
				     """
			
 
				     for weekday in Weekday:
			
 
				         unmerged_time_entries_by_weekday[weekday] = UnmergedTimeEntries(
			
@@ -319,8 +439,8 @@ def collect_horizontal_lines(
 
				 ) -> None:
			
 
				     """
			
 
				     Populates the passed unmerged_time_entries_by_weekday dict with the
			
 
				-    `horizontal_lines` of the `UnmergedTimeEntries` by the passed weekday.
			
 
				-    These horizontal Lines are timeslot seperator lines.
			
 
				+    `horizontal_lines` of the `UnmergedTimeEntries` by the passed weekday
			
 
				+    via side effects. These horizontal Lines are timeslot seperator lines.
			
 
				     """
			
 
				     for line_found in page.lines:
			
 
				         line_x1 = line_found["x0"]