3 hónapja · d627c392aa
--- a/README.md
+++ b/README.md
@@ -34,30 +34,22 @@ This project uses specific domain terms that map to the data sources and the gen
 
				 
			
 
				 This section is split into the different scripts provided by this repository.
			
 
				 
			
 
				-### parse_class_pdf.py
			
 
				+### generate_classes_json.py
			
 
				 
			
 
				 Execute the following to parse a class timetable PDF into the `classes.json` file needed by the frontend.
			
 
				 
			
 
				 ```sh
			
 
				-./parse_class_pdf.py [-h] [-l LECTURERS] [-i INPUT] [-o OUTPUT] [--save-intermediate SAVE_INTERMEDIATE] [--load-intermediate LOAD_INTERMEDIATE]
			
 
				+./generate_classes_json.py -i klassen.pdf -o classes.json
			
 
				 ```
			
 
				 
			
 
				-#### Arguments
			
 
				-
			
 
				-- `-i`, `--input`: Path to the input PDF file. Defaults to `klassen.pdf`.
			
 
				-- `-o`, `--output`: Path to the output JSON file. Defaults to `classes.json`.
			
 
				-- `-l`, `--lecturers`: Path to the `lecturers.json` file. If provided, it is used to validate lecturer shorthands during parsing.
			
 
				-- `--save-intermediate`: Path to save the intermediate extraction data (JSON format) and exit. Useful for skipping the slow extraction stage in subsequent runs.
			
 
				-- `--load-intermediate`: Path to load the intermediate extraction data from (JSON format) and skip extraction.
			
 
				-
			
 
				-The default values for input and output files are defined in `config/constants.py`.
			
 
				+For more information, list the available CLI arguments via `./generate_classes_json.py -h`.
			
 
				 
			
 
				 #### Faster Development Cycle
			
 
				 
			
 
				 Since the PDF extraction takes a significant amount of time, you can split the process into two stages:
			
 
				 
			
 
				-1.  **Stage 1 (Extraction):** Run once and save the result: `./parse_class_pdf.py --save-intermediate data.pkl`
			
 
				-2.  **Stage 2 (Parsing):** Load the saved data and iterate on the parsing logic: `./parse_class_pdf.py --load-intermediate data.pkl --output classes.json`
			
 
				+1.  **Stage 1 (Extraction):** Run once and save the result: `./generate_classes_json.py --save-intermediate data.json`
			
 
				+2.  **Stage 2 (Parsing):** Load the saved data and iterate on the parsing logic: `./generate_classes_json.py --load-intermediate data.json --output classes.json`
			
 
				 
			
 
				 ### extract_lecturer_shorthands_pdf.py
			
 
				 
			
--- a/config/constants.py
+++ b/config/constants.py
@@ -1,13 +1,14 @@
 
				-CLASS_PDF_INPUT_FILE = "klassen.pdf"
			
 
				-CLASSES_JSON_OUTPUT_FILE = "classes.json"
			
 
				-CLASS_PDF_TABLE_SETTINGS = {
			
 
				+CLASS_TIMETABLE_PDF_INPUT_FILE = "klassen.pdf"
			
 
				+CLASS_TIMETABLE_PDF_TABLE_SETTINGS = {
			
 
				     "vertical_strategy": "lines",
			
 
				     "horizontal_strategy": "lines",
			
 
				     "snap_tolerance": 3,
			
 
				     "join_tolerance": 3,
			
 
				     "edge_min_length": 3,
			
 
				 }
			
 
				-CLASS_PDF_MIN_DIMENSIONS = 150
			
 
				+CLASS_TIMETABLE_PDF_MIN_DIMENSIONS = 150
			
 
				+
			
 
				+CLASSES_JSON_OUTPUT_FILE = "classes.json"
			
 
				 
			
 
				 TOLERANCE = 3
			
 
				 REQUESTS_TIMEOUT = 4
			
--- a/generate_classes_json.py
+++ b/generate_classes_json.py
@@ -15,7 +15,7 @@ from parse import (
 
				     ClassJsonModule,
			
 
				 )
			
 
				 
			
 
				-from config import CLASS_PDF_INPUT_FILE, CLASSES_JSON_OUTPUT_FILE
			
 
				+from config import CLASS_TIMETABLE_PDF_INPUT_FILE, CLASSES_JSON_OUTPUT_FILE
			
 
				 
			
 
				 
			
 
				 def get_valid_lecturers(file_path: str) -> list[str]:
			
@@ -42,13 +42,16 @@ def get_valid_lecturers(file_path: str) -> list[str]:
 
				 def main() -> None:
			
 
				     parser = ArgumentParser(description="Parse Class Timetable PDF to JSON.")
			
 
				     parser.add_argument(
			
 
				-        "-l", "--lecturers", help="Path to the lecturers.json file", default=None
			
 
				+        "-l",
			
 
				+        "--lecturers",
			
 
				+        help="Path to the lecturers.json file (Optional)",
			
 
				+        default=None,
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         "-i",
			
 
				         "--input",
			
 
				         help="Path to the input Class Timetable PDF file",
			
 
				-        default=CLASS_PDF_INPUT_FILE,
			
 
				+        default=CLASS_TIMETABLE_PDF_INPUT_FILE,
			
 
				     )
			
 
				     parser.add_argument(
			
 
				         "-o",
			
@@ -66,11 +69,18 @@ def main() -> None:
 
				         help="Path to load the intermediate extraction data from (JSON format) and skip extraction",
			
 
				         default=None,
			
 
				     )
			
 
				+    parser.add_argument(
			
 
				+        "-j",
			
 
				+        "--jobs",
			
 
				+        help="Number of parallel jobs to use for extraction (default: 1)",
			
 
				+        type=int,
			
 
				+        default=1,
			
 
				+    )
			
 
				 
			
 
				     args = parser.parse_args()
			
 
				     lecturers_file = args.lecturers
			
 
				 
			
 
				-    logging.basicConfig(level=logging.INFO)
			
 
				+    logging.basicConfig(level=logging.DEBUG)
			
 
				 
			
 
				     valid_lecturer_shorthands: list[str] | None = None
			
 
				     if lecturers_file:
			
@@ -85,7 +95,7 @@ def main() -> None:
 
				                 list[ClassPdfExtractionPageData]
			
 
				             ).validate_json(f.read())
			
 
				     else:
			
 
				-        extraction_data = extract_data_from_class_pdf(args.input)
			
 
				+        extraction_data = extract_data_from_class_pdf(args.input, num_of_jobs=args.jobs)
			
 
				         if args.save_intermediate:
			
 
				             logging.info("Saving intermediate data to %s", args.save_intermediate)
			
 
				             with open(args.save_intermediate, "w", encoding="utf-8") as f:
			
--- a/parse/models.py
+++ b/parse/models.py
@@ -194,6 +194,10 @@ class ClassPdfExtractionPageData(BaseModel):
 
				     raw_extracted_modules: list[RawExtractedModule]
			
 
				     page_metadata: PageMetadata
			
 
				 
			
 
				+class RawClassPdfExtractionPageData(BaseModel):
			
 
				+    raw_extracted_modules: list[RawExtractedModule]
			
 
				+    above_table_text: str
			
 
				+
			
 
				 
			
 
				 class StartsWithMatch(BaseModel):
			
 
				     shorthand_found: str
			
--- a/parse/table_extraction.py
+++ b/parse/table_extraction.py
@@ -1,8 +1,16 @@
 
				 import logging
			
 
				+from multiprocessing import Pool
			
 
				+from pathlib import Path
			
 
				+
			
 
				 from pdfplumber.page import Page
			
 
				+from pdfplumber.table import Table
			
 
				 import pdfplumber
			
 
				 
			
 
				-from config import CLASS_PDF_TABLE_SETTINGS, ALLOWED_TIMESLOTS, CLASS_PDF_MIN_DIMENSIONS
			
 
				+from config import (
			
 
				+    CLASS_TIMETABLE_PDF_TABLE_SETTINGS,
			
 
				+    ALLOWED_TIMESLOTS,
			
 
				+    CLASS_TIMETABLE_PDF_MIN_DIMENSIONS,
			
 
				+)
			
 
				 from .models import (
			
 
				     Weekday,
			
 
				     TimeSlot,
			
@@ -12,6 +20,7 @@ from .models import (
 
				     Area,
			
 
				     HorizontalLine,
			
 
				     ClassPdfExtractionPageData,
			
 
				+    RawClassPdfExtractionPageData,
			
 
				     PageMetadata,
			
 
				 )
			
 
				 from .above_table_text import parse_above_table_text
			
@@ -50,17 +59,7 @@ def get_modules_from_weekday(
 
				     """
			
 
				     Extracts the modules (raw text and start/end) of a weekday on a single pdf page
			
 
				     """
			
 
				-    try:
			
 
				-        highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
			
 
				-    except KeyError:
			
 
				-        logging.warning("Highest allowed timeslot was not found. Trying lower one's.")
			
 
				-        for time_slot in allowed_time_slots[:-1]:
			
 
				-            try:
			
 
				-                highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
			
 
				-            except KeyError:
			
 
				-                continue
			
 
				-            finally:
			
 
				-                break
			
 
				+    highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
			
 
				     modules = []
			
 
				     while len(unmerged_time_entries.cells) > 0:
			
 
				         area = unmerged_time_entries.cells.pop(0)
			
@@ -123,176 +122,286 @@ def get_modules_from_weekday(
 
				     return modules
			
 
				 
			
 
				 
			
 
				-def extract_data_from_class_pdf(
			
 
				-    input_filename: str, lecturers_file=None
			
 
				-) -> list[ClassPdfExtractionPageData]:
			
 
				+def get_usable_table_index(found_tables: list) -> int:
			
 
				     """
			
 
				-    Extracts all data from class timetable pdf's
			
 
				+    Identifies the index of the timetable on the page based on dimensions.
			
 
				+
			
 
				+    Raises:
			
 
				+        RuntimeError: If no or multiple tables matching the minimum dimensions are found.
			
 
				+    """
			
 
				+    if not found_tables:
			
 
				+        raise RuntimeError("No matching tables found.")
			
 
				+
			
 
				+    valid_indices = []
			
 
				+    for index, table in enumerate(found_tables):
			
 
				+        x0, top, x1, bottom = table.bbox
			
 
				+        width = x1 - x0
			
 
				+        height = bottom - top
			
 
				+        logging.debug(
			
 
				+            "table num %d: width: %d, height: %d",
			
 
				+            index + 1,
			
 
				+            width,
			
 
				+            height,
			
 
				+        )
			
 
				+        if (
			
 
				+            width >= CLASS_TIMETABLE_PDF_MIN_DIMENSIONS
			
 
				+            and height >= CLASS_TIMETABLE_PDF_MIN_DIMENSIONS
			
 
				+        ):
			
 
				+            valid_indices.append(index)
			
 
				+
			
 
				+    if len(valid_indices) > 1:
			
 
				+        raise RuntimeError(
			
 
				+            f"Found {len(valid_indices)} valid tables, expected at most 1. "
			
 
				+            "Ambiguous table selection."
			
 
				+        )
			
 
				+
			
 
				+    if len(valid_indices) == 1:
			
 
				+        return valid_indices[0]
			
 
				+
			
 
				+    return 0
			
 
				+
			
 
				+
			
 
				+def process_page(
			
 
				+    input_filename: Path, page_index: int
			
 
				+) -> RawClassPdfExtractionPageData:
			
 
				+    """
			
 
				+    Process a single page of the PDF to extract modules and header text.
			
 
				+    Designed to be run in a separate process.
			
 
				     """
			
 
				-    extraction_data: list[ClassPdfExtractionPageData] = []
			
 
				-    previous_page_metadata: list[PageMetadata] = []
			
 
				-    unmerged_time_entries_by_weekday: dict[Weekday, UnmergedTimeEntries] = {}
			
 
				     with pdfplumber.open(input_filename) as pdf:
			
 
				-        for page_index, page in enumerate(pdf.pages):
			
 
				-            weekday_areas: dict[Weekday, Area] = {}
			
 
				-            timeslot_y_levels: dict[TimeSlot, YLevel] = {}
			
 
				-            for day in Weekday:
			
 
				-                weekday_areas[day] = Area(x1=0, y1=0, x2=0, y2=0)
			
 
				-
			
 
				-            found_tables = page.find_tables(CLASS_PDF_TABLE_SETTINGS)
			
 
				-            logging.debug(
			
 
				-                "amount of tables found on page %d: %d",
			
 
				-                page_index + 1,
			
 
				-                len(found_tables),
			
 
				-            )
			
 
				-            usable_table_index: int = 0
			
 
				-            if len(found_tables) > 1:
			
 
				-                num_of_tables_with_at_least_min_dimensions: int = 0
			
 
				-                for table_index, table in enumerate(found_tables):
			
 
				-                    x0, top, x1, bottom = table.bbox
			
 
				-                    width = x1 - x0
			
 
				-                    height = bottom - top
			
 
				-                    logging.debug(
			
 
				-                        "table num %d: width: %d, height: %d",
			
 
				-                        table_index + 1,
			
 
				-                        width,
			
 
				-                        height,
			
 
				-                    )
			
 
				-                    if width >= CLASS_PDF_MIN_DIMENSIONS and height >= CLASS_PDF_MIN_DIMENSIONS:
			
 
				-                        num_of_tables_with_at_least_min_dimensions += 1
			
 
				-                        usable_table_index = table_index
			
 
				-            table = found_tables[usable_table_index]
			
 
				-            table_y1 = table.bbox[1]
			
 
				-            text_above_table = get_above_table_text(page, table_y1)
			
 
				-
			
 
				-            empty_start_found = False
			
 
				-
			
 
				-            # get weekday and timeslot areas
			
 
				-            expected_timeslot_index = 0
			
 
				-            for row_index, row in enumerate(table.rows):
			
 
				-                if row_index == 0:
			
 
				-                    for column_index, cell in enumerate(row.cells):
			
 
				-                        logging.debug("row: %d, col: %d", row_index, column_index)
			
 
				-                        logging.debug(cell)
			
 
				-                        if cell is None:
			
 
				-                            logging.debug("None Table Cell Found")
			
 
				-                        else:
			
 
				-                            cell_text = page.crop(
			
 
				-                                (cell[0], cell[1], cell[2], cell[3])
			
 
				-                            ).extract_text()
			
 
				-                            if not empty_start_found and len(cell_text) == 0:
			
 
				-                                logging.debug("empty start found")
			
 
				-                                empty_start_found = True
			
 
				-
			
 
				-                            weekday_enum = get_weekday_from_text(cell_text)
			
 
				-                            if weekday_enum:
			
 
				-                                logging.debug("Weekday %s found", cell_text)
			
 
				-                                weekday_areas[weekday_enum] = Area(
			
 
				-                                    x1=cell[0], y1=cell[3], x2=cell[2], y2=0
			
 
				-                                )
			
 
				-                else:
			
 
				-                    logging.debug("row: %d, col: %d", row_index, 0)
			
 
				-                    cell = row.cells[0]
			
 
				+        page = pdf.pages[page_index]
			
 
				+        weekday_areas: dict[Weekday, Area] = {}
			
 
				+        timeslot_y_levels: dict[TimeSlot, YLevel] = {}
			
 
				+        unmerged_time_entries_by_weekday: dict[Weekday, UnmergedTimeEntries] = {}
			
 
				+
			
 
				+        for day in Weekday:
			
 
				+            weekday_areas[day] = Area(x1=0, y1=0, x2=0, y2=0)
			
 
				+
			
 
				+        table: Table = select_main_table(page, page_index)
			
 
				+        text_above_table = get_above_table_text(page, table_y1=table.bbox[1])
			
 
				+
			
 
				+        empty_start_found = False
			
 
				+
			
 
				+        # get weekday and timeslot areas
			
 
				+        expected_timeslot_index = 0
			
 
				+        for row_index, row in enumerate(table.rows):
			
 
				+            if row_index == 0:
			
 
				+                for column_index, cell in enumerate(row.cells):
			
 
				+                    logging.debug("row: %d, col: %d", row_index, column_index)
			
 
				+                    logging.debug(cell)
			
 
				                     if cell is None:
			
 
				-                        logging.warning("Unexpected None Table Cell Found")
			
 
				+                        logging.debug("None Table Cell Found")
			
 
				                     else:
			
 
				                         cell_text = page.crop(
			
 
				                             (cell[0], cell[1], cell[2], cell[3])
			
 
				                         ).extract_text()
			
 
				-                        target_timeslot = allowed_time_slots[expected_timeslot_index]
			
 
				-                        if not (
			
 
				-                            target_timeslot.start_time in cell_text
			
 
				-                            and target_timeslot.end_time in cell_text
			
 
				-                        ):
			
 
				-                            logging.warning(
			
 
				-                                "Unexpected Timeslot found: '%s'", cell_text
			
 
				-                            )
			
 
				-                        else:
			
 
				-                            # assumes this is the last timeslot ever
			
 
				-                            if target_timeslot == TimeSlot(
			
 
				-                                start_time="20:30", end_time="21:15"
			
 
				-                            ):
			
 
				-                                for weekday in Weekday:
			
 
				-                                    new_area = Area(
			
 
				-                                        x1=weekday_areas[weekday].x1,
			
 
				-                                        y1=weekday_areas[weekday].y1,
			
 
				-                                        x2=weekday_areas[weekday].x2,
			
 
				-                                        y2=cell[3],
			
 
				-                                    )
			
 
				-                                    weekday_areas[weekday] = new_area
			
 
				-                            timeslot_y_levels[target_timeslot] = YLevel(
			
 
				-                                y1=cell[1], y2=cell[3]
			
 
				-                            )
			
 
				-                            expected_timeslot_index += 1
			
 
				+                        if not empty_start_found and len(cell_text) == 0:
			
 
				+                            logging.debug("empty start found")
			
 
				+                            empty_start_found = True
			
 
				 
			
 
				-            for weekday in Weekday:
			
 
				-                unmerged_time_entries_by_weekday[weekday] = UnmergedTimeEntries(
			
 
				-                    cells=[], horizontal_lines=[]
			
 
				-                )
			
 
				-                target_area = weekday_areas[weekday]
			
 
				-                logging.debug("target_area: %s", target_area)
			
 
				-
			
 
				-                for row_index, row in enumerate(table.rows):
			
 
				-                    for column_index, cell in enumerate(row.cells):
			
 
				-                        if cell is None:
			
 
				-                            logging.debug("None table cell found")
			
 
				-                            continue
			
 
				-                        logging.debug("row: %d, col: %d", row_index, column_index)
			
 
				-                        logging.debug("cell: %s", cell)
			
 
				-                        if (
			
 
				-                            target_area.x1 <= cell[0]
			
 
				-                            and target_area.y1 <= cell[1]
			
 
				-                            and target_area.x2 >= cell[2]
			
 
				-                            and target_area.y2 >= cell[3]
			
 
				+                        weekday_enum = get_weekday_from_text(cell_text)
			
 
				+                        if weekday_enum:
			
 
				+                            logging.debug("Weekday %s found", cell_text)
			
 
				+                            weekday_areas[weekday_enum] = Area(
			
 
				+                                x1=cell[0], y1=cell[3], x2=cell[2], y2=0
			
 
				+                            )
			
 
				+            else:
			
 
				+                logging.debug("row: %d, col: %d", row_index, 0)
			
 
				+                cell = row.cells[0]
			
 
				+                if cell is None:
			
 
				+                    logging.warning("Unexpected None Table Cell Found")
			
 
				+                else:
			
 
				+                    cell_text = page.crop(
			
 
				+                        (cell[0], cell[1], cell[2], cell[3])
			
 
				+                    ).extract_text()
			
 
				+                    target_timeslot = allowed_time_slots[expected_timeslot_index]
			
 
				+                    if not (
			
 
				+                        target_timeslot.start_time in cell_text
			
 
				+                        and target_timeslot.end_time in cell_text
			
 
				+                    ):
			
 
				+                        logging.warning("Unexpected Timeslot found: '%s'", cell_text)
			
 
				+                    else:
			
 
				+                        # assumes this is the last timeslot ever
			
 
				+                        if target_timeslot == TimeSlot(
			
 
				+                            start_time="20:30", end_time="21:15"
			
 
				                         ):
			
 
				-                            cell_dimensions = cell[0], cell[1], cell[2], cell[3]
			
 
				-                            unmerged_time_entries_by_weekday[weekday].cells.append(
			
 
				-                                Area(
			
 
				-                                    x1=cell[0], y1=cell[1], x2=cell[2], y2=cell[3]
			
 
				+                            for weekday in Weekday:
			
 
				+                                new_area = Area(
			
 
				+                                    x1=weekday_areas[weekday].x1,
			
 
				+                                    y1=weekday_areas[weekday].y1,
			
 
				+                                    x2=weekday_areas[weekday].x2,
			
 
				+                                    y2=cell[3],
			
 
				                                 )
			
 
				-                            )
			
 
				-                            logging.debug("%s cell found", weekday)
			
 
				-
			
 
				-                for line_found in page.lines:
			
 
				-                    line_x1 = line_found["x0"]
			
 
				-                    line_x2 = line_found["x1"]
			
 
				-                    line_y1 = line_found["y0"]
			
 
				-                    line_y2 = line_found["y1"]
			
 
				-                    line_bottom = line_found["bottom"]
			
 
				-
			
 
				-                    # ignore non horizontal lines
			
 
				-                    if line_y1 != line_y2:
			
 
				-                        continue
			
 
				-
			
 
				-                    if target_area.x1 <= line_x1 and target_area.x2 >= line_x2:
			
 
				-                        logging.debug("%s timeslot seperator line found", weekday)
			
 
				-                        unmerged_time_entries_by_weekday[
			
 
				-                            weekday
			
 
				-                        ].horizontal_lines.append(
			
 
				-                            HorizontalLine(x1=line_x1, x2=line_x2, y=line_bottom)
			
 
				+                                weekday_areas[weekday] = new_area
			
 
				+                        timeslot_y_levels[target_timeslot] = YLevel(
			
 
				+                            y1=cell[1], y2=cell[3]
			
 
				                         )
			
 
				+                        expected_timeslot_index += 1
			
 
				 
			
 
				-            all_modules: list[RawExtractedModule] = []
			
 
				-            for weekday in Weekday:
			
 
				-                all_modules.extend(
			
 
				-                    get_modules_from_weekday(
			
 
				-                        weekday,
			
 
				-                        unmerged_time_entries_by_weekday[weekday],
			
 
				-                        page,
			
 
				-                        timeslot_y_levels,
			
 
				-                        page_index + 1,
			
 
				-                    )
			
 
				+        collected_unmerged_time_entries_by_weekday(
			
 
				+            unmerged_time_entries_by_weekday, weekday_areas, table, page
			
 
				+        )
			
 
				+
			
 
				+        all_modules: list[RawExtractedModule] = []
			
 
				+        for weekday in Weekday:
			
 
				+            all_modules.extend(
			
 
				+                get_modules_from_weekday(
			
 
				+                    weekday,
			
 
				+                    unmerged_time_entries_by_weekday[weekday],
			
 
				+                    page,
			
 
				+                    timeslot_y_levels,
			
 
				+                    page_index + 1,
			
 
				                 )
			
 
				-            page_metadata = parse_above_table_text(
			
 
				-                text_above_table, previous_page_metadata
			
 
				             )
			
 
				-            previous_page_metadata.append(page_metadata)
			
 
				-            extraction_data.append(
			
 
				-                ClassPdfExtractionPageData(
			
 
				-                    raw_extracted_modules=all_modules, page_metadata=page_metadata
			
 
				-                )
			
 
				+        return RawClassPdfExtractionPageData(
			
 
				+            raw_extracted_modules=all_modules, above_table_text=text_above_table
			
 
				+        )
			
 
				+
			
 
				+def select_main_table(page: Page, page_index: int) -> Table:
			
 
				+    """
			
 
				+    Selects the main table on the PDF Page. This should be the timetable.
			
 
				+    """
			
 
				+    found_tables = page.find_tables(CLASS_TIMETABLE_PDF_TABLE_SETTINGS)
			
 
				+    logging.debug(
			
 
				+        "amount of tables found on page %d: %d",
			
 
				+        page_index + 1,
			
 
				+        len(found_tables),
			
 
				+    )
			
 
				+    table = found_tables[get_usable_table_index(found_tables)]
			
 
				+    return table
			
 
				+
			
 
				+
			
 
				+def collected_unmerged_time_entries_by_weekday(
			
 
				+    unmerged_time_entries_by_weekday: dict[Weekday, UnmergedTimeEntries],
			
 
				+    weekday_areas: dict[Weekday, Area],
			
 
				+    table: Table,
			
 
				+    page: Page,
			
 
				+) -> None:
			
 
				+    """
			
 
				+    Populates the passed unmerged_time_entries_by_weekday dict with the
			
 
				+    `UnmergedTimeEntries` by `Weekday`.
			
 
				+    """
			
 
				+    for weekday in Weekday:
			
 
				+        unmerged_time_entries_by_weekday[weekday] = UnmergedTimeEntries(
			
 
				+            cells=[], horizontal_lines=[]
			
 
				+        )
			
 
				+        target_area: Area = weekday_areas[weekday]
			
 
				+        logging.debug("target_area: %s", target_area)
			
 
				+
			
 
				+        for row_index, row in enumerate(table.rows):
			
 
				+            for column_index, cell in enumerate(row.cells):
			
 
				+                if cell is None:
			
 
				+                    logging.debug("None table cell found")
			
 
				+                    continue
			
 
				+                logging.debug("row: %d, col: %d", row_index, column_index)
			
 
				+                logging.debug("cell: %s", cell)
			
 
				+                if (
			
 
				+                    target_area.x1 <= cell[0]
			
 
				+                    and target_area.y1 <= cell[1]
			
 
				+                    and target_area.x2 >= cell[2]
			
 
				+                    and target_area.y2 >= cell[3]
			
 
				+                ):
			
 
				+                    unmerged_time_entries_by_weekday[weekday].cells.append(
			
 
				+                        Area(x1=cell[0], y1=cell[1], x2=cell[2], y2=cell[3])
			
 
				+                    )
			
 
				+                    logging.debug("%s cell found", weekday)
			
 
				+
			
 
				+        collect_horizontal_lines(
			
 
				+            unmerged_time_entries_by_weekday, page, target_area, weekday
			
 
				+        )
			
 
				+
			
 
				+
			
 
				+def collect_horizontal_lines(
			
 
				+    unmerged_time_entries_by_weekday: dict[Weekday, UnmergedTimeEntries],
			
 
				+    page: Page,
			
 
				+    target_area: Area,
			
 
				+    weekday: Weekday,
			
 
				+) -> None:
			
 
				+    """
			
 
				+    Populates the passed unmerged_time_entries_by_weekday dict with the
			
 
				+    `horizontal_lines` of the `UnmergedTimeEntries` by the passed weekday.
			
 
				+    These horizontal lines are timeslot separator lines.
			
 
				+    """
			
 
				+    for line_found in page.lines:
			
 
				+        line_x1 = line_found["x0"]
			
 
				+        line_x2 = line_found["x1"]
			
 
				+        line_y1 = line_found["y0"]
			
 
				+        line_y2 = line_found["y1"]
			
 
				+        line_bottom = line_found["bottom"]
			
 
				+
			
 
				+        # ignore non horizontal lines
			
 
				+        if line_y1 != line_y2:
			
 
				+            continue
			
 
				+
			
 
				+        if target_area.x1 <= line_x1 and target_area.x2 >= line_x2:
			
 
				+            logging.debug("%s timeslot seperator line found", weekday)
			
 
				+            unmerged_time_entries_by_weekday[weekday].horizontal_lines.append(
			
 
				+                HorizontalLine(x1=line_x1, x2=line_x2, y=line_bottom)
			
 
				             )
			
 
				-        return extraction_data
			
 
				+
			
 
				+
			
 
				+def extract_data_from_class_pdf(
			
 
				+    input_filename: Path, num_of_jobs: int = 1
			
 
				+) -> list[ClassPdfExtractionPageData]:
			
 
				+    """
			
 
				+    Extracts all data from the specified Class Timetable PDF filename.
			
 
				+    Can run via multiple jobs.
			
 
				+    """
			
 
				+    logging.info("Starting extraction with %d jobs", num_of_jobs)
			
 
				+
			
 
				+    num_pages: int = get_number_of_pdf_pages(input_filename)
			
 
				+    logging.info("Found %d pages to process", num_pages)
			
 
				+
			
 
				+    processed_pages: list[RawClassPdfExtractionPageData] = process_pages_in_parallel(
			
 
				+        num_of_jobs, input_filename, num_pages
			
 
				+    )
			
 
				+
			
 
				+    extraction_data: list[ClassPdfExtractionPageData] = process_metadata_sequentially(
			
 
				+        processed_pages
			
 
				+    )
			
 
				+
			
 
				+    return extraction_data
			
 
				+
			
 
				+
			
 
				+def process_metadata_sequentially(
			
 
				+    processed_pages: list[RawClassPdfExtractionPageData],
			
 
				+) -> list[ClassPdfExtractionPageData]:
			
 
				+    """
			
 
				+    Process the above table text into `PageMetadata`'s of the processed pages.
			
 
				+    """
			
 
				+    extraction_data: list[ClassPdfExtractionPageData] = []
			
 
				+    previous_page_metadata: list[PageMetadata] = []
			
 
				+
			
 
				+    for processed_page in processed_pages:
			
 
				+        page_metadata = parse_above_table_text(
			
 
				+            processed_page.above_table_text, previous_page_metadata
			
 
				+        )
			
 
				+        previous_page_metadata.append(page_metadata)
			
 
				+        extraction_data.append(
			
 
				+            ClassPdfExtractionPageData(
			
 
				+                raw_extracted_modules=processed_page.raw_extracted_modules,
			
 
				+                page_metadata=page_metadata,
			
 
				+            )
			
 
				+        )
			
 
				+    return extraction_data
			
 
				+
			
 
				+
			
 
				+def process_pages_in_parallel(
			
 
				+    num_of_jobs: int, input_filename: Path, num_of_pages: int
			
 
				+) -> list[RawClassPdfExtractionPageData]:
			
 
				+    """Extracts the pdf pages in parallel based on the number of jobs"""
			
 
				+    with Pool(processes=num_of_jobs) as pool:
			
 
				+        results = pool.starmap(
			
 
				+            process_page, [(input_filename, i) for i in range(num_of_pages)]
			
 
				+        )
			
 
				+    return results
			
 
				+
			
 
				+
			
 
				+def get_number_of_pdf_pages(input_filename: Path) -> int:
			
 
				+    """Get the number of pdf pages using the pdfplumber library"""
			
 
				+    with pdfplumber.open(input_filename) as pdf:
			
 
				+        num_pages = len(pdf.pages)
			
 
				+    return num_pages
			
 
				 
			
 
				 
			
 
				 def get_above_table_text(page: Page, table_y1: float) -> str: