Forráskód Böngészése

add -j flag + refactoring the table extraction + rename parse_class_pdf.py to generate_classes_json.py

Noah Vogt 2 hónapja
szülő
commit
d627c392aa
5 módosított fájl, 304 hozzáadás és 188 törlés
  1. 5 13
      README.md
  2. 5 4
      config/constants.py
  3. 15 5
      generate_classes_json.py
  4. 4 0
      parse/models.py
  5. 275 166
      parse/table_extraction.py

+ 5 - 13
README.md

@@ -34,30 +34,22 @@ This project uses specific domain terms that map to the data sources and the gen
 
 This section is split by the different scripts provided by this repository.
 
-### parse_class_pdf.py
+### generate_classes_json.py
 
 Execute the following to parse a class timetable PDF into the `classes.json` file needed by the frontend.
 
 ```sh
-./parse_class_pdf.py [-h] [-l LECTURERS] [-i INPUT] [-o OUTPUT] [--save-intermediate SAVE_INTERMEDIATE] [--load-intermediate LOAD_INTERMEDIATE]
+./generate_classes_json.py -i klassen.pdf -o classes.json
 ```
 
-#### Arguments
-
-- `-i`, `--input`: Path to the input PDF file. Defaults to `klassen.pdf`.
-- `-o`, `--output`: Path to the output JSON file. Defaults to `classes.json`.
-- `-l`, `--lecturers`: Path to the `lecturers.json` file. If provided, it is used to validate lecturer shorthands during parsing.
-- `--save-intermediate`: Path to save the intermediate extraction data (JSON format) and exit. Useful for skipping the slow extraction stage in subsequent runs.
-- `--load-intermediate`: Path to load the intermediate extraction data from (JSON format) and skip extraction.
-
-The default values for input and output files are defined in `config/constants.py`.
+For more information, list the CLI arguments with `./generate_classes_json.py -h`.
 
 #### Faster Development Cycle
 
 Since the PDF extraction takes a significant amount of time, you can split the process into two stages:
 
-1.  **Stage 1 (Extraction):** Run once and save the result: `./parse_class_pdf.py --save-intermediate data.pkl`
-2.  **Stage 2 (Parsing):** Load the saved data and iterate on the parsing logic: `./parse_class_pdf.py --load-intermediate data.pkl --output classes.json`
+1.  **Stage 1 (Extraction):** Run once and save the result (JSON format): `./generate_classes_json.py --save-intermediate data.json`
+2.  **Stage 2 (Parsing):** Load the saved data and iterate on the parsing logic: `./generate_classes_json.py --load-intermediate data.json --output classes.json`
 
 ### extract_lecturer_shorthands_pdf.py
 

+ 5 - 4
config/constants.py

@@ -1,13 +1,14 @@
-CLASS_PDF_INPUT_FILE = "klassen.pdf"
-CLASSES_JSON_OUTPUT_FILE = "classes.json"
-CLASS_PDF_TABLE_SETTINGS = {
+CLASS_TIMETABLE_PDF_INPUT_FILE = "klassen.pdf"
+CLASS_TIMETABLE_PDF_TABLE_SETTINGS = {
     "vertical_strategy": "lines",
     "horizontal_strategy": "lines",
     "snap_tolerance": 3,
     "join_tolerance": 3,
     "edge_min_length": 3,
 }
-CLASS_PDF_MIN_DIMENSIONS = 150
+CLASS_TIMETABLE_PDF_MIN_DIMENSIONS = 150
+
+CLASSES_JSON_OUTPUT_FILE = "classes.json"
 
 TOLERANCE = 3
 REQUESTS_TIMEOUT = 4

+ 15 - 5
parse_class_pdf.py → generate_classes_json.py

@@ -15,7 +15,7 @@ from parse import (
     ClassJsonModule,
 )
 
-from config import CLASS_PDF_INPUT_FILE, CLASSES_JSON_OUTPUT_FILE
+from config import CLASS_TIMETABLE_PDF_INPUT_FILE, CLASSES_JSON_OUTPUT_FILE
 
 
 def get_valid_lecturers(file_path: str) -> list[str]:
@@ -42,13 +42,16 @@ def get_valid_lecturers(file_path: str) -> list[str]:
 def main() -> None:
     parser = ArgumentParser(description="Parse Class Timetable PDF to JSON.")
     parser.add_argument(
-        "-l", "--lecturers", help="Path to the lecturers.json file", default=None
+        "-l",
+        "--lecturers",
+        help="Path to the lecturers.json file (Optional)",
+        default=None,
     )
     parser.add_argument(
         "-i",
         "--input",
         help="Path to the input Class Timetable PDF file",
-        default=CLASS_PDF_INPUT_FILE,
+        default=CLASS_TIMETABLE_PDF_INPUT_FILE,
     )
     parser.add_argument(
         "-o",
@@ -66,11 +69,18 @@ def main() -> None:
         help="Path to load the intermediate extraction data from (JSON format) and skip extraction",
         default=None,
     )
+    parser.add_argument(
+        "-j",
+        "--jobs",
+        help="Number of parallel jobs to use for extraction (default: 1)",
+        type=int,
+        default=1,
+    )
 
     args = parser.parse_args()
     lecturers_file = args.lecturers
 
-    logging.basicConfig(level=logging.INFO)
+    logging.basicConfig(level=logging.DEBUG)
 
     valid_lecturer_shorthands: list[str] | None = None
     if lecturers_file:
@@ -85,7 +95,7 @@ def main() -> None:
                 list[ClassPdfExtractionPageData]
             ).validate_json(f.read())
     else:
-        extraction_data = extract_data_from_class_pdf(args.input)
+        extraction_data = extract_data_from_class_pdf(args.input, num_of_jobs=args.jobs)
         if args.save_intermediate:
             logging.info("Saving intermediate data to %s", args.save_intermediate)
             with open(args.save_intermediate, "w", encoding="utf-8") as f:

+ 4 - 0
parse/models.py

@@ -194,6 +194,10 @@ class ClassPdfExtractionPageData(BaseModel):
     raw_extracted_modules: list[RawExtractedModule]
     page_metadata: PageMetadata
 
+class RawClassPdfExtractionPageData(BaseModel):
+    raw_extracted_modules: list[RawExtractedModule]
+    above_table_text: str
+
 
 class StartsWithMatch(BaseModel):
     shorthand_found: str

+ 275 - 166
parse/table_extraction.py

@@ -1,8 +1,16 @@
 import logging
+from multiprocessing import Pool
+from pathlib import Path
+
 from pdfplumber.page import Page
+from pdfplumber.table import Table
 import pdfplumber
 
-from config import CLASS_PDF_TABLE_SETTINGS, ALLOWED_TIMESLOTS, CLASS_PDF_MIN_DIMENSIONS
+from config import (
+    CLASS_TIMETABLE_PDF_TABLE_SETTINGS,
+    ALLOWED_TIMESLOTS,
+    CLASS_TIMETABLE_PDF_MIN_DIMENSIONS,
+)
 from .models import (
     Weekday,
     TimeSlot,
@@ -12,6 +20,7 @@ from .models import (
     Area,
     HorizontalLine,
     ClassPdfExtractionPageData,
+    RawClassPdfExtractionPageData,
     PageMetadata,
 )
 from .above_table_text import parse_above_table_text
@@ -50,17 +59,7 @@ def get_modules_from_weekday(
     """
     Extracts the modules (raw text and start/end) of a weekday on a single pdf page
     """
-    try:
-        highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
-    except KeyError:
-        logging.warning("Highest allowed timeslot was not found. Trying lower one's.")
-        for time_slot in allowed_time_slots[:-1]:
-            try:
-                highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
-            except KeyError:
-                continue
-            finally:
-                break
+    highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
     modules = []
     while len(unmerged_time_entries.cells) > 0:
         area = unmerged_time_entries.cells.pop(0)
@@ -123,176 +122,286 @@ def get_modules_from_weekday(
     return modules
 
 
-def extract_data_from_class_pdf(
-    input_filename: str, lecturers_file=None
-) -> list[ClassPdfExtractionPageData]:
+def get_usable_table_index(found_tables: list) -> int:
     """
-    Extracts all data from class timetable pdf's
+    Identifies the index of the timetable on the page based on dimensions.
+
+    Raises:
+        RuntimeError: If no tables are found, or if more than one table meets the minimum dimensions.
+    """
+    if not found_tables:
+        raise RuntimeError("No matching tables found.")
+
+    valid_indices = []
+    for index, table in enumerate(found_tables):
+        x0, top, x1, bottom = table.bbox
+        width = x1 - x0
+        height = bottom - top
+        logging.debug(
+            "table num %d: width: %d, height: %d",
+            index + 1,
+            width,
+            height,
+        )
+        if (
+            width >= CLASS_TIMETABLE_PDF_MIN_DIMENSIONS
+            and height >= CLASS_TIMETABLE_PDF_MIN_DIMENSIONS
+        ):
+            valid_indices.append(index)
+
+    if len(valid_indices) > 1:
+        raise RuntimeError(
+            f"Found {len(valid_indices)} valid tables, expected at most 1. "
+            "Ambiguous table selection."
+        )
+
+    if len(valid_indices) == 1:
+        return valid_indices[0]
+
+    return 0
+
+
+def process_page(
+    input_filename: Path, page_index: int
+) -> RawClassPdfExtractionPageData:
+    """
+    Process a single page of the PDF to extract modules and header text.
+    Designed to be run in a separate process.
     """
-    extraction_data: list[ClassPdfExtractionPageData] = []
-    previous_page_metadata: list[PageMetadata] = []
-    unmerged_time_entries_by_weekday: dict[Weekday, UnmergedTimeEntries] = {}
     with pdfplumber.open(input_filename) as pdf:
-        for page_index, page in enumerate(pdf.pages):
-            weekday_areas: dict[Weekday, Area] = {}
-            timeslot_y_levels: dict[TimeSlot, YLevel] = {}
-            for day in Weekday:
-                weekday_areas[day] = Area(x1=0, y1=0, x2=0, y2=0)
-
-            found_tables = page.find_tables(CLASS_PDF_TABLE_SETTINGS)
-            logging.debug(
-                "amount of tables found on page %d: %d",
-                page_index + 1,
-                len(found_tables),
-            )
-            usable_table_index: int = 0
-            if len(found_tables) > 1:
-                num_of_tables_with_at_least_min_dimensions: int = 0
-                for table_index, table in enumerate(found_tables):
-                    x0, top, x1, bottom = table.bbox
-                    width = x1 - x0
-                    height = bottom - top
-                    logging.debug(
-                        "table num %d: width: %d, height: %d",
-                        table_index + 1,
-                        width,
-                        height,
-                    )
-                    if width >= CLASS_PDF_MIN_DIMENSIONS and height >= CLASS_PDF_MIN_DIMENSIONS:
-                        num_of_tables_with_at_least_min_dimensions += 1
-                        usable_table_index = table_index
-            table = found_tables[usable_table_index]
-            table_y1 = table.bbox[1]
-            text_above_table = get_above_table_text(page, table_y1)
-
-            empty_start_found = False
-
-            # get weekday and timeslot areas
-            expected_timeslot_index = 0
-            for row_index, row in enumerate(table.rows):
-                if row_index == 0:
-                    for column_index, cell in enumerate(row.cells):
-                        logging.debug("row: %d, col: %d", row_index, column_index)
-                        logging.debug(cell)
-                        if cell is None:
-                            logging.debug("None Table Cell Found")
-                        else:
-                            cell_text = page.crop(
-                                (cell[0], cell[1], cell[2], cell[3])
-                            ).extract_text()
-                            if not empty_start_found and len(cell_text) == 0:
-                                logging.debug("empty start found")
-                                empty_start_found = True
-
-                            weekday_enum = get_weekday_from_text(cell_text)
-                            if weekday_enum:
-                                logging.debug("Weekday %s found", cell_text)
-                                weekday_areas[weekday_enum] = Area(
-                                    x1=cell[0], y1=cell[3], x2=cell[2], y2=0
-                                )
-                else:
-                    logging.debug("row: %d, col: %d", row_index, 0)
-                    cell = row.cells[0]
+        page = pdf.pages[page_index]
+        weekday_areas: dict[Weekday, Area] = {}
+        timeslot_y_levels: dict[TimeSlot, YLevel] = {}
+        unmerged_time_entries_by_weekday: dict[Weekday, UnmergedTimeEntries] = {}
+
+        for day in Weekday:
+            weekday_areas[day] = Area(x1=0, y1=0, x2=0, y2=0)
+
+        table: Table = select_main_table(page, page_index)
+        text_above_table = get_above_table_text(page, table_y1=table.bbox[1])
+
+        empty_start_found = False
+
+        # get weekday and timeslot areas
+        expected_timeslot_index = 0
+        for row_index, row in enumerate(table.rows):
+            if row_index == 0:
+                for column_index, cell in enumerate(row.cells):
+                    logging.debug("row: %d, col: %d", row_index, column_index)
+                    logging.debug(cell)
                     if cell is None:
-                        logging.warning("Unexpected None Table Cell Found")
+                        logging.debug("None Table Cell Found")
                     else:
                         cell_text = page.crop(
                             (cell[0], cell[1], cell[2], cell[3])
                         ).extract_text()
-                        target_timeslot = allowed_time_slots[expected_timeslot_index]
-                        if not (
-                            target_timeslot.start_time in cell_text
-                            and target_timeslot.end_time in cell_text
-                        ):
-                            logging.warning(
-                                "Unexpected Timeslot found: '%s'", cell_text
-                            )
-                        else:
-                            # assumes this is the last timeslot ever
-                            if target_timeslot == TimeSlot(
-                                start_time="20:30", end_time="21:15"
-                            ):
-                                for weekday in Weekday:
-                                    new_area = Area(
-                                        x1=weekday_areas[weekday].x1,
-                                        y1=weekday_areas[weekday].y1,
-                                        x2=weekday_areas[weekday].x2,
-                                        y2=cell[3],
-                                    )
-                                    weekday_areas[weekday] = new_area
-                            timeslot_y_levels[target_timeslot] = YLevel(
-                                y1=cell[1], y2=cell[3]
-                            )
-                            expected_timeslot_index += 1
+                        if not empty_start_found and len(cell_text) == 0:
+                            logging.debug("empty start found")
+                            empty_start_found = True
 
-            for weekday in Weekday:
-                unmerged_time_entries_by_weekday[weekday] = UnmergedTimeEntries(
-                    cells=[], horizontal_lines=[]
-                )
-                target_area = weekday_areas[weekday]
-                logging.debug("target_area: %s", target_area)
-
-                for row_index, row in enumerate(table.rows):
-                    for column_index, cell in enumerate(row.cells):
-                        if cell is None:
-                            logging.debug("None table cell found")
-                            continue
-                        logging.debug("row: %d, col: %d", row_index, column_index)
-                        logging.debug("cell: %s", cell)
-                        if (
-                            target_area.x1 <= cell[0]
-                            and target_area.y1 <= cell[1]
-                            and target_area.x2 >= cell[2]
-                            and target_area.y2 >= cell[3]
+                        weekday_enum = get_weekday_from_text(cell_text)
+                        if weekday_enum:
+                            logging.debug("Weekday %s found", cell_text)
+                            weekday_areas[weekday_enum] = Area(
+                                x1=cell[0], y1=cell[3], x2=cell[2], y2=0
+                            )
+            else:
+                logging.debug("row: %d, col: %d", row_index, 0)
+                cell = row.cells[0]
+                if cell is None:
+                    logging.warning("Unexpected None Table Cell Found")
+                else:
+                    cell_text = page.crop(
+                        (cell[0], cell[1], cell[2], cell[3])
+                    ).extract_text()
+                    target_timeslot = allowed_time_slots[expected_timeslot_index]
+                    if not (
+                        target_timeslot.start_time in cell_text
+                        and target_timeslot.end_time in cell_text
+                    ):
+                        logging.warning("Unexpected Timeslot found: '%s'", cell_text)
+                    else:
+                        # assumes this is the last timeslot ever
+                        if target_timeslot == TimeSlot(
+                            start_time="20:30", end_time="21:15"
                         ):
-                            cell_dimensions = cell[0], cell[1], cell[2], cell[3]
-                            unmerged_time_entries_by_weekday[weekday].cells.append(
-                                Area(
-                                    x1=cell[0], y1=cell[1], x2=cell[2], y2=cell[3]
+                            for weekday in Weekday:
+                                new_area = Area(
+                                    x1=weekday_areas[weekday].x1,
+                                    y1=weekday_areas[weekday].y1,
+                                    x2=weekday_areas[weekday].x2,
+                                    y2=cell[3],
                                 )
-                            )
-                            logging.debug("%s cell found", weekday)
-
-                for line_found in page.lines:
-                    line_x1 = line_found["x0"]
-                    line_x2 = line_found["x1"]
-                    line_y1 = line_found["y0"]
-                    line_y2 = line_found["y1"]
-                    line_bottom = line_found["bottom"]
-
-                    # ignore non horizontal lines
-                    if line_y1 != line_y2:
-                        continue
-
-                    if target_area.x1 <= line_x1 and target_area.x2 >= line_x2:
-                        logging.debug("%s timeslot seperator line found", weekday)
-                        unmerged_time_entries_by_weekday[
-                            weekday
-                        ].horizontal_lines.append(
-                            HorizontalLine(x1=line_x1, x2=line_x2, y=line_bottom)
+                                weekday_areas[weekday] = new_area
+                        timeslot_y_levels[target_timeslot] = YLevel(
+                            y1=cell[1], y2=cell[3]
                         )
+                        expected_timeslot_index += 1
 
-            all_modules: list[RawExtractedModule] = []
-            for weekday in Weekday:
-                all_modules.extend(
-                    get_modules_from_weekday(
-                        weekday,
-                        unmerged_time_entries_by_weekday[weekday],
-                        page,
-                        timeslot_y_levels,
-                        page_index + 1,
-                    )
+        collected_unmerged_time_entries_by_weekday(
+            unmerged_time_entries_by_weekday, weekday_areas, table, page
+        )
+
+        all_modules: list[RawExtractedModule] = []
+        for weekday in Weekday:
+            all_modules.extend(
+                get_modules_from_weekday(
+                    weekday,
+                    unmerged_time_entries_by_weekday[weekday],
+                    page,
+                    timeslot_y_levels,
+                    page_index + 1,
                 )
-            page_metadata = parse_above_table_text(
-                text_above_table, previous_page_metadata
             )
-            previous_page_metadata.append(page_metadata)
-            extraction_data.append(
-                ClassPdfExtractionPageData(
-                    raw_extracted_modules=all_modules, page_metadata=page_metadata
-                )
+        return RawClassPdfExtractionPageData(
+            raw_extracted_modules=all_modules, above_table_text=text_above_table
+        )
+
+def select_main_table(page: Page, page_index: int) -> Table:
+    """
+    Selects the main table on the PDF Page. This should be the timetable.
+    """
+    found_tables = page.find_tables(CLASS_TIMETABLE_PDF_TABLE_SETTINGS)
+    logging.debug(
+        "amount of tables found on page %d: %d",
+        page_index + 1,
+        len(found_tables),
+    )
+    table = found_tables[get_usable_table_index(found_tables)]
+    return table
+
+
+def collected_unmerged_time_entries_by_weekday(
+    unmerged_time_entries_by_weekday: dict[Weekday, UnmergedTimeEntries],
+    weekday_areas: dict[Weekday, Area],
+    table: Table,
+    page: Page,
+) -> None:
+    """
+    Populates the passed unmerged_time_entries_by_weekday dict with the
+    `UnmergedTimeEntries` by `Weekday`.
+    """
+    for weekday in Weekday:
+        unmerged_time_entries_by_weekday[weekday] = UnmergedTimeEntries(
+            cells=[], horizontal_lines=[]
+        )
+        target_area: Area = weekday_areas[weekday]
+        logging.debug("target_area: %s", target_area)
+
+        for row_index, row in enumerate(table.rows):
+            for column_index, cell in enumerate(row.cells):
+                if cell is None:
+                    logging.debug("None table cell found")
+                    continue
+                logging.debug("row: %d, col: %d", row_index, column_index)
+                logging.debug("cell: %s", cell)
+                if (
+                    target_area.x1 <= cell[0]
+                    and target_area.y1 <= cell[1]
+                    and target_area.x2 >= cell[2]
+                    and target_area.y2 >= cell[3]
+                ):
+                    unmerged_time_entries_by_weekday[weekday].cells.append(
+                        Area(x1=cell[0], y1=cell[1], x2=cell[2], y2=cell[3])
+                    )
+                    logging.debug("%s cell found", weekday)
+
+        collect_horizontal_lines(
+            unmerged_time_entries_by_weekday, page, target_area, weekday
+        )
+
+
+def collect_horizontal_lines(
+    unmerged_time_entries_by_weekday: dict[Weekday, UnmergedTimeEntries],
+    page: Page,
+    target_area: Area,
+    weekday: Weekday,
+) -> None:
+    """
+    Populates the passed unmerged_time_entries_by_weekday dict with the
+    `horizontal_lines` of the `UnmergedTimeEntries` by the passed weekday.
+    These horizontal lines are the timeslot separator lines.
+    """
+    for line_found in page.lines:
+        line_x1 = line_found["x0"]
+        line_x2 = line_found["x1"]
+        line_y1 = line_found["y0"]
+        line_y2 = line_found["y1"]
+        line_bottom = line_found["bottom"]
+
+        # ignore non horizontal lines
+        if line_y1 != line_y2:
+            continue
+
+        if target_area.x1 <= line_x1 and target_area.x2 >= line_x2:
+            logging.debug("%s timeslot seperator line found", weekday)
+            unmerged_time_entries_by_weekday[weekday].horizontal_lines.append(
+                HorizontalLine(x1=line_x1, x2=line_x2, y=line_bottom)
             )
-        return extraction_data
+
+
+def extract_data_from_class_pdf(
+    input_filename: Path, num_of_jobs: int = 1
+) -> list[ClassPdfExtractionPageData]:
+    """
+    Extracts all data from the specified Class Timetable PDF filename.
+    Can run via multiple jobs.
+    """
+    logging.info("Starting extraction with %d jobs", num_of_jobs)
+
+    num_pages: int = get_number_of_pdf_pages(input_filename)
+    logging.info("Found %d pages to process", num_pages)
+
+    processed_pages: list[RawClassPdfExtractionPageData] = process_pages_in_parallel(
+        num_of_jobs, input_filename, num_pages
+    )
+
+    extraction_data: list[ClassPdfExtractionPageData] = process_metadata_sequentially(
+        processed_pages
+    )
+
+    return extraction_data
+
+
+def process_metadata_sequentially(
+    processed_pages: list[RawClassPdfExtractionPageData],
+) -> list[ClassPdfExtractionPageData]:
+    """
+    Parse the above-table text of each processed page into its `PageMetadata`.
+    """
+    extraction_data: list[ClassPdfExtractionPageData] = []
+    previous_page_metadata: list[PageMetadata] = []
+
+    for processed_page in processed_pages:
+        page_metadata = parse_above_table_text(
+            processed_page.above_table_text, previous_page_metadata
+        )
+        previous_page_metadata.append(page_metadata)
+        extraction_data.append(
+            ClassPdfExtractionPageData(
+                raw_extracted_modules=processed_page.raw_extracted_modules,
+                page_metadata=page_metadata,
+            )
+        )
+    return extraction_data
+
+
+def process_pages_in_parallel(
+    num_of_jobs: int, input_filename: Path, num_of_pages: int
+) -> list[RawClassPdfExtractionPageData]:
+    """Extracts the pdf pages in parallel based on the number of jobs"""
+    with Pool(processes=num_of_jobs) as pool:
+        results = pool.starmap(
+            process_page, [(input_filename, i) for i in range(num_of_pages)]
+        )
+    return results
+
+
+def get_number_of_pdf_pages(input_filename: Path) -> int:
+    """Get the number of pdf pages using the pdfplumber library"""
+    with pdfplumber.open(input_filename) as pdf:
+        num_pages = len(pdf.pages)
+    return num_pages
 
 
 def get_above_table_text(page: Page, table_y1: float) -> str: