noah
/
modulplaner-backend


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
							#!/usr/bin/env python3

import logging
from argparse import ArgumentParser

import pdfplumber
from pdfplumber.table import Table
from pydantic import TypeAdapter

from config import (
    LECTURER_SHORTHAND_PDF_TABLE_SETTINGS,
    LECTURER_SHORTHAND_PDF_PDF_INPUT_FILE,
    LECTURER_SHORTHAND_JSON_OUTPUT_FILE,
)
from parse import RawLecturer, Lecturer

# Module-level logger shared by every function in this script. The name is
# given explicitly instead of being derived from __name__.
logger = logging.getLogger("modulplaner-backend.extract_lecturer_shorthands_pdf")


def extract_rows_from_lecturer_shorthand_pdf(input_file) -> list[RawLecturer]:
    """
    Read the Lecturer Shorthands PDF and return one RawLecturer per table row.

    The detected table is only used to obtain the vertical extent of each
    row. Column boundaries come from the x positions of the "Nachname" and
    "Vorname" header texts on the first page, which are applied as fixed
    separators to every row on every page.

    Args:
        input_file: The PDF to read; passed unchanged to ``pdfplumber.open``.

    Returns:
        The extracted rows in page and row order. Nothing is filtered here
        apart from rows that are ``None`` or have no non-``None`` cells, so
        header rows and duplicates are still part of the result.

    Raises:
        RuntimeError: If "Nachname" or "Vorname" cannot be found on the first
            page, or if a page does not yield exactly one table.
    """
    lecturers: list[RawLecturer] = []

    with pdfplumber.open(input_file) as pdf:
        first_page = pdf.pages[0]

        # Find the X coordinates of "Nachname" and "Vorname" on the first page
        # to use as fixed separators for all rows. This assumes they do not
        # deviate their x values on subsequent pages.
        nachname_rects = first_page.search("Nachname")
        vorname_rects = first_page.search("Vorname")
        if not nachname_rects or not vorname_rects:
            raise RuntimeError("Could not find headers for separator calculation")

        # Subtract 2 pixels to ensure the start of the letter is caught
        # even if it drifts slightly left.
        sep_x_1 = nachname_rects[0]["x0"] - 2
        sep_x_2 = vorname_rects[0]["x0"] - 2
        logger.debug(
            "calculated separators: %d (Nachname), %d (Vorname)", sep_x_1, sep_x_2
        )

        # Vertical crop bounds that remove the page header and footer. They
        # are derived once from the lines of the first page and reused for all
        # pages, assuming header and footer do not go beyond these values on
        # subsequent pages.
        # NOTE(review): pdfplumber documents "y1" as measured from the bottom
        # of the page, while crop boxes are (x0, top, x1, bottom) measured
        # from the top. Confirm against the real PDF whether "top"/"bottom"
        # should be used here instead.
        lines_y1 = [line.get("y1") for line in first_page.lines]
        if lines_y1:
            vertical_bounds = (min(lines_y1), max(lines_y1))
        else:
            # Warn once instead of once per page; every page then falls back
            # to its full, uncropped area.
            vertical_bounds = None
            logger.warning("First page has no lines")

        for page_index, page in enumerate(pdf.pages):
            if vertical_bounds is None:
                crop_box = (0, 0, page.width, page.height)
            else:
                crop_top, crop_bottom = vertical_bounds
                crop_box = (0, crop_top, page.width, crop_bottom)

            cropped_page = page.crop(crop_box)

            found_tables: list[Table] = cropped_page.find_tables(
                LECTURER_SHORTHAND_PDF_TABLE_SETTINGS
            )

            if len(found_tables) != 1:
                raise RuntimeError(
                    "Did not find exactly 1 table in the Lecturer Shorthands PDF"
                    + f" on page {page_index + 1}"
                )

            table: Table = found_tables[0]

            for row_index, row in enumerate(table.rows):
                if row is None:
                    logger.debug("None table row found")
                    continue

                valid_cells = [cell for cell in row.cells if cell is not None]

                if not valid_cells:
                    continue

                # Cells are (x0, top, x1, bottom) boxes. The row's vertical
                # extent and left edge come from the first valid cell, its
                # right edge from the last one.
                row_left, row_top, _, row_bottom = valid_cells[0]
                row_right = valid_cells[-1][2]

                row_bbox = (row_left, row_top, row_right, row_bottom)

                logger.debug("row %d dimensions: %s", row_index, row_bbox)

                # column 1: From start of row -> Nachname separator
                col1_bbox = (row_left, row_top, sep_x_1, row_bottom)
                # column 2: From Nachname separator -> Vorname separator
                col2_bbox = (sep_x_1, row_top, sep_x_2, row_bottom)
                # column 3: From Vorname separator -> End of row
                col3_bbox = (sep_x_2, row_top, row_right, row_bottom)

                logger.debug("col 1 bbox: %s", col1_bbox)
                logger.debug("col 2 bbox: %s", col2_bbox)
                logger.debug("col 3 bbox: %s", col3_bbox)

                # The full row text is extracted for debug output only.
                row_text: str = cropped_page.crop(row_bbox).extract_text()
                logger.debug("row text: %s", row_text)
                col1_text = cropped_page.crop(col1_bbox).extract_text()
                logger.debug("col 1 text: %s", col1_text)
                col2_text = cropped_page.crop(col2_bbox).extract_text()
                logger.debug("col 2 text: %s", col2_text)
                col3_text = cropped_page.crop(col3_bbox).extract_text()
                logger.debug("col 3 text: %s", col3_text)

                # Column order in the PDF is shorthand, surname, firstname.
                lecturers.append(
                    RawLecturer(
                        shorthand=col1_text, firstname=col3_text, surname=col2_text
                    )
                )

    return lecturers


def is_table_header_row(raw_lecturer: RawLecturer) -> bool:
    """
    Tell whether a raw row is the table's column-title row.

    A row counts as the header when its three fields are exactly "Name",
    "Nachname" and "Vorname" (shorthand, surname, firstname).
    """
    if raw_lecturer.shorthand != "Name":
        return False
    if raw_lecturer.surname != "Nachname":
        return False
    return raw_lecturer.firstname == "Vorname"


def is_vak_example_row(raw_lecturer):
    """
    Tell whether a raw row is the "vak" example row.

    The row matches when its shorthand is exactly "vak" and both surname and
    firstname are empty strings.
    """
    expected_fields = (("shorthand", "vak"), ("surname", ""), ("firstname", ""))
    # The generator is evaluated lazily, so later fields are only read when
    # all earlier ones matched.
    return all(
        getattr(raw_lecturer, field_name) == wanted
        for field_name, wanted in expected_fields
    )


def get_lecturers_json(modules: list[Lecturer]) -> str:
    """
    Serialize a list of Lecturer objects into a JSON string.

    The list is dumped through a pydantic TypeAdapter and the resulting
    bytes are decoded as UTF-8. No indent is requested, so the layout is
    whatever pydantic produces by default.
    """
    serializer = TypeAdapter(list[Lecturer])
    encoded: bytes = serializer.dump_json(modules)
    return str(encoded, "utf-8")


def parse_lecturers(raw_lecturers: list[RawLecturer]) -> list[Lecturer]:
    """
    Convert raw table rows into a list of unique Lecturer objects.

    Header rows and the "vak" example row are dropped. Of several equal
    lecturers only the first one is kept, so the result preserves the order
    of first appearance.
    """
    unique_lecturers: list[Lecturer] = []

    for raw in raw_lecturers:
        if is_table_header_row(raw) or is_vak_example_row(raw):
            logger.debug("skipping raw lecturer: %s", raw)
            continue

        candidate: Lecturer = Lecturer(
            short=raw.shorthand,
            surname=raw.surname,
            firstname=raw.firstname,
        )

        if candidate not in unique_lecturers:
            unique_lecturers.append(candidate)
            continue

        logger.debug("skipped over duplicate lecturer: %s", candidate)

    return unique_lecturers


def main() -> None:
    """
    Command-line entry point: convert the Lecturer Shorthands PDF to JSON.

    Parses the input path, output path and log level from the command line,
    configures logging, runs extraction and parsing, and writes the
    serialized lecturers to the output file as UTF-8.
    """
    log_level_names = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]

    cli = ArgumentParser(description="Parse Lecturer Shorthands PDF to JSON.")
    cli.add_argument(
        "-i",
        "--input",
        default=LECTURER_SHORTHAND_PDF_PDF_INPUT_FILE,
        help="Path to the input Lecturer Shorthands PDF file",
    )
    cli.add_argument(
        "-o",
        "--output",
        default=LECTURER_SHORTHAND_JSON_OUTPUT_FILE,
        help="Path to the output JSON file",
    )
    # The value is upper-cased before it is checked against the choices, so
    # lower-case level names are accepted as well.
    cli.add_argument(
        "--log-level",
        type=str.upper,
        choices=log_level_names,
        default="INFO",
        help="Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)",
    )
    options = cli.parse_args()

    logging.basicConfig(level=options.log_level)

    # Pipeline: PDF rows -> filtered, de-duplicated lecturers -> JSON text.
    extracted_rows: list[RawLecturer] = extract_rows_from_lecturer_shorthand_pdf(
        options.input
    )
    parsed_lecturers: list[Lecturer] = parse_lecturers(extracted_rows)
    serialized: str = get_lecturers_json(parsed_lecturers)

    with open(options.output, "w", encoding="utf-8") as output_file:
        output_file.write(serialized)


# Run the converter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()