#!/usr/bin/env python3
"""Parse the lecturer shorthand PDF table and write the result as JSON."""

import logging
from argparse import ArgumentParser

import pdfplumber
from pdfplumber.table import Table
from pydantic import TypeAdapter

from config import (
    LECTURER_SHORTHAND_PDF_TABLE_SETTINGS,
    LECTURER_SHORTHAND_PDF_PDF_INPUT_FILE,
    LECTURER_SHORTHAND_JSON_OUTPUT_FILE,
)
from parse import RawLecturer, Lecturer


def _find_column_separators(first_page) -> tuple[float, float]:
    """Return the x positions that separate the three table columns.

    The positions are derived from the left edge of the "Nachname" and
    "Vorname" header texts found on ``first_page``.

    Raises:
        RuntimeError: If either header text cannot be found on the page.
    """
    nachname_rects = first_page.search("Nachname")
    vorname_rects = first_page.search("Vorname")
    if not (nachname_rects and vorname_rects):
        raise RuntimeError("Could not find headers for separator calculation")

    # Shift each separator 2 units to the left so the first letter of a cell
    # is still caught even if it drifts slightly left of its header.
    sep_x_1 = nachname_rects[0]["x0"] - 2
    sep_x_2 = vorname_rects[0]["x0"] - 2
    logging.debug(
        "calculated separators: %.2f (Nachname), %.2f (Vorname)", sep_x_1, sep_x_2
    )
    return sep_x_1, sep_x_2


def _extract_raw_lecturer(
    cropped_page, row_index: int, valid_cells: list, sep_x_1: float, sep_x_2: float
) -> RawLecturer:
    """Cut one table row into three columns and return their text.

    ``valid_cells`` are the non-None cell bounding boxes of the row, each an
    ``(x0, top, x1, bottom)`` tuple. The row's vertical extent is taken from
    the first cell, its horizontal extent from the first and last cell.
    """
    row_left, row_top, _, row_bottom = valid_cells[0]
    row_right = valid_cells[-1][2]
    row_bbox = (row_left, row_top, row_right, row_bottom)
    logging.debug("row %d dimensions: %s", row_index, row_bbox)

    # column 1: from start of row -> Nachname separator
    col1_bbox = (row_left, row_top, sep_x_1, row_bottom)
    # column 2: from Nachname separator -> Vorname separator
    col2_bbox = (sep_x_1, row_top, sep_x_2, row_bottom)
    # column 3: from Vorname separator -> end of row
    col3_bbox = (sep_x_2, row_top, row_right, row_bottom)
    logging.debug("col 1 bbox: %s", col1_bbox)
    logging.debug("col 2 bbox: %s", col2_bbox)
    logging.debug("col 3 bbox: %s", col3_bbox)

    # The full-row text is only needed for the debug log, so skip the extra
    # crop + text extraction unless debug logging is actually enabled.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        row_text = cropped_page.crop(row_bbox).extract_text()
        logging.debug("row text: %s", row_text)

    col1_text = cropped_page.crop(col1_bbox).extract_text()
    logging.debug("col 1 text: %s", col1_text)
    col2_text = cropped_page.crop(col2_bbox).extract_text()
    logging.debug("col 2 text: %s", col2_text)
    col3_text = cropped_page.crop(col3_bbox).extract_text()
    logging.debug("col 3 text: %s", col3_text)

    # NOTE(review): column 3 is passed before column 2. This order is kept
    # exactly as it was; confirm it matches the positional field order of
    # RawLecturer (defined in parse, not visible here).
    return RawLecturer(col1_text, col3_text, col2_text)


def extract_rows_from_lecturer_shorthand_pdf(input_file) -> list[RawLecturer]:
    """Extract every table row of the lecturer shorthand PDF.

    Args:
        input_file: Path or file object handed to ``pdfplumber.open``.

    Returns:
        One ``RawLecturer`` per table row with at least one non-None cell,
        in page and row order. Header and example rows are NOT filtered here.

    Raises:
        RuntimeError: If the column headers are missing on the first page, or
            if a page does not contain exactly one table.
    """
    lecturers: list[RawLecturer] = []

    with pdfplumber.open(input_file) as pdf:
        first_page = pdf.pages[0]

        # The column separators are computed once from the first page. This
        # assumes the columns do not shift horizontally on later pages.
        sep_x_1, sep_x_2 = _find_column_separators(first_page)

        # Header and footer are removed by cropping every page to the vertical
        # span covered by the drawn lines of the first page. This assumes the
        # header and footer do not reach into that span on later pages.
        #
        # Page.crop() expects (x0, top, x1, bottom) measured from the TOP of
        # the page, so the "top"/"bottom" properties of the lines are used.
        # ("y0"/"y1" are measured from the bottom of the page and would yield
        # a vertically mirrored crop box.)
        first_page_lines = first_page.lines
        if first_page_lines:
            crop_top = min(line["top"] for line in first_page_lines)
            crop_bottom = max(line["bottom"] for line in first_page_lines)
        else:
            # Without lines there is nothing to derive the crop from, so the
            # full page is used for every page.
            logging.warning("First page has no lines")

        for page_index, page in enumerate(pdf.pages):
            if first_page_lines:
                crop_box = (0, crop_top, page.width, crop_bottom)
            else:
                crop_box = (0, 0, page.width, page.height)
            cropped_page = page.crop(crop_box)

            found_tables: list[Table] = cropped_page.find_tables(
                LECTURER_SHORTHAND_PDF_TABLE_SETTINGS
            )
            if len(found_tables) != 1:
                raise RuntimeError(
                    "Did not find exactly 1 table in the lecturer shorthands pdf"
                    + f" on page {page_index + 1}"
                )
            table: Table = found_tables[0]

            for row_index, row in enumerate(table.rows):
                if row is None:
                    logging.debug("None table row found")
                    continue
                valid_cells = [cell for cell in row.cells if cell is not None]
                if not valid_cells:
                    continue
                lecturers.append(
                    _extract_raw_lecturer(
                        cropped_page, row_index, valid_cells, sep_x_1, sep_x_2
                    )
                )

    return lecturers


def is_table_header_row(raw_lecturer: RawLecturer) -> bool:
    """Return True if the row holds the column titles of the table."""
    return (
        raw_lecturer.shorthand == "Name"
        and raw_lecturer.surname == "Nachname"
        and raw_lecturer.firstname == "Vorname"
    )


def is_vak_example_row(raw_lecturer: RawLecturer) -> bool:
    """Return True if the row is the "vak" row with empty name columns."""
    return (
        raw_lecturer.shorthand == "vak"
        and raw_lecturer.surname == ""
        and raw_lecturer.firstname == ""
    )


def get_lecturers_json(modules: list[Lecturer]) -> str:
    """Serialize a list of Lecturer objects into a JSON string.

    The parameter is named ``modules`` for backward compatibility; it holds
    lecturers. ``dump_json`` is called without ``indent``, so no indentation
    is requested for the output.
    """
    adapter = TypeAdapter(list[Lecturer])
    return adapter.dump_json(modules).decode("utf-8")


def parse_lecturers(raw_lecturers: list[RawLecturer]) -> list[Lecturer]:
    """Convert raw table rows into unique ``Lecturer`` objects.

    Table header rows and the "vak" example row are skipped. A lecturer that
    compares equal to one already collected is skipped as a duplicate; the
    order of first occurrence is preserved.
    """
    lecturers: list[Lecturer] = []
    for raw_lecturer in raw_lecturers:
        if is_table_header_row(raw_lecturer) or is_vak_example_row(raw_lecturer):
            logging.debug("skipping raw lecturer: %s", raw_lecturer)
            continue

        new_lecturer: Lecturer = Lecturer(
            short=raw_lecturer.shorthand,
            surname=raw_lecturer.surname,
            firstname=raw_lecturer.firstname,
        )
        # Equality-based list lookup on purpose: whether Lecturer is hashable
        # cannot be told from this file, so no set is used here.
        if new_lecturer in lecturers:
            logging.debug("skipped over duplicate lecturer: %s", new_lecturer)
            continue
        lecturers.append(new_lecturer)
    return lecturers


def main() -> None:
    """Command line entry point: read the PDF, parse it, write the JSON file."""
    parser = ArgumentParser(description="Parse lecturer shorthand PDF to JSON.")
    parser.add_argument(
        "-i",
        "--input",
        help="Path to the input PDF file",
        default=LECTURER_SHORTHAND_PDF_PDF_INPUT_FILE,
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Path to the output JSON file",
        default=LECTURER_SHORTHAND_JSON_OUTPUT_FILE,
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    raw_lecturers: list[RawLecturer] = extract_rows_from_lecturer_shorthand_pdf(
        args.input
    )
    lecturers: list[Lecturer] = parse_lecturers(raw_lecturers)
    json_output: str = get_lecturers_json(lecturers)

    with open(args.output, "w", encoding="utf-8") as f:
        f.write(json_output)


if __name__ == "__main__":
    main()