extract_lecturer_shorthands_pdf.py 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. #!/usr/bin/env python3
  2. import logging
  3. from argparse import ArgumentParser
  4. import pdfplumber
  5. from pdfplumber.table import Table
  6. from pydantic import TypeAdapter
  7. from config import (
  8. LECTURER_SHORTHAND_PDF_TABLE_SETTINGS,
  9. LECTURER_SHORTHAND_PDF_PDF_INPUT_FILE,
  10. LECTURER_SHORTHAND_JSON_OUTPUT_FILE,
  11. )
  12. from parse import RawLecturer, Lecturer
  13. def extract_rows_from_lecturer_shorthand_pdf(input_file) -> list[RawLecturer]:
  14. lecturers: list[RawLecturer] = []
  15. with pdfplumber.open(input_file) as pdf:
  16. # find the X coordinates of "Nachname" and "Vorname" on the first page
  17. # to use as fixed separators for all rows. This assumes they do not
  18. # deviate their x values on subsequent pages.
  19. first_page = pdf.pages[0]
  20. nachname_rects = first_page.search("Nachname")
  21. vorname_rects = first_page.search("Vorname")
  22. sep_x_1 = 0
  23. sep_x_2 = 0
  24. if nachname_rects and vorname_rects:
  25. # Subtract 2 pixels to ensure the start of the letter is caught
  26. # even if it drifts slightly left.
  27. sep_x_1 = nachname_rects[0]["x0"] - 2
  28. sep_x_2 = vorname_rects[0]["x0"] - 2
  29. logging.debug(
  30. "calculated separators: %d (Nachname), %d (Vorname)", sep_x_1, sep_x_2
  31. )
  32. else:
  33. raise RuntimeError("Could not find headers for separator calculation")
  34. lines_y1: list = []
  35. min_line_y1 = 0
  36. max_line_y1 = 0
  37. for page_index, page in enumerate(pdf.pages):
  38. # Remove top header and bottom footer based on first / last line.
  39. # Assumes the header and footer positions do not go beyond these
  40. # values on subsequent pages.
  41. if page_index == 0:
  42. for line in page.lines:
  43. lines_y1.append(line.get("y1"))
  44. if lines_y1:
  45. min_line_y1 = min(lines_y1)
  46. max_line_y1 = max(lines_y1)
  47. # guard against empty lines list if page has no lines
  48. if not lines_y1:
  49. logging.warning("First page has no lines")
  50. crop_box = (0, 0, page.width, page.height)
  51. else:
  52. crop_box = (0, min_line_y1, page.width, max_line_y1)
  53. cropped_page = page.crop(crop_box)
  54. found_tables: list[Table] = cropped_page.find_tables(
  55. LECTURER_SHORTHAND_PDF_TABLE_SETTINGS
  56. )
  57. if len(found_tables) != 1:
  58. raise RuntimeError(
  59. "Did not find exactly 1 table in the lecuturer shorthands pdf"
  60. + f" on page {page_index + 1}"
  61. )
  62. table: Table = found_tables[0]
  63. for row_index, row in enumerate(table.rows):
  64. if row is None:
  65. logging.debug("None table row found")
  66. continue
  67. valid_cells = [cell for cell in row.cells if cell is not None]
  68. if not valid_cells:
  69. continue
  70. row_top = valid_cells[0][1]
  71. row_bottom = valid_cells[0][3]
  72. row_left = valid_cells[0][0]
  73. row_right = valid_cells[-1][2]
  74. row_bbox = (row_left, row_top, row_right, row_bottom)
  75. logging.debug("row %d dimensions: %s", row_index, row_bbox)
  76. # column 1: From start of row -> Nachname separator
  77. col1_bbox = (row_left, row_top, sep_x_1, row_bottom)
  78. # column 2: From Nachname separator -> Vorname separator
  79. col2_bbox = (sep_x_1, row_top, sep_x_2, row_bottom)
  80. # column 3: From Vorname separator -> End of row
  81. col3_bbox = (sep_x_2, row_top, row_right, row_bottom)
  82. logging.debug("col 1 bbox: %s", col1_bbox)
  83. logging.debug("col 2 bbox: %s", col2_bbox)
  84. logging.debug("col 3 bbox: %s", col3_bbox)
  85. row_text: str = cropped_page.crop(row_bbox).extract_text()
  86. logging.debug("row text: %s", row_text)
  87. col1_text = cropped_page.crop(col1_bbox).extract_text()
  88. logging.debug("col 1 text: %s", col1_text)
  89. col2_text = cropped_page.crop(col2_bbox).extract_text()
  90. logging.debug("col 2 text: %s", col2_text)
  91. col3_text = cropped_page.crop(col3_bbox).extract_text()
  92. logging.debug("col 3 text: %s", col3_text)
  93. lecturers.append(RawLecturer(col1_text, col3_text, col2_text))
  94. return lecturers
  95. def is_table_header_row(raw_lecturer: RawLecturer) -> bool:
  96. return (
  97. raw_lecturer.shorthand == "Name"
  98. and raw_lecturer.surname == "Nachname"
  99. and raw_lecturer.firstname == "Vorname"
  100. )
  101. def is_vak_example_row(raw_lecturer):
  102. return (
  103. raw_lecturer.shorthand == "vak"
  104. and raw_lecturer.surname == ""
  105. and raw_lecturer.firstname == ""
  106. )
  107. def get_lecturers_json(modules: list[Lecturer]) -> str:
  108. """
  109. Serializes a list of Lecturer objects into a formatted JSON string.
  110. """
  111. adapter = TypeAdapter(list[Lecturer])
  112. return adapter.dump_json(modules).decode("utf-8")
  113. def parse_lecturers(raw_lecturers: list[RawLecturer]) -> list[Lecturer]:
  114. lecturers: list[Lecturer] = []
  115. for raw_lecturer in raw_lecturers:
  116. if is_table_header_row(raw_lecturer) or is_vak_example_row(raw_lecturer):
  117. logging.debug("skipping raw lecturer: %s", raw_lecturer)
  118. else:
  119. new_lecturer: Lecturer = Lecturer(
  120. short=raw_lecturer.shorthand,
  121. surname=raw_lecturer.surname,
  122. firstname=raw_lecturer.firstname,
  123. )
  124. if new_lecturer in lecturers:
  125. logging.debug("skipped over duplicate lecturer: %s", new_lecturer)
  126. else:
  127. lecturers.append(new_lecturer)
  128. return lecturers
  129. def main() -> None:
  130. parser = ArgumentParser(description="Parse lecturer shorthand PDF to JSON.")
  131. parser.add_argument(
  132. "-i",
  133. "--input",
  134. help="Path to the input PDF file",
  135. default=LECTURER_SHORTHAND_PDF_PDF_INPUT_FILE,
  136. )
  137. parser.add_argument(
  138. "-o",
  139. "--output",
  140. help="Path to the output JSON file",
  141. default=LECTURER_SHORTHAND_JSON_OUTPUT_FILE,
  142. )
  143. args = parser.parse_args()
  144. logging.basicConfig(level=logging.INFO)
  145. raw_lecturers: list[RawLecturer] = extract_rows_from_lecturer_shorthand_pdf(
  146. args.input
  147. )
  148. lecturers: list[Lecturer] = parse_lecturers(raw_lecturers)
  149. json_output: str = get_lecturers_json(lecturers)
  150. with open(args.output, "w", encoding="utf-8") as f:
  151. f.write(json_output)
  152. if __name__ == "__main__":
  153. main()