extract_lecturer_shorthands_pdf.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. #!/usr/bin/env python3
  2. import logging
  3. from argparse import ArgumentParser
  4. import pdfplumber
  5. from pdfplumber.table import Table
  6. from pydantic import TypeAdapter
  7. from config import (
  8. LECTURER_SHORTHAND_PDF_TABLE_SETTINGS,
  9. LECTURER_SHORTHAND_PDF_PDF_INPUT_FILE,
  10. LECTURER_SHORTHAND_JSON_OUTPUT_FILE,
  11. )
  12. from parse import RawLecturer, Lecturer
  13. logger = logging.getLogger("modulplaner-backend.extract_lecturer_shorthands_pdf")
  14. def extract_rows_from_lecturer_shorthand_pdf(input_file) -> list[RawLecturer]:
  15. lecturers: list[RawLecturer] = []
  16. with pdfplumber.open(input_file) as pdf:
  17. # find the X coordinates of "Nachname" and "Vorname" on the first page
  18. # to use as fixed separators for all rows. This assumes they do not
  19. # deviate their x values on subsequent pages.
  20. first_page = pdf.pages[0]
  21. nachname_rects = first_page.search("Nachname")
  22. vorname_rects = first_page.search("Vorname")
  23. sep_x_1 = 0
  24. sep_x_2 = 0
  25. if nachname_rects and vorname_rects:
  26. # Subtract 2 pixels to ensure the start of the letter is caught
  27. # even if it drifts slightly left.
  28. sep_x_1 = nachname_rects[0]["x0"] - 2
  29. sep_x_2 = vorname_rects[0]["x0"] - 2
  30. logger.debug(
  31. "calculated separators: %d (Nachname), %d (Vorname)", sep_x_1, sep_x_2
  32. )
  33. else:
  34. raise RuntimeError("Could not find headers for separator calculation")
  35. lines_y1: list = []
  36. min_line_y1 = 0
  37. max_line_y1 = 0
  38. for page_index, page in enumerate(pdf.pages):
  39. # Remove top header and bottom footer based on first / last line.
  40. # Assumes the header and footer positions do not go beyond these
  41. # values on subsequent pages.
  42. if page_index == 0:
  43. for line in page.lines:
  44. lines_y1.append(line.get("y1"))
  45. if lines_y1:
  46. min_line_y1 = min(lines_y1)
  47. max_line_y1 = max(lines_y1)
  48. # guard against empty lines list if page has no lines
  49. if not lines_y1:
  50. logger.warning("First page has no lines")
  51. crop_box = (0, 0, page.width, page.height)
  52. else:
  53. crop_box = (0, min_line_y1, page.width, max_line_y1)
  54. cropped_page = page.crop(crop_box)
  55. found_tables: list[Table] = cropped_page.find_tables(
  56. LECTURER_SHORTHAND_PDF_TABLE_SETTINGS
  57. )
  58. if len(found_tables) != 1:
  59. raise RuntimeError(
  60. "Did not find exactly 1 table in the Lecturer Shorthands PDF"
  61. + f" on page {page_index + 1}"
  62. )
  63. table: Table = found_tables[0]
  64. for row_index, row in enumerate(table.rows):
  65. if row is None:
  66. logger.debug("None table row found")
  67. continue
  68. valid_cells = [cell for cell in row.cells if cell is not None]
  69. if not valid_cells:
  70. continue
  71. row_top = valid_cells[0][1]
  72. row_bottom = valid_cells[0][3]
  73. row_left = valid_cells[0][0]
  74. row_right = valid_cells[-1][2]
  75. row_bbox = (row_left, row_top, row_right, row_bottom)
  76. logger.debug("row %d dimensions: %s", row_index, row_bbox)
  77. # column 1: From start of row -> Nachname separator
  78. col1_bbox = (row_left, row_top, sep_x_1, row_bottom)
  79. # column 2: From Nachname separator -> Vorname separator
  80. col2_bbox = (sep_x_1, row_top, sep_x_2, row_bottom)
  81. # column 3: From Vorname separator -> End of row
  82. col3_bbox = (sep_x_2, row_top, row_right, row_bottom)
  83. logger.debug("col 1 bbox: %s", col1_bbox)
  84. logger.debug("col 2 bbox: %s", col2_bbox)
  85. logger.debug("col 3 bbox: %s", col3_bbox)
  86. row_text: str = cropped_page.crop(row_bbox).extract_text()
  87. logger.debug("row text: %s", row_text)
  88. col1_text = cropped_page.crop(col1_bbox).extract_text()
  89. logger.debug("col 1 text: %s", col1_text)
  90. col2_text = cropped_page.crop(col2_bbox).extract_text()
  91. logger.debug("col 2 text: %s", col2_text)
  92. col3_text = cropped_page.crop(col3_bbox).extract_text()
  93. logger.debug("col 3 text: %s", col3_text)
  94. lecturers.append(
  95. RawLecturer(
  96. shorthand=col1_text, firstname=col3_text, surname=col2_text
  97. )
  98. )
  99. return lecturers
  100. def is_table_header_row(raw_lecturer: RawLecturer) -> bool:
  101. return (
  102. raw_lecturer.shorthand == "Name"
  103. and raw_lecturer.surname == "Nachname"
  104. and raw_lecturer.firstname == "Vorname"
  105. )
  106. def is_vak_example_row(raw_lecturer):
  107. return (
  108. raw_lecturer.shorthand == "vak"
  109. and raw_lecturer.surname == ""
  110. and raw_lecturer.firstname == ""
  111. )
  112. def get_lecturers_json(modules: list[Lecturer]) -> str:
  113. """
  114. Serializes a list of Lecturer objects into a formatted JSON string.
  115. """
  116. adapter = TypeAdapter(list[Lecturer])
  117. return adapter.dump_json(modules).decode("utf-8")
  118. def parse_lecturers(raw_lecturers: list[RawLecturer]) -> list[Lecturer]:
  119. lecturers: list[Lecturer] = []
  120. for raw_lecturer in raw_lecturers:
  121. if is_table_header_row(raw_lecturer) or is_vak_example_row(raw_lecturer):
  122. logger.debug("skipping raw lecturer: %s", raw_lecturer)
  123. else:
  124. new_lecturer: Lecturer = Lecturer(
  125. short=raw_lecturer.shorthand,
  126. surname=raw_lecturer.surname,
  127. firstname=raw_lecturer.firstname,
  128. )
  129. if new_lecturer in lecturers:
  130. logger.debug("skipped over duplicate lecturer: %s", new_lecturer)
  131. else:
  132. lecturers.append(new_lecturer)
  133. return lecturers
  134. def main() -> None:
  135. parser = ArgumentParser(description="Parse Lecturer Shorthands PDF to JSON.")
  136. parser.add_argument(
  137. "-i",
  138. "--input",
  139. help="Path to the input Lecturer Shorthands PDF file",
  140. default=LECTURER_SHORTHAND_PDF_PDF_INPUT_FILE,
  141. )
  142. parser.add_argument(
  143. "-o",
  144. "--output",
  145. help="Path to the output JSON file",
  146. default=LECTURER_SHORTHAND_JSON_OUTPUT_FILE,
  147. )
  148. parser.add_argument(
  149. "--log-level",
  150. help="Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)",
  151. default="INFO",
  152. type=str.upper,
  153. choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
  154. )
  155. args = parser.parse_args()
  156. logging.basicConfig(level=args.log_level)
  157. raw_lecturers: list[RawLecturer] = extract_rows_from_lecturer_shorthand_pdf(
  158. args.input
  159. )
  160. lecturers: list[Lecturer] = parse_lecturers(raw_lecturers)
  161. json_output: str = get_lecturers_json(lecturers)
  162. with open(args.output, "w", encoding="utf-8") as f:
  163. f.write(json_output)
  164. if __name__ == "__main__":
  165. main()