parse_class_pdf.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. #!/usr/bin/env python3
  2. import logging
  3. from argparse import ArgumentParser
  4. import json
  5. from pydantic import TypeAdapter
  6. from parse import (
  7. extract_data_from_class_pdf,
  8. get_modules_for_class_json,
  9. get_modules_json,
  10. get_classes,
  11. deduplicate_modules,
  12. ClassPdfExtractionPageData,
  13. ClassJsonModule,
  14. )
  15. from config import CLASS_PDF_INPUT_FILE, CLASSES_JSON_OUTPUT_FILE
  16. def get_valid_lecturers(file_path: str) -> list[str]:
  17. """
  18. Reads the lecturers JSON file and extracts a list of valid lecturer shorthands.
  19. """
  20. valid_lecturers: list[str] = []
  21. try:
  22. logging.warning("reading lecturers file: '%s'", file_path)
  23. with open(file_path, "r", encoding="utf-8") as f:
  24. data = json.load(f)
  25. if isinstance(data, list):
  26. for entry in data:
  27. if isinstance(entry, dict) and "short" in entry:
  28. valid_lecturers.append(entry["short"])
  29. logging.info(
  30. "Loaded %d valid lecturers from %s", len(valid_lecturers), file_path
  31. )
  32. except Exception as e:
  33. logging.error("Failed to load valid lecturers from '%s': %s", file_path, e)
  34. return valid_lecturers
  35. def main() -> None:
  36. parser = ArgumentParser(description="Parse Class Timetable PDF to JSON.")
  37. parser.add_argument(
  38. "-l", "--lecturers", help="Path to the lecturers.json file", default=None
  39. )
  40. parser.add_argument(
  41. "-i",
  42. "--input",
  43. help="Path to the input Class Timetable PDF file",
  44. default=CLASS_PDF_INPUT_FILE,
  45. )
  46. parser.add_argument(
  47. "-o",
  48. "--output",
  49. help="Path to the output JSON file",
  50. default=CLASSES_JSON_OUTPUT_FILE,
  51. )
  52. parser.add_argument(
  53. "--save-intermediate",
  54. help="Path to save the intermediate extraction data (JSON format) and exit",
  55. default=None,
  56. )
  57. parser.add_argument(
  58. "--load-intermediate",
  59. help="Path to load the intermediate extraction data from (JSON format) and skip extraction",
  60. default=None,
  61. )
  62. args = parser.parse_args()
  63. lecturers_file = args.lecturers
  64. logging.basicConfig(level=logging.INFO)
  65. valid_lecturer_shorthands: list[str] | None = None
  66. if lecturers_file:
  67. valid_lecturer_shorthands = get_valid_lecturers(lecturers_file)
  68. extraction_data: list[ClassPdfExtractionPageData]
  69. if args.load_intermediate:
  70. logging.info("Loading intermediate data from %s", args.load_intermediate)
  71. with open(args.load_intermediate, "r", encoding="utf-8") as f:
  72. extraction_data = TypeAdapter(
  73. list[ClassPdfExtractionPageData]
  74. ).validate_json(f.read())
  75. else:
  76. extraction_data = extract_data_from_class_pdf(args.input)
  77. if args.save_intermediate:
  78. logging.info("Saving intermediate data to %s", args.save_intermediate)
  79. with open(args.save_intermediate, "w", encoding="utf-8") as f:
  80. f.write(
  81. TypeAdapter(list[ClassPdfExtractionPageData])
  82. .dump_json(extraction_data)
  83. .decode("utf-8")
  84. )
  85. return
  86. parsed_modules: list[ClassJsonModule] = [
  87. module
  88. for data in extraction_data
  89. for module in get_modules_for_class_json(
  90. data.raw_extracted_modules,
  91. data.page_metadata.class_name,
  92. data.page_metadata.degree_program,
  93. get_classes(extraction_data),
  94. valid_lecturer_shorthands,
  95. )
  96. ]
  97. parsed_modules = deduplicate_modules(parsed_modules)
  98. json_output: str = get_modules_json(parsed_modules)
  99. with open(args.output, "w", encoding="utf-8") as f:
  100. f.write(json_output)
  101. if __name__ == "__main__":
  102. main()