#!/usr/bin/env python3
# parse_class_pdf.py - command-line script that parses a class PDF to JSON.
import json
import logging
import pickle
from argparse import ArgumentParser

from config import CLASS_PDF_INPUT_FILE, CLASSES_JSON_OUTPUT_FILE
from parse import (
    extract_data_from_class_pdf,
    get_modules_for_class_json,
    get_modules_json,
    deduplicate_modules,
    ClassPdfExtractionPageData,
    ClassJsonModule,
)


  15. def get_valid_lecturers(file_path: str) -> list[str]:
  16. """
  17. Reads the lecturers JSON file and extracts a list of valid lecturer shorthands.
  18. """
  19. valid_lecturers: list[str] = []
  20. try:
  21. print(f"READING: '{file_path}'")
  22. with open(file_path, "r", encoding="utf-8") as f:
  23. data = json.load(f)
  24. if isinstance(data, list):
  25. for entry in data:
  26. if isinstance(entry, dict) and "short" in entry:
  27. valid_lecturers.append(entry["short"])
  28. logging.info(
  29. "Loaded %d valid lecturers from %s", len(valid_lecturers), file_path
  30. )
  31. except Exception as e:
  32. logging.error("Failed to load valid lecturers from '%s': %s", file_path, e)
  33. return valid_lecturers
  34. def main() -> None:
  35. parser = ArgumentParser(description="Parse class PDF to JSON.")
  36. parser.add_argument(
  37. "-l", "--lecturers", help="Path to the lecturers.json file", default=None
  38. )
  39. parser.add_argument(
  40. "-i", "--input", help="Path to the input PDF file", default=CLASS_PDF_INPUT_FILE
  41. )
  42. parser.add_argument(
  43. "-o",
  44. "--output",
  45. help="Path to the output JSON file",
  46. default=CLASSES_JSON_OUTPUT_FILE,
  47. )
  48. parser.add_argument(
  49. "--save-intermediate",
  50. help="Path to save the intermediate extraction data (pickle format) and exit",
  51. default=None,
  52. )
  53. parser.add_argument(
  54. "--load-intermediate",
  55. help="Path to load the intermediate extraction data from (pickle format) and skip extraction",
  56. default=None,
  57. )
  58. args = parser.parse_args()
  59. lecturers_file = args.lecturers
  60. logging.basicConfig(level=logging.DEBUG)
  61. valid_lecturer_shorthands: list[str] | None = None
  62. if lecturers_file:
  63. valid_lecturer_shorthands = get_valid_lecturers(lecturers_file)
  64. extraction_data: list[ClassPdfExtractionPageData]
  65. if args.load_intermediate:
  66. logging.info("Loading intermediate data from %s", args.load_intermediate)
  67. with open(args.load_intermediate, "rb") as f:
  68. extraction_data = pickle.load(f)
  69. else:
  70. extraction_data = extract_data_from_class_pdf(args.input)
  71. if args.save_intermediate:
  72. logging.info("Saving intermediate data to %s", args.save_intermediate)
  73. with open(args.save_intermediate, "wb") as f:
  74. pickle.dump(extraction_data, f)
  75. return
  76. parsed_modules: list[ClassJsonModule] = [
  77. module
  78. for data in extraction_data
  79. for module in get_modules_for_class_json(
  80. data.raw_extracted_modules,
  81. data.page_metadata.class_name,
  82. data.page_metadata.degree_program,
  83. valid_lecturer_shorthands,
  84. )
  85. ]
  86. parsed_modules = deduplicate_modules(parsed_modules)
  87. json_output: str = get_modules_json(parsed_modules)
  88. with open(args.output, "w", encoding="utf-8") as f:
  89. f.write(json_output)
  90. if __name__ == "__main__":
  91. main()