parse_class_pdf.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. #!/usr/bin/env python3
  2. import logging
  3. import argparse
  4. import json
  5. from parse import (
  6. extract_data_from_class_pdf,
  7. get_modules_for_class_json,
  8. get_modules_json,
  9. deduplicate_modules,
  10. ClassPdfExtractionPageData,
  11. ClassJsonModule,
  12. )
  13. from config import CLASS_PDF_INPUT_FILE, CLASSES_JSON_OUTPUT_FILE
  14. def get_valid_lecturers(file_path: str) -> list[str]:
  15. """
  16. Reads the lecturers JSON file and extracts a list of valid lecturer shorthands.
  17. """
  18. valid_lecturers: list[str] = []
  19. try:
  20. with open(file_path, "r", encoding="utf-8") as f:
  21. data = json.load(f)
  22. if isinstance(data, list):
  23. for entry in data:
  24. if isinstance(entry, dict) and "short" in entry:
  25. valid_lecturers.append(entry["short"])
  26. logging.info(
  27. "Loaded %d valid lecturers from %s", len(valid_lecturers), file_path
  28. )
  29. except Exception as e:
  30. logging.error("Failed to load valid lecturers from '%s': %s", file_path, e)
  31. return valid_lecturers
  32. def main() -> None:
  33. parser = argparse.ArgumentParser(description="Parse class PDF to JSON.")
  34. parser.add_argument(
  35. "-l", "--lecturers", help="Path to the lecturers.json file", default=None
  36. )
  37. parser.add_argument(
  38. "-i", "--input", help="Path to the input PDF file", default=CLASS_PDF_INPUT_FILE
  39. )
  40. parser.add_argument(
  41. "-o",
  42. "--output",
  43. help="Path to the output JSON file",
  44. default=CLASSES_JSON_OUTPUT_FILE,
  45. )
  46. parser.add_argument(
  47. "lecturers_pos",
  48. nargs="?",
  49. help="Path to the lecturers.json file (positional)",
  50. default=None,
  51. )
  52. args = parser.parse_args()
  53. lecturers_file = args.lecturers or args.lecturers_pos
  54. logging.basicConfig(level=logging.DEBUG)
  55. valid_lecturer_shorthands: list[str] | None = None
  56. if lecturers_file:
  57. valid_lecturer_shorthands = get_valid_lecturers(lecturers_file)
  58. extraction_data: list[ClassPdfExtractionPageData] = extract_data_from_class_pdf(
  59. args.input
  60. )
  61. parsed_modules: list[ClassJsonModule] = [
  62. module
  63. for data in extraction_data
  64. for module in get_modules_for_class_json(
  65. data.raw_extracted_modules,
  66. data.page_metadata.class_name,
  67. data.page_metadata.degree_program,
  68. valid_lecturer_shorthands,
  69. )
  70. ]
  71. parsed_modules = deduplicate_modules(parsed_modules)
  72. json_output: str = get_modules_json(parsed_modules)
  73. with open(args.output, "w", encoding="utf-8") as f:
  74. f.write(json_output)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()