generate_classes_json.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. #!/usr/bin/env python3
  2. import logging
  3. from argparse import ArgumentParser
  4. import json
  5. from pydantic import TypeAdapter
  6. from parse import (
  7. extract_data_from_class_pdf,
  8. get_modules_for_class_json,
  9. get_modules_json,
  10. get_classes,
  11. deduplicate_modules,
  12. ClassPdfExtractionPageData,
  13. ClassJsonModule,
  14. )
  15. from config import CLASS_TIMETABLE_PDF_INPUT_FILE, CLASSES_JSON_OUTPUT_FILE
  16. logger = logging.getLogger("modulplaner-backend")
  17. def get_valid_lecturers(file_path: str) -> list[str]:
  18. """
  19. Reads the lecturers JSON file and extracts a list of valid lecturer shorthands.
  20. """
  21. valid_lecturers: list[str] = []
  22. try:
  23. logger.warning("reading lecturers file: '%s'", file_path)
  24. with open(file_path, "r", encoding="utf-8") as f:
  25. data = json.load(f)
  26. if isinstance(data, list):
  27. for entry in data:
  28. if isinstance(entry, dict) and "short" in entry:
  29. valid_lecturers.append(entry["short"])
  30. logger.info(
  31. "Loaded %d valid lecturers from %s", len(valid_lecturers), file_path
  32. )
  33. except Exception as e:
  34. logger.error("Failed to load valid lecturers from '%s': %s", file_path, e)
  35. return valid_lecturers
  36. def main() -> None:
  37. parser = ArgumentParser(description="Parse Class Timetable PDF to JSON.")
  38. parser.add_argument(
  39. "-l",
  40. "--lecturers",
  41. help="Path to the lecturers.json file (Optional)",
  42. default=None,
  43. )
  44. parser.add_argument(
  45. "-i",
  46. "--input",
  47. help="Path to the input Class Timetable PDF file",
  48. default=CLASS_TIMETABLE_PDF_INPUT_FILE,
  49. )
  50. parser.add_argument(
  51. "-o",
  52. "--output",
  53. help="Path to the output JSON file",
  54. default=CLASSES_JSON_OUTPUT_FILE,
  55. )
  56. parser.add_argument(
  57. "--save-intermediate",
  58. help="Path to save the intermediate extraction data (JSON format) and exit",
  59. default=None,
  60. )
  61. parser.add_argument(
  62. "--load-intermediate",
  63. help="Path to load the intermediate extraction data from (JSON format) and skip extraction",
  64. default=None,
  65. )
  66. parser.add_argument(
  67. "-j",
  68. "--jobs",
  69. help="Number of parallel jobs to use for extraction (default: 1)",
  70. type=int,
  71. default=1,
  72. )
  73. parser.add_argument(
  74. "--log-level",
  75. help="Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)",
  76. default="INFO",
  77. type=str.upper,
  78. choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
  79. )
  80. args = parser.parse_args()
  81. lecturers_file = args.lecturers
  82. logging.basicConfig(level=args.log_level)
  83. valid_lecturer_shorthands: list[str] | None = None
  84. if lecturers_file:
  85. valid_lecturer_shorthands = get_valid_lecturers(lecturers_file)
  86. extraction_data: list[ClassPdfExtractionPageData]
  87. if args.load_intermediate:
  88. logger.info("Loading intermediate data from %s", args.load_intermediate)
  89. with open(args.load_intermediate, "r", encoding="utf-8") as f:
  90. extraction_data = TypeAdapter(
  91. list[ClassPdfExtractionPageData]
  92. ).validate_json(f.read())
  93. else:
  94. extraction_data = extract_data_from_class_pdf(args.input, num_of_jobs=args.jobs)
  95. if args.save_intermediate:
  96. logger.info("Saving intermediate data to %s", args.save_intermediate)
  97. with open(args.save_intermediate, "w", encoding="utf-8") as f:
  98. f.write(
  99. TypeAdapter(list[ClassPdfExtractionPageData])
  100. .dump_json(extraction_data)
  101. .decode("utf-8")
  102. )
  103. return
  104. parsed_modules: list[ClassJsonModule] = [
  105. module
  106. for data in extraction_data
  107. for module in get_modules_for_class_json(
  108. data.raw_extracted_modules,
  109. data.page_metadata.class_name,
  110. data.page_metadata.degree_program,
  111. get_classes(extraction_data),
  112. valid_lecturer_shorthands,
  113. )
  114. ]
  115. parsed_modules = deduplicate_modules(parsed_modules)
  116. json_output: str = get_modules_json(parsed_modules)
  117. with open(args.output, "w", encoding="utf-8") as f:
  118. f.write(json_output)
  119. if __name__ == "__main__":
  120. main()