| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- #!/usr/bin/env python3
- import logging
- import argparse
- import json
- from parse import (
- extract_data_from_class_pdf,
- get_modules_for_class_json,
- get_modules_json,
- deduplicate_modules,
- ClassPdfExtractionPageData,
- ClassJsonModule,
- )
- from config import CLASS_PDF_INPUT_FILE, CLASSES_JSON_OUTPUT_FILE
def get_valid_lecturers(file_path: str) -> list[str]:
    """Return the lecturer shorthands listed in a lecturers JSON file.

    The file is expected to contain a JSON array of objects. For every object
    that has a ``"short"`` key holding a string, that string is collected.
    Array items that are not objects, lack the key, or hold a non-string
    value are skipped so the result really is a ``list[str]``.

    Loading is best-effort: if the file cannot be opened, decoded or parsed,
    the problem is logged and an empty list is returned instead of raising.
    A top-level JSON value that is not an array also yields an empty list.

    Args:
        file_path: Path to the lecturers JSON file (read as UTF-8).

    Returns:
        The collected shorthands in file order; empty on any load failure.
    """
    # Keep the try body to the statements that can fail for I/O or parsing
    # reasons, so unrelated programming errors are not swallowed.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        logging.error("Failed to load valid lecturers from '%s': %s", file_path, e)
        return []

    if not isinstance(data, list):
        logging.warning(
            "Expected a JSON list of lecturers in '%s', got %s",
            file_path,
            type(data).__name__,
        )
        return []

    valid_lecturers: list[str] = [
        entry["short"]
        for entry in data
        if isinstance(entry, dict) and isinstance(entry.get("short"), str)
    ]
    logging.info(
        "Loaded %d valid lecturers from %s", len(valid_lecturers), file_path
    )
    return valid_lecturers
def main() -> None:
    """Command-line entry point: turn the class PDF into a modules JSON file.

    Command-line options:
        -l / --lecturers   Path to the lecturers.json file (optional).
        -i / --input       Input PDF path (defaults to CLASS_PDF_INPUT_FILE).
        -o / --output      Output JSON path (defaults to CLASSES_JSON_OUTPUT_FILE).
        lecturers_pos      Optional positional alternative to --lecturers.

    The input PDF is extracted page by page, each page's raw modules are
    converted via ``get_modules_for_class_json``, the combined list is passed
    through ``deduplicate_modules``, and the result of ``get_modules_json`` is
    written to the output path as UTF-8 text.
    """
    cli = argparse.ArgumentParser(description="Parse class PDF to JSON.")
    cli.add_argument(
        "-l",
        "--lecturers",
        default=None,
        help="Path to the lecturers.json file",
    )
    cli.add_argument(
        "-i",
        "--input",
        default=CLASS_PDF_INPUT_FILE,
        help="Path to the input PDF file",
    )
    cli.add_argument(
        "-o",
        "--output",
        default=CLASSES_JSON_OUTPUT_FILE,
        help="Path to the output JSON file",
    )
    cli.add_argument(
        "lecturers_pos",
        nargs="?",
        default=None,
        help="Path to the lecturers.json file (positional)",
    )
    options = cli.parse_args()

    # The -l/--lecturers flag takes precedence over the positional form.
    lecturers_path = options.lecturers or options.lecturers_pos

    logging.basicConfig(level=logging.DEBUG)

    # Stays None when no lecturers file was given on the command line.
    known_shorthands: list[str] | None = (
        get_valid_lecturers(lecturers_path) if lecturers_path else None
    )

    pages: list[ClassPdfExtractionPageData] = extract_data_from_class_pdf(
        options.input
    )

    # Flatten the per-page module lists into one list, in page order.
    collected: list[ClassJsonModule] = []
    for page in pages:
        collected.extend(
            get_modules_for_class_json(
                page.raw_extracted_modules,
                page.page_metadata.class_name,
                page.page_metadata.degree_program,
                known_shorthands,
            )
        )

    unique_modules = deduplicate_modules(collected)
    serialized: str = get_modules_json(unique_modules)

    with open(options.output, "w", encoding="utf-8") as out_file:
        out_file.write(serialized)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|