#!/usr/bin/env python3 import logging import argparse import json from parse import ( extract_data_from_class_pdf, get_modules_for_class_json, get_modules_json, deduplicate_modules, ClassPdfExtractionPageData, ClassJsonModule, ) from config import CLASS_PDF_INPUT_FILE, CLASSES_JSON_OUTPUT_FILE def get_valid_lecturers(file_path: str) -> list[str]: """ Reads the lecturers JSON file and extracts a list of valid lecturer shorthands. """ valid_lecturers: list[str] = [] try: with open(file_path, "r", encoding="utf-8") as f: data = json.load(f) if isinstance(data, list): for entry in data: if isinstance(entry, dict) and "short" in entry: valid_lecturers.append(entry["short"]) logging.info( "Loaded %d valid lecturers from %s", len(valid_lecturers), file_path ) except Exception as e: logging.error("Failed to load valid lecturers from '%s': %s", file_path, e) return valid_lecturers def main() -> None: parser = argparse.ArgumentParser(description="Parse class PDF to JSON.") parser.add_argument( "-l", "--lecturers", help="Path to the lecturers.json file", default=None ) parser.add_argument( "-i", "--input", help="Path to the input PDF file", default=CLASS_PDF_INPUT_FILE ) parser.add_argument( "-o", "--output", help="Path to the output JSON file", default=CLASSES_JSON_OUTPUT_FILE, ) parser.add_argument( "lecturers_pos", nargs="?", help="Path to the lecturers.json file (positional)", default=None, ) args = parser.parse_args() lecturers_file = args.lecturers or args.lecturers_pos logging.basicConfig(level=logging.DEBUG) valid_lecturer_shorthands: list[str] | None = None if lecturers_file: valid_lecturer_shorthands = get_valid_lecturers(lecturers_file) extraction_data: list[ClassPdfExtractionPageData] = extract_data_from_class_pdf( args.input ) parsed_modules: list[ClassJsonModule] = [ module for data in extraction_data for module in get_modules_for_class_json( data.raw_extracted_modules, data.page_metadata.class_name, data.page_metadata.degree_program, valid_lecturer_shorthands, ) ] parsed_modules = deduplicate_modules(parsed_modules) json_output: str = get_modules_json(parsed_modules) with open(args.output, "w", encoding="utf-8") as f: f.write(json_output) if __name__ == "__main__": main()