"""Turn raw module cells extracted from a class pdf into class json modules."""

import logging
from typing import List

from pydantic import TypeAdapter

from config import LECTURER_SHORTHAND_SIZE

from .models import (
    RawExtractedModule,
    ClassJsonModule,
    ParsedModuleCellTextData,
    DegreeProgram,
    TeachingType,
    Weekday,
    StartsWithMatch,
)


def get_modules_for_class_json(
    modules: list[RawExtractedModule],
    class_name: str,
    degree_program: DegreeProgram,
    valid_lecturer_shorthands: list[str] | None = None,
) -> list[ClassJsonModule]:
    """
    Parses the Raw Extracted Modules from the class pdf into the format to
    export them to the classes.json file.

    Each raw module's cell text is parsed with
    `parse_module_class_pdf_cell_text`; timing, weekday and page number are
    taken from the raw module itself, and the id is built with `get_id`.

    Args:
        modules: The raw modules extracted from the class pdf.
        class_name: Name of the class the modules belong to.
        degree_program: Degree program of the class; mixed programs are
            resolved per module by `parse_mixed_degree_programs`.
        valid_lecturer_shorthands: Optional list of known lecturer
            shorthands, passed through to the lecturer parsing.

    Returns:
        One ClassJsonModule per input module, in input order.

    Raises:
        RuntimeError: Propagated from the cell text parsing when a cell does
            not have two or three lines or yields an empty module shorthand.
    """
    output_modules: list[ClassJsonModule] = []
    for input_module in modules:
        parsed_data: ParsedModuleCellTextData = parse_module_class_pdf_cell_text(
            input_module.text, class_name, degree_program, valid_lecturer_shorthands
        )
        output_modules.append(
            ClassJsonModule(
                weekday=input_module.weekday,
                module_shorthand=parsed_data.module_shorthand,  # pyright: ignore
                start_seconds=input_module.start_seconds,  # pyright: ignore
                end_seconds=input_module.end_seconds,  # pyright: ignore
                degree_program=parsed_data.degree_program,  # pyright: ignore
                class_name=class_name,  # pyright: ignore
                rooms=parsed_data.rooms,
                pages=[input_module.source_page_number],
                part_of_other_classes=parsed_data.part_of_other_classes,
                teaching_type=parsed_data.teaching_type,
                lecturer_shorthands=parsed_data.lecturer_shortnames,  # pyright: ignore
                id=get_id(
                    class_name,
                    parsed_data.module_shorthand,
                    input_module.weekday,
                    input_module.start_seconds,
                    input_module.end_seconds,
                ),
            )
        )
    return output_modules


def deduplicate_modules(modules: list[ClassJsonModule]) -> list[ClassJsonModule]:
    """
    De-duplicate modules based on their id field.

    The first module seen for an id is kept. The pages of every later module
    with the same id are merged into that first module's `pages`, which is
    replaced by the sorted list of unique page numbers. Note that the kept
    module object is modified in place.

    Returns:
        The kept modules in the order their ids were first seen.
    """
    unique_modules_map: dict[str, ClassJsonModule] = {}
    for module in modules:
        existing_module = unique_modules_map.get(module.id)
        if existing_module is None:
            unique_modules_map[module.id] = module
        else:
            existing_module.pages = sorted(set(existing_module.pages + module.pages))
    return list(unique_modules_map.values())


def get_modules_json(modules: List[ClassJsonModule]) -> str:
    """
    Serializes a list of ClassJsonModule objects into a JSON string.

    Fields are written under their aliases (`by_alias=True`) and the bytes
    produced by the adapter are decoded as UTF-8. No indentation is requested.
    """
    adapter = TypeAdapter(List[ClassJsonModule])
    return adapter.dump_json(modules, by_alias=True).decode("utf-8")


def parse_mixed_degree_programs(
    degree_program: DegreeProgram, module_shorthand: str
) -> DegreeProgram:
    """
    Resolve the mixed BWL/GSW/KOMM degree program for a single module.

    Any other degree program is returned unchanged. For the mixed program the
    module shorthand decides: "bplan" and "lean" map to KONTEXT_BWL, "wisa"
    and "aua" map to KONTEXT_KOMM, everything else maps to KONTEXT_GSW.
    """
    if degree_program != DegreeProgram.MIXED_BWL_GSW_KOMM:
        return degree_program
    if module_shorthand in ["bplan", "lean"]:
        return DegreeProgram.KONTEXT_BWL
    if module_shorthand in ["wisa", "aua"]:
        return DegreeProgram.KONTEXT_KOMM
    return DegreeProgram.KONTEXT_GSW


def parse_module_class_pdf_cell_text(
    text: str,
    class_name: str,
    degree_program: DegreeProgram,
    valid_lecturer_shorthands: list[str] | None = None,
) -> ParsedModuleCellTextData:
    """
    Parse a single class pdf module cell text.

    The text is split on newlines. The first line yields the module
    shorthand, the second line the lecturer shorthands and the optional third
    line the rooms and the teaching type. Without a third line the rooms are
    empty and the teaching type is ON_SITE. `part_of_other_classes` is always
    returned empty here.

    Raises:
        RuntimeError: If the text does not consist of two or three lines, or
            if no module shorthand can be extracted from the first line.
    """
    lines = text.split("\n")
    logging.debug("Parsing module cell text: \n%s", text)
    if len(lines) not in (2, 3):
        raise RuntimeError("Invalid Number of Lines in the cell text.")

    if len(lines) == 3:
        rooms = get_rooms(lines[2])
        teaching_type = get_teaching_type(lines[2])
    else:
        rooms = []
        teaching_type = TeachingType.ON_SITE

    module_shorthand = get_module_shorthand(lines[0], class_name)
    return ParsedModuleCellTextData(
        module_shorthand=module_shorthand,
        degree_program=parse_mixed_degree_programs(degree_program, module_shorthand),
        class_name=class_name,
        rooms=rooms,
        part_of_other_classes=[],
        teaching_type=teaching_type,
        lecturer_shortnames=get_lecturer_shortnames(
            lines[1], valid_lecturer_shorthands
        ),
    )


def get_lecturer_shortnames(
    second_line: str, valid_lecturer_shorthands: list[str] | None = None
) -> list[str]:
    """
    Get the lecturer shorthand based on the second class pdf cell line.
    You can provide a list of valid lecturer shorthands for more accurate
    parsing.

    The line is split on single spaces and every word is examined:

    * Without a list of valid shorthands, a word is accepted when its length
      equals LECTURER_SHORTHAND_SIZE.
    * With a list of valid shorthands, a word is accepted when it is a valid
      shorthand itself, or when it (or it without its last character) is the
      start of exactly one valid shorthand; in the latter cases the matched
      valid shorthand is returned instead of the word.

    Words that cannot be resolved are logged as a warning and skipped.
    """
    lecturer_shorthands: list[str] = []
    words = second_line.split(" ")

    if valid_lecturer_shorthands is None:
        for word in words:
            if len(word) == LECTURER_SHORTHAND_SIZE:
                lecturer_shorthands.append(word)
            else:
                logging.warning("Could not get lecturer shorthand from word: %s", word)
        return lecturer_shorthands

    for word in words:
        if word in valid_lecturer_shorthands:
            lecturer_shorthands.append(word)
            continue
        # Try the word itself first, then the word without its last character.
        for prefix in (word, word[:-1]):
            if not prefix:
                # Every shorthand starts with "", so an empty prefix would be
                # reported as a unique match whenever only one valid
                # shorthand exists. Never resolve a lecturer from nothing.
                continue
            starts_with_match = matches_startswith(prefix, valid_lecturer_shorthands)
            if is_valid_starts_with_match(starts_with_match):
                lecturer_shorthands.append(starts_with_match.shorthand_found)
                break
        else:
            logging.warning("Could not get lecturer shorthand from word: %s", word)
    return lecturer_shorthands


def is_valid_starts_with_match(exact_starts_with_match: StartsWithMatch) -> bool:
    """Return True when the match found a shorthand and was unambiguous."""
    found_shorthand = exact_starts_with_match.shorthand_found != ""
    is_unambiguous = exact_starts_with_match.num_of_matches == 1
    return found_shorthand and is_unambiguous


def matches_startswith(
    word: str, valid_lecturer_shorthands: list[str]
) -> StartsWithMatch:
    """
    Find the valid shorthands that start with the given word.

    Returns:
        A StartsWithMatch holding the last matching shorthand (or "" when
        nothing matched) and the number of matching shorthands. The count
        lets callers make sure the matching is unambiguous.
    """
    matching_shorthands = [
        shorthand
        for shorthand in valid_lecturer_shorthands
        if shorthand.startswith(word)
    ]
    last_match = matching_shorthands[-1] if matching_shorthands else ""
    return StartsWithMatch(last_match, len(matching_shorthands))


def get_module_shorthand(first_line: str, class_name: str) -> str:
    """
    Get the module shorthand based on the first class pdf cell line.

    The shorthand is the first space separated word of the line. When the
    line consists of that single word only, the longest start of
    `class_name` that the word ends with is cut off from the end of the word.

    Raises:
        RuntimeError: If the resulting module shorthand is empty.
    """
    # str.split(" ") always returns at least one element, so words[0] exists.
    words = first_line.split(" ")
    word = words[0]
    if len(words) == 1:
        # Longest candidate first; the empty prefix (length 0) always matches
        # and leaves the word untouched.
        for prefix_length in range(len(class_name), -1, -1):
            class_name_prefix = class_name[:prefix_length]
            if word.endswith(class_name_prefix):
                word = word.removesuffix(class_name_prefix)
                break
    if not word:
        raise RuntimeError("Module shorthand cannot be empty")
    return word


def get_id(
    class_name: str,
    module_shorthand: str,
    weekday: Weekday,
    start_seconds: int,
    end_seconds: int,
) -> str:
    """
    Calculate the json id of a module.

    The id joins the class name, the module shorthand, `weekday.index`, the
    start seconds and the end seconds with "-".
    """
    return (
        f"{class_name}-{module_shorthand}-{weekday.index}-{start_seconds}-{end_seconds}"
    )


def get_teaching_type(third_line: str) -> TeachingType:
    """
    Get the teaching type based on the third class pdf cell line.

    The line counts as ONLINE when it contains "Online", otherwise ON_SITE.
    """
    if "Online" in third_line:
        return TeachingType.ONLINE
    return TeachingType.ON_SITE


def get_rooms(third_line: str) -> list[str]:
    """
    Get the rooms based on the third class pdf cell line.

    A line containing "DSMixe" yields no rooms. Otherwise the line is split
    on single spaces and every non-empty word is returned as a room.
    """
    if "DSMixe" in third_line:
        return []
    # Splitting on " " produces "" for an empty line and for repeated,
    # leading or trailing spaces; those are not rooms.
    return [word for word in third_line.split(" ") if word]