| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236 |
- from typing import List
- import logging
- from pydantic import TypeAdapter
- from config import LECTURER_SHORTHAND_SIZE
- from .models import (
- RawExtractedModule,
- ClassJsonModule,
- ParsedModuleCellTextData,
- DegreeProgram,
- TeachingType,
- Weekday,
- StartsWithMatch,
- )
def get_modules_for_class_json(
    modules: list[RawExtractedModule],
    class_name: str,
    degree_program: DegreeProgram,
    valid_lecturer_shorthands: list[str] | None = None,
) -> list[ClassJsonModule]:
    """
    Convert raw modules extracted from a class pdf into classes.json entries.

    The cell text of every raw module is parsed with
    ``parse_module_class_pdf_cell_text``. Weekday, start/end seconds and the
    source page number are taken over from the raw module itself.

    Args:
        modules: Raw modules extracted from the class pdf.
        class_name: Name of the class the modules belong to.
        degree_program: Degree program handed on to the cell text parser.
        valid_lecturer_shorthands: Optional list of known lecturer shorthands,
            handed on to the cell text parser.

    Returns:
        One ``ClassJsonModule`` per raw module, in input order.
    """

    def to_class_json_module(raw_module):
        # Everything that has to be derived from the free-form cell text.
        cell = parse_module_class_pdf_cell_text(
            raw_module.text, class_name, degree_program, valid_lecturer_shorthands
        )
        module_id = get_id(
            class_name,
            cell.module_shorthand,
            raw_module.weekday,
            raw_module.start_seconds,
            raw_module.end_seconds,
        )
        return ClassJsonModule(
            weekday=raw_module.weekday,
            module_shorthand=cell.module_shorthand,  # pyright: ignore
            start_seconds=raw_module.start_seconds,  # pyright: ignore
            end_seconds=raw_module.end_seconds,  # pyright: ignore
            degree_program=cell.degree_program,  # pyright: ignore
            class_name=class_name,  # pyright: ignore
            rooms=cell.rooms,
            # A freshly converted module is only known from a single page.
            pages=[raw_module.source_page_number],
            part_of_other_classes=cell.part_of_other_classes,
            teaching_type=cell.teaching_type,
            lecturer_shorthands=cell.lecturer_shortnames,  # pyright: ignore
            id=module_id,
        )

    return [to_class_json_module(raw_module) for raw_module in modules]
def deduplicate_modules(modules: list[ClassJsonModule]) -> list[ClassJsonModule]:
    """
    De-duplicate modules based on their ``id`` field.

    The first module seen for an id is kept. The pages of every later module
    with the same id are merged into the kept module's ``pages`` (unique and
    sorted), so the kept module object is modified in place.

    Returns:
        The kept modules, ordered by first occurrence of their id.
    """
    kept_by_id: dict[str, ClassJsonModule] = {}
    for candidate in modules:
        if candidate.id not in kept_by_id:
            kept_by_id[candidate.id] = candidate
            continue
        # Duplicate id: only its page numbers are carried over.
        kept = kept_by_id[candidate.id]
        kept.pages = sorted(set(kept.pages + candidate.pages))
    return list(kept_by_id.values())
def get_modules_json(modules: List[ClassJsonModule]) -> str:
    """
    Serialize a list of ClassJsonModule objects into a JSON string.

    The list is dumped through a pydantic ``TypeAdapter`` using field aliases
    and the resulting bytes are decoded as UTF-8.
    """
    encoded: bytes = TypeAdapter(List[ClassJsonModule]).dump_json(
        modules, by_alias=True
    )
    return encoded.decode("utf-8")
def parse_mixed_degree_programs(
    degree_program: DegreeProgram, module_shorthand: str
) -> DegreeProgram:
    """
    Resolve the mixed BWL/GSW/KOMM degree program to a concrete program.

    Any other degree program is returned unchanged. For the mixed program the
    module shorthand decides: "bplan" and "lean" map to ``KONTEXT_BWL``,
    "wisa" and "aua" map to ``KONTEXT_KOMM``, and every other shorthand maps
    to ``KONTEXT_GSW``.
    """
    if degree_program != DegreeProgram.MIXED_BWL_GSW_KOMM:
        return degree_program
    if module_shorthand in ("bplan", "lean"):
        return DegreeProgram.KONTEXT_BWL
    if module_shorthand in ("wisa", "aua"):
        return DegreeProgram.KONTEXT_KOMM
    return DegreeProgram.KONTEXT_GSW
def parse_module_class_pdf_cell_text(
    text: str,
    class_name: str,
    degree_program: DegreeProgram,
    valid_lecturer_shorthands: list[str] | None = None,
) -> ParsedModuleCellTextData:
    """
    Parse the text of a single module cell of a class pdf.

    The text must consist of two or three newline-separated lines:
    the first line yields the module shorthand, the second line the lecturer
    shorthands and the optional third line the rooms and the teaching type.
    Cells without a third line get no rooms and count as on-site teaching.

    Raises:
        RuntimeError: If the text does not have exactly two or three lines.
    """
    lines = text.split("\n")
    logging.debug("Parsing module cell text: \n%s", text)
    if len(lines) not in (2, 3):
        raise RuntimeError("Invalid Number of Lines in the cell text.")

    module_line, lecturer_line, *location_lines = lines
    if location_lines:
        location_line = location_lines[0]
        rooms = get_rooms(location_line)
        teaching_type = get_teaching_type(location_line)
    else:
        rooms = []
        teaching_type = TeachingType.ON_SITE

    module_shorthand = get_module_shorthand(module_line, class_name)
    # A mixed degree program can only be resolved once the module is known.
    resolved_degree_program = parse_mixed_degree_programs(
        degree_program, module_shorthand
    )
    lecturer_shortnames = get_lecturer_shortnames(
        lecturer_line, valid_lecturer_shorthands
    )
    return ParsedModuleCellTextData(
        module_shorthand=module_shorthand,
        degree_program=resolved_degree_program,
        class_name=class_name,
        rooms=rooms,
        part_of_other_classes=[],
        teaching_type=teaching_type,
        lecturer_shortnames=lecturer_shortnames,
    )
- def get_lecturer_shortnames(
- second_line: str, valid_lecturer_shorthands: list[str] | None = None
- ) -> list[str]:
- """
- Get the lecturer shorthand based on the second class pdf cell line.
- You can provide a list of valid lecturer shorthands for more accurate parsing.
- """
- lecturer_shorthands: list[str] = []
- words = second_line.split(" ")
- if valid_lecturer_shorthands is None:
- for word in words:
- if len(word) == LECTURER_SHORTHAND_SIZE:
- lecturer_shorthands.append(word)
- else:
- logging.warning("Could not get lecturer shorthand from word: %s", word)
- else:
- for word in words:
- exact_starts_with_match = matches_startswith(
- word, valid_lecturer_shorthands
- )
- minus_last_char_starts_with_match = matches_startswith(
- word[:-1], valid_lecturer_shorthands
- )
- if word in valid_lecturer_shorthands:
- lecturer_shorthands.append(word)
- elif is_valid_starts_with_match(exact_starts_with_match):
- lecturer_shorthands.append(exact_starts_with_match.shorthand_found)
- elif is_valid_starts_with_match(minus_last_char_starts_with_match):
- lecturer_shorthands.append(
- minus_last_char_starts_with_match.shorthand_found
- )
- else:
- logging.warning("Could not get lecturer shorthand from word: %s", word)
- return lecturer_shorthands
def is_valid_starts_with_match(exact_starts_with_match: StartsWithMatch) -> bool:
    """
    Tell whether a prefix match result is usable.

    A match is valid when a shorthand was found (non-empty string) and
    exactly one shorthand matched.
    """
    if exact_starts_with_match.shorthand_found == "":
        return False
    return exact_starts_with_match.num_of_matches == 1
def matches_startswith(
    word: str, valid_lecturer_shorthands: list[str]
) -> StartsWithMatch:
    """
    Find the valid lecturer shorthands that start with ``word``.

    Returns:
        A ``StartsWithMatch`` holding the last matching shorthand in list
        order (an empty string if nothing matched) and the number of
        matches. The count lets callers check that the match is unambiguous.
    """
    candidates = [
        shorthand
        for shorthand in valid_lecturer_shorthands
        if shorthand.startswith(word)
    ]
    last_candidate = candidates[-1] if candidates else ""
    return StartsWithMatch(last_candidate, len(candidates))
def get_module_shorthand(first_line: str, class_name: str) -> str:
    """
    Get the module shorthand based on the first class pdf cell line.

    The shorthand is the first whitespace-separated word of the line. If the
    line consists of a single word only, the longest prefix of ``class_name``
    that the word ends with is removed from the end of the word.

    Args:
        first_line: First line of the module cell text.
        class_name: Name of the class the cell belongs to.

    Returns:
        The non-empty module shorthand.

    Raises:
        RuntimeError: If the line contains no word at all or the remaining
            shorthand is empty.
    """
    # split() without separator ignores leading, trailing and repeated
    # whitespace and returns [] for a blank line, so the check below is
    # reachable and surrounding whitespace cannot distort the word count.
    words = first_line.split()
    if not words:
        raise RuntimeError("Cannot extract module shorthand")

    word = words[0]
    if len(words) == 1:
        # Try the longest class name prefix first.
        for prefix_length in range(len(class_name), 0, -1):
            if word.endswith(class_name[:prefix_length]):
                word = word[:-prefix_length]
                break

    if not word:
        raise RuntimeError("Module shorthand cannot be empty")
    return word
def get_id(
    class_name: str,
    module_shorthand: str,
    weekday: Weekday,
    start_seconds: int,
    end_seconds: int,
) -> str:
    """
    Calculate the json id of a module.

    The id joins class name, module shorthand, the weekday's ``index`` and
    the start and end seconds with hyphens.
    """
    time_slot = f"{weekday.index}-{start_seconds}-{end_seconds}"
    return f"{class_name}-{module_shorthand}-{time_slot}"
def get_teaching_type(third_line: str) -> TeachingType:
    """
    Get the teaching type based on the third class pdf cell line.

    A line containing the text "Online" means online teaching; every other
    line counts as on-site teaching.
    """
    return TeachingType.ONLINE if "Online" in third_line else TeachingType.ON_SITE
def get_rooms(third_line: str) -> list[str]:
    """
    Get the rooms based on the third class pdf cell line.

    Lines containing the marker "DSMixe" yield no rooms. Otherwise every
    whitespace-separated word of the line is returned as a room.

    Args:
        third_line: Third line of the module cell text.

    Returns:
        The room names in order of appearance; empty for a blank line.
    """
    if "DSMixe" in third_line:
        return []
    # split() without separator drops the empty strings that split(" ")
    # produces for blank lines and leading, trailing or repeated spaces.
    return third_line.split()
|