parse_modules.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. import logging
  2. from typing import List
  3. from pydantic import TypeAdapter
  4. from config import LECTURER_SHORTHAND_SIZE
  5. from .models import (
  6. RawExtractedModule,
  7. ClassJsonModule,
  8. ParsedModuleCellTextData,
  9. DegreeProgram,
  10. TeachingType,
  11. Weekday,
  12. StartsWithMatch,
  13. ClassPdfExtractionPageData,
  14. )
# Module-level logger shared by all parsing helpers in this file.
logger = logging.getLogger("modulplaner-backend.parse_modules")
  16. def get_modules_for_class_json(
  17. modules: list[RawExtractedModule],
  18. class_name: str,
  19. degree_program: DegreeProgram,
  20. all_class_names: list[str],
  21. valid_lecturer_shorthands: list[str] | None = None,
  22. ) -> list[ClassJsonModule]:
  23. """
  24. Parses the Raw Extracted Modules from the Class Timetable PDF into the format to
  25. export them to the classes.json file.
  26. """
  27. output_modules: list[ClassJsonModule] = []
  28. for input_module in modules:
  29. parsed_data: ParsedModuleCellTextData = parse_module_class_pdf_cell_text(
  30. input_module.text,
  31. class_name,
  32. degree_program,
  33. all_class_names,
  34. valid_lecturer_shorthands,
  35. )
  36. output_modules.append(
  37. ClassJsonModule(
  38. weekday=input_module.weekday,
  39. module_shorthand=parsed_data.module_shorthand, # pyright: ignore
  40. start_seconds=input_module.start_seconds, # pyright: ignore
  41. end_seconds=input_module.end_seconds, # pyright: ignore
  42. degree_program=parsed_data.degree_program, # pyright: ignore
  43. class_name=class_name, # pyright: ignore
  44. rooms=parsed_data.rooms,
  45. pages=[input_module.source_page_number],
  46. part_of_other_classes=parsed_data.part_of_other_classes,
  47. teaching_type=parsed_data.teaching_type,
  48. lecturer_shorthands=parsed_data.lecturer_shorthands, # pyright: ignore
  49. id=get_id(
  50. class_name,
  51. parsed_data.module_shorthand,
  52. input_module.weekday,
  53. input_module.start_seconds,
  54. input_module.end_seconds,
  55. ),
  56. )
  57. )
  58. return output_modules
  59. def deduplicate_modules(modules: list[ClassJsonModule]) -> list[ClassJsonModule]:
  60. """de-duplicate modules based on their id field"""
  61. unique_modules_map: dict[str, ClassJsonModule] = {}
  62. for module in modules:
  63. if module.id in unique_modules_map:
  64. existing_module = unique_modules_map[module.id]
  65. existing_module.pages = sorted(
  66. list(set(existing_module.pages + module.pages))
  67. )
  68. else:
  69. unique_modules_map[module.id] = module
  70. return list(unique_modules_map.values())
  71. def get_modules_json(modules: List[ClassJsonModule]) -> str:
  72. """
  73. Serializes a list of ClassJsonModule objects into a formatted JSON string.
  74. """
  75. adapter = TypeAdapter(List[ClassJsonModule])
  76. return adapter.dump_json(modules, by_alias=True).decode("utf-8")
  77. def parse_mixed_degree_programs(
  78. degree_program: DegreeProgram, module_shorthand: str
  79. ) -> DegreeProgram:
  80. if degree_program == DegreeProgram.MIXED_BWL_GSW_KOMM:
  81. if module_shorthand in ["bplan", "lean"]:
  82. return DegreeProgram.KONTEXT_BWL
  83. if module_shorthand in ["wisa", "aua"]:
  84. return DegreeProgram.KONTEXT_KOMM
  85. return DegreeProgram.KONTEXT_GSW
  86. return degree_program
  87. def parse_module_class_pdf_cell_text(
  88. text: str,
  89. class_name: str,
  90. degree_program: DegreeProgram,
  91. all_class_names: list[str],
  92. valid_lecturer_shorthands: list[str] | None = None,
  93. ) -> ParsedModuleCellTextData:
  94. """
  95. Parse a single Class Timetable PDF module cell text.
  96. """
  97. lines = text.split("\n")
  98. logger.debug("Parsing module cell text: \n%s", text)
  99. if len(lines) != 3 and len(lines) != 2:
  100. raise RuntimeError("Invalid Number of Lines in the cell text.")
  101. if len(lines) == 3:
  102. rooms = get_rooms(lines[2])
  103. teaching_type = get_teaching_type(lines[2])
  104. else:
  105. rooms = []
  106. teaching_type = TeachingType.ON_SITE
  107. module_shorthand = get_module_shorthand(lines[0], class_name, all_class_names)
  108. return ParsedModuleCellTextData(
  109. module_shorthand=module_shorthand,
  110. degree_program=parse_mixed_degree_programs(degree_program, module_shorthand),
  111. class_name=class_name,
  112. rooms=rooms,
  113. part_of_other_classes=[],
  114. teaching_type=teaching_type,
  115. lecturer_shorthands=get_lecturer_shorthands(
  116. lines[1], valid_lecturer_shorthands
  117. ),
  118. )
  119. def get_lecturer_shorthands(
  120. second_line: str, valid_lecturer_shorthands: list[str] | None = None
  121. ) -> list[str]:
  122. """
  123. Get the Lecturer Shorthand based on the second Class Timetable PDF cell line.
  124. You can provide a list of valid lecturer shorthands for more accurate parsing.
  125. """
  126. lecturer_shorthands: list[str] = []
  127. words = second_line.split(" ")
  128. if valid_lecturer_shorthands is None:
  129. for word in words:
  130. if len(word) == LECTURER_SHORTHAND_SIZE:
  131. lecturer_shorthands.append(word)
  132. else:
  133. logger.warning("Could not get Lecturer Shorthand from word: %s", word)
  134. else:
  135. for word in words:
  136. exact_starts_with_match = matches_startswith(
  137. word, valid_lecturer_shorthands
  138. )
  139. minus_last_char_starts_with_match = matches_startswith(
  140. word[:-1], valid_lecturer_shorthands
  141. )
  142. if word in valid_lecturer_shorthands:
  143. lecturer_shorthands.append(word)
  144. elif is_valid_starts_with_match(exact_starts_with_match):
  145. lecturer_shorthands.append(exact_starts_with_match.shorthand_found)
  146. elif is_valid_starts_with_match(minus_last_char_starts_with_match):
  147. lecturer_shorthands.append(
  148. minus_last_char_starts_with_match.shorthand_found
  149. )
  150. else:
  151. logger.warning("Could not get Lecturer Shorthand from word: %s", word)
  152. return lecturer_shorthands
  153. def is_valid_starts_with_match(exact_starts_with_match: StartsWithMatch) -> bool:
  154. return (
  155. exact_starts_with_match.shorthand_found != ""
  156. and exact_starts_with_match.num_of_matches == 1
  157. )
  158. def matches_startswith(
  159. word: str, valid_lecturer_shorthands: list[str]
  160. ) -> StartsWithMatch:
  161. shorthand_with_start: str = ""
  162. # catch the number of matches to make sure the matching is unambiguous
  163. num_of_startwith_matches: int = 0
  164. for shorthand in valid_lecturer_shorthands:
  165. if shorthand.startswith(word):
  166. shorthand_with_start = shorthand
  167. num_of_startwith_matches += 1
  168. return StartsWithMatch(
  169. shorthand_found=shorthand_with_start, num_of_matches=num_of_startwith_matches
  170. )
  171. def get_module_shorthand(
  172. first_line: str, class_name: str, all_class_names: list[str]
  173. ) -> str:
  174. """
  175. Get the Module Shorthand based on the first Class Timetable PDF cell line.
  176. """
  177. words = first_line.split(" ")
  178. if len(words) < 1:
  179. raise RuntimeError("Cannot extract Module Shorthand")
  180. word = words[0]
  181. if len(words) == 1:
  182. for i in reversed(range(len(class_name) + 1)):
  183. class_name_part = class_name[0:i]
  184. if word.endswith(class_name_part):
  185. word = word[: word.rfind(class_name_part)]
  186. debug_msg = (
  187. f"cut off class name part '{class_name_part}'"
  188. + f" of class name '{class_name}' in line '{first_line}'"
  189. )
  190. logger.debug(debug_msg)
  191. break
  192. for foreign_class_name in all_class_names:
  193. if word.endswith(foreign_class_name):
  194. word = word[: word.rfind(foreign_class_name)]
  195. logger.debug(
  196. "cut off class name '%s' in line '%s'",
  197. foreign_class_name,
  198. first_line,
  199. )
  200. break
  201. if len(word) == 0:
  202. raise RuntimeError("Module Shorthand cannot be empty")
  203. return word
  204. def get_id(
  205. class_name: str,
  206. module_shorthand: str,
  207. weekday: Weekday,
  208. start_seconds: int,
  209. end_seconds: int,
  210. ) -> str:
  211. """Calculate the json id of a module."""
  212. return (
  213. f"{class_name}-{module_shorthand}-{weekday.index}-{start_seconds}-{end_seconds}"
  214. )
  215. def get_teaching_type(third_line: str) -> TeachingType:
  216. """
  217. Get the teaching type based on the third Class Timetable PDF cell line.
  218. """
  219. if "Online" in third_line:
  220. return TeachingType.ONLINE
  221. return TeachingType.ON_SITE
  222. def get_rooms(third_line: str) -> list[str]:
  223. """
  224. Get the rooms based on the third Class Timetable PDF cell line.
  225. """
  226. if "DSMixe" in third_line:
  227. return []
  228. words = third_line.split(" ")
  229. return words
  230. def get_classes(extraction_data: list[ClassPdfExtractionPageData]) -> list[str]:
  231. """
  232. Get the classes from the class page's metadata.
  233. """
  234. return [page_data.page_metadata.class_name for page_data in extraction_data]