parse_modules.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. from typing import List
  2. import logging
  3. from pydantic import TypeAdapter
  4. from config import LECTURER_SHORTHAND_SIZE
  5. from .models import (
  6. RawExtractedModule,
  7. ClassJsonModule,
  8. ParsedModuleCellTextData,
  9. DegreeProgram,
  10. TeachingType,
  11. Weekday,
  12. StartsWithMatch,
  13. ClassPdfExtractionPageData,
  14. )
  def get_modules_for_class_json(
      modules: list[RawExtractedModule],
      class_name: str,
      degree_program: DegreeProgram,
      all_class_names: list[str],
      valid_lecturer_shorthands: list[str] | None = None,
  ) -> list[ClassJsonModule]:
      """
      Convert the raw modules extracted from a Class Timetable PDF into the
      ClassJsonModule entries that get exported to the classes.json file.

      The cell text of every raw module is parsed with
      parse_module_class_pdf_cell_text; weekday, start/end seconds and the
      source page number are taken over from the raw module as they are.
      The result has one entry per input module, in input order.
      """

      def to_class_json_module(raw_module: RawExtractedModule) -> ClassJsonModule:
          # Everything that has to be derived from the free-form cell text.
          cell: ParsedModuleCellTextData = parse_module_class_pdf_cell_text(
              raw_module.text,
              class_name,
              degree_program,
              all_class_names,
              valid_lecturer_shorthands,
          )
          module_id = get_id(
              class_name,
              cell.module_shorthand,
              raw_module.weekday,
              raw_module.start_seconds,
              raw_module.end_seconds,
          )
          return ClassJsonModule(
              weekday=raw_module.weekday,
              module_shorthand=cell.module_shorthand,  # pyright: ignore
              start_seconds=raw_module.start_seconds,  # pyright: ignore
              end_seconds=raw_module.end_seconds,  # pyright: ignore
              degree_program=cell.degree_program,  # pyright: ignore
              class_name=class_name,  # pyright: ignore
              rooms=cell.rooms,
              pages=[raw_module.source_page_number],
              part_of_other_classes=cell.part_of_other_classes,
              teaching_type=cell.teaching_type,
              lecturer_shorthands=cell.lecturer_shorthands,  # pyright: ignore
              id=module_id,
          )

      return [to_class_json_module(raw_module) for raw_module in modules]
  def deduplicate_modules(modules: list[ClassJsonModule]) -> list[ClassJsonModule]:
      """
      Collapse modules that share the same id into a single entry.

      The first module seen for an id is kept and updated in place: the pages
      of every later module with that id are merged into its page list, which
      is stored sorted and without repeats. The returned list follows the
      order in which the ids first appeared.
      """
      first_seen_by_id: dict[str, ClassJsonModule] = {}
      for candidate in modules:
          kept = first_seen_by_id.get(candidate.id)
          if kept is None:
              first_seen_by_id[candidate.id] = candidate
              continue
          # Duplicate id: only its page numbers are carried over.
          kept.pages = sorted(set(kept.pages + candidate.pages))
      return list(first_seen_by_id.values())
  def get_modules_json(modules: List[ClassJsonModule]) -> str:
      """
      Serialize a list of ClassJsonModule objects into a JSON string.

      Field aliases are used as the JSON keys, and the bytes produced by the
      pydantic TypeAdapter are decoded as UTF-8.
      """
      module_list_adapter = TypeAdapter(List[ClassJsonModule])
      json_bytes = module_list_adapter.dump_json(modules, by_alias=True)
      return json_bytes.decode("utf-8")
  def parse_mixed_degree_programs(
      degree_program: DegreeProgram, module_shorthand: str
  ) -> DegreeProgram:
      """
      Resolve the mixed BWL/GSW/KOMM degree program to a concrete one.

      Any other degree program is returned unchanged. For the mixed program
      the module shorthand decides: "bplan" and "lean" give KONTEXT_BWL,
      "wisa" and "aua" give KONTEXT_KOMM, and every other shorthand gives
      KONTEXT_GSW.
      """
      if not (degree_program == DegreeProgram.MIXED_BWL_GSW_KOMM):
          return degree_program

      context_by_shorthand = {
          "bplan": DegreeProgram.KONTEXT_BWL,
          "lean": DegreeProgram.KONTEXT_BWL,
          "wisa": DegreeProgram.KONTEXT_KOMM,
          "aua": DegreeProgram.KONTEXT_KOMM,
      }
      return context_by_shorthand.get(module_shorthand, DegreeProgram.KONTEXT_GSW)
  def parse_module_class_pdf_cell_text(
      text: str,
      class_name: str,
      degree_program: DegreeProgram,
      all_class_names: list[str],
      valid_lecturer_shorthands: list[str] | None = None,
  ) -> ParsedModuleCellTextData:
      """
      Parse the text of a single module cell of a Class Timetable PDF.

      The text must consist of two or three newline-separated lines:
      line 1 carries the module shorthand, line 2 the lecturer shorthands and
      the optional line 3 the rooms / teaching type. Without a third line the
      module has no rooms and is treated as ON_SITE.

      Raises RuntimeError if the text has any other number of lines.
      """
      cell_lines = text.split("\n")
      logging.debug("Parsing module cell text: \n%s", text)

      line_count = len(cell_lines)
      if line_count not in (2, 3):
          raise RuntimeError("Invalid Number of Lines in the cell text.")

      if line_count == 2:
          rooms = []
          teaching_type = TeachingType.ON_SITE
      else:
          location_line = cell_lines[2]
          rooms = get_rooms(location_line)
          teaching_type = get_teaching_type(location_line)

      module_shorthand = get_module_shorthand(
          cell_lines[0], class_name, all_class_names
      )
      resolved_degree_program = parse_mixed_degree_programs(
          degree_program, module_shorthand
      )
      lecturer_shorthands = get_lecturer_shorthands(
          cell_lines[1], valid_lecturer_shorthands
      )
      return ParsedModuleCellTextData(
          module_shorthand=module_shorthand,
          degree_program=resolved_degree_program,
          class_name=class_name,
          rooms=rooms,
          part_of_other_classes=[],
          teaching_type=teaching_type,
          lecturer_shorthands=lecturer_shorthands,
      )
  def get_lecturer_shorthands(
      second_line: str, valid_lecturer_shorthands: list[str] | None = None
  ) -> list[str]:
      """
      Get the lecturer shorthands from the second Class Timetable PDF cell line.

      The line is split on single spaces; empty tokens (produced by leading,
      trailing or repeated spaces, or by an empty line) are ignored.

      Without valid_lecturer_shorthands a word is accepted when its length
      equals LECTURER_SHORTHAND_SIZE.

      With valid_lecturer_shorthands a word is resolved, in this order, by
        1. being one of the valid shorthands,
        2. being the start of exactly one valid shorthand,
        3. being, without its last character, the start of exactly one valid
           shorthand.

      Words that cannot be resolved are reported with a warning and skipped.
      Returns the resolved shorthands in the order of their words.
      """
      # split(" ") yields "" for every extra space; such tokens are not words.
      words = [word for word in second_line.split(" ") if word]
      lecturer_shorthands: list[str] = []

      if valid_lecturer_shorthands is None:
          for word in words:
              if len(word) == LECTURER_SHORTHAND_SIZE:
                  lecturer_shorthands.append(word)
              else:
                  logging.warning(
                      "Could not get Lecturer Shorthand from word: %s", word
                  )
          return lecturer_shorthands

      for word in words:
          if word in valid_lecturer_shorthands:
              lecturer_shorthands.append(word)
              continue

          starts_with_match = matches_startswith(word, valid_lecturer_shorthands)
          # Retry without the last character, but never with an empty prefix:
          # every shorthand starts with "", so a one-character word would
          # otherwise "match" whenever exactly one valid shorthand exists.
          if not is_valid_starts_with_match(starts_with_match) and len(word) > 1:
              starts_with_match = matches_startswith(
                  word[:-1], valid_lecturer_shorthands
              )

          if is_valid_starts_with_match(starts_with_match):
              lecturer_shorthands.append(starts_with_match.shorthand_found)
          else:
              logging.warning("Could not get Lecturer Shorthand from word: %s", word)
      return lecturer_shorthands
  def is_valid_starts_with_match(exact_starts_with_match: StartsWithMatch) -> bool:
      """
      Tell whether a starts-with lookup produced a usable result.

      A match is usable when a shorthand was found (non-empty string) and it
      was the only one, i.e. the number of matches is exactly 1.
      """
      if exact_starts_with_match.shorthand_found == "":
          return False
      return exact_starts_with_match.num_of_matches == 1
  def matches_startswith(
      word: str, valid_lecturer_shorthands: list[str]
  ) -> StartsWithMatch:
      """
      Look up the valid lecturer shorthands that start with the given word.

      The returned StartsWithMatch holds the last matching shorthand in list
      order ("" when nothing matched) together with the total number of
      matches, so callers can tell whether the lookup was unambiguous.
      """
      candidates = [
          shorthand
          for shorthand in valid_lecturer_shorthands
          if shorthand.startswith(word)
      ]
      last_candidate = candidates[-1] if candidates else ""
      return StartsWithMatch(
          shorthand_found=last_candidate, num_of_matches=len(candidates)
      )
  def get_module_shorthand(
      first_line: str, class_name: str, all_class_names: list[str]
  ) -> str:
      """
      Get the module shorthand from the first Class Timetable PDF cell line.

      The shorthand is the first space-separated word of the line. When the
      line consists of a single word, a class name glued to its end is cut
      off: first the longest leading part of class_name the word ends with,
      then the first entry of all_class_names the remaining word ends with.

      Raises RuntimeError if the resulting shorthand is empty.
      """
      # NOTE(review): the pasted source lost its indentation; the cut of
      # foreign class names is assumed to belong to the single-word branch.
      # Confirm against the upstream file.
      tokens = first_line.split(" ")
      if not tokens:
          raise RuntimeError("Cannot extract Module Shorthand")

      shorthand = tokens[0]
      if len(tokens) == 1:
          # Try the full class name first, then ever shorter leading parts.
          # The empty part always matches, so this search always ends with a
          # (possibly no-op) cut and its debug message.
          for part_length in range(len(class_name), -1, -1):
              own_part = class_name[:part_length]
              if not shorthand.endswith(own_part):
                  continue
              shorthand = shorthand.removesuffix(own_part)
              logging.debug(
                  f"cut off class name part '{own_part}'"
                  f" of class name '{class_name}' in line '{first_line}'"
              )
              break

          # Only the first matching foreign class name (in list order) is cut.
          foreign_suffix = next(
              (name for name in all_class_names if shorthand.endswith(name)), None
          )
          if foreign_suffix is not None:
              shorthand = shorthand.removesuffix(foreign_suffix)
              logging.debug(
                  "cut off class name '%s' in line '%s'",
                  foreign_suffix,
                  first_line,
              )

      if not shorthand:
          raise RuntimeError("Module Shorthand cannot be empty")
      return shorthand
  def get_id(
      class_name: str,
      module_shorthand: str,
      weekday: Weekday,
      start_seconds: int,
      end_seconds: int,
  ) -> str:
      """
      Calculate the json id of a module.

      The id joins, with "-", the class name, the module shorthand, the
      weekday's index attribute and the start and end seconds.
      """
      module_part = f"{class_name}-{module_shorthand}"
      time_part = f"{weekday.index}-{start_seconds}-{end_seconds}"
      return f"{module_part}-{time_part}"
  def get_teaching_type(third_line: str) -> TeachingType:
      """
      Derive the teaching type from the third Class Timetable PDF cell line.

      The line counts as ONLINE when it contains the (case-sensitive) text
      "Online"; every other line counts as ON_SITE.
      """
      is_online = "Online" in third_line
      return TeachingType.ONLINE if is_online else TeachingType.ON_SITE
  def get_rooms(third_line: str) -> list[str]:
      """
      Get the rooms from the third Class Timetable PDF cell line.

      A line containing the marker "DSMixe" yields no rooms. Otherwise the
      line is split on single spaces and every non-empty token is returned as
      a room, in order. Empty tokens (from an empty line or from leading,
      trailing or repeated spaces) are dropped so that no empty room name
      ends up in the result.
      """
      if "DSMixe" in third_line:
          return []
      return [room for room in third_line.split(" ") if room]
  def get_classes(extraction_data: list[ClassPdfExtractionPageData]) -> list[str]:
      """
      Get the class names from the metadata of the extracted class pages.

      Returns one class name per page, in page order (repeats are kept).
      """
      class_names: list[str] = []
      for page in extraction_data:
          class_names.append(page.page_metadata.class_name)
      return class_names