parse_modules.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. from typing import List
  2. import logging
  3. from pydantic import TypeAdapter
  4. from config import LECTURER_SHORTHAND_SIZE
  5. from .models import (
  6. RawExtractedModule,
  7. ClassJsonModule,
  8. ParsedModuleCellTextData,
  9. DegreeProgram,
  10. TeachingType,
  11. Weekday,
  12. StartsWithMatch,
  13. ClassPdfExtractionPageData,
  14. )
  15. def get_modules_for_class_json(
  16. modules: list[RawExtractedModule],
  17. class_name: str,
  18. degree_program: DegreeProgram,
  19. all_class_names: list[str],
  20. valid_lecturer_shorthands: list[str] | None = None,
  21. ) -> list[ClassJsonModule]:
  22. """
  23. Parses the Raw Extracted Modules from the class pdf into the format to
  24. export them to the classes.json file.
  25. """
  26. output_modules: list[ClassJsonModule] = []
  27. for input_module in modules:
  28. parsed_data: ParsedModuleCellTextData = parse_module_class_pdf_cell_text(
  29. input_module.text, class_name, degree_program, all_class_names, valid_lecturer_shorthands
  30. )
  31. output_modules.append(
  32. ClassJsonModule(
  33. weekday=input_module.weekday,
  34. module_shorthand=parsed_data.module_shorthand, # pyright: ignore
  35. start_seconds=input_module.start_seconds, # pyright: ignore
  36. end_seconds=input_module.end_seconds, # pyright: ignore
  37. degree_program=parsed_data.degree_program, # pyright: ignore
  38. class_name=class_name, # pyright: ignore
  39. rooms=parsed_data.rooms,
  40. pages=[input_module.source_page_number],
  41. part_of_other_classes=parsed_data.part_of_other_classes,
  42. teaching_type=parsed_data.teaching_type,
  43. lecturer_shorthands=parsed_data.lecturer_shortnames, # pyright: ignore
  44. id=get_id(
  45. class_name,
  46. parsed_data.module_shorthand,
  47. input_module.weekday,
  48. input_module.start_seconds,
  49. input_module.end_seconds,
  50. ),
  51. )
  52. )
  53. return output_modules
  54. def deduplicate_modules(modules: list[ClassJsonModule]) -> list[ClassJsonModule]:
  55. """de-duplicate modules based on their id field"""
  56. unique_modules_map: dict[str, ClassJsonModule] = {}
  57. for module in modules:
  58. if module.id in unique_modules_map:
  59. existing_module = unique_modules_map[module.id]
  60. existing_module.pages = sorted(
  61. list(set(existing_module.pages + module.pages))
  62. )
  63. else:
  64. unique_modules_map[module.id] = module
  65. return list(unique_modules_map.values())
  66. def get_modules_json(modules: List[ClassJsonModule]) -> str:
  67. """
  68. Serializes a list of ClassJsonModule objects into a formatted JSON string.
  69. """
  70. adapter = TypeAdapter(List[ClassJsonModule])
  71. return adapter.dump_json(modules, by_alias=True).decode("utf-8")
  72. def parse_mixed_degree_programs(
  73. degree_program: DegreeProgram, module_shorthand: str
  74. ) -> DegreeProgram:
  75. if degree_program == DegreeProgram.MIXED_BWL_GSW_KOMM:
  76. if module_shorthand in ["bplan", "lean"]:
  77. return DegreeProgram.KONTEXT_BWL
  78. if module_shorthand in ["wisa", "aua"]:
  79. return DegreeProgram.KONTEXT_KOMM
  80. return DegreeProgram.KONTEXT_GSW
  81. return degree_program
  82. def parse_module_class_pdf_cell_text(
  83. text: str,
  84. class_name: str,
  85. degree_program: DegreeProgram,
  86. all_class_names: list[str],
  87. valid_lecturer_shorthands: list[str] | None = None,
  88. ) -> ParsedModuleCellTextData:
  89. """
  90. Parse a single class pdf module cell text.
  91. """
  92. lines = text.split("\n")
  93. logging.debug("Parsing module cell text: \n%s", text)
  94. if len(lines) != 3 and len(lines) != 2:
  95. raise RuntimeError("Invalid Number of Lines in the cell text.")
  96. if len(lines) == 3:
  97. rooms = get_rooms(lines[2])
  98. teaching_type = get_teaching_type(lines[2])
  99. else:
  100. rooms = []
  101. teaching_type = TeachingType.ON_SITE
  102. module_shorthand = get_module_shorthand(lines[0], class_name, all_class_names)
  103. return ParsedModuleCellTextData(
  104. module_shorthand=module_shorthand,
  105. degree_program=parse_mixed_degree_programs(degree_program, module_shorthand),
  106. class_name=class_name,
  107. rooms=rooms,
  108. part_of_other_classes=[],
  109. teaching_type=teaching_type,
  110. lecturer_shortnames=get_lecturer_shortnames(
  111. lines[1], valid_lecturer_shorthands
  112. ),
  113. )
  114. def get_lecturer_shortnames(
  115. second_line: str, valid_lecturer_shorthands: list[str] | None = None
  116. ) -> list[str]:
  117. """
  118. Get the lecturer shorthand based on the second class pdf cell line.
  119. You can provide a list of valid lecturer shorthands for more accurate parsing.
  120. """
  121. lecturer_shorthands: list[str] = []
  122. words = second_line.split(" ")
  123. if valid_lecturer_shorthands is None:
  124. for word in words:
  125. if len(word) == LECTURER_SHORTHAND_SIZE:
  126. lecturer_shorthands.append(word)
  127. else:
  128. logging.warning("Could not get lecturer shorthand from word: %s", word)
  129. else:
  130. for word in words:
  131. exact_starts_with_match = matches_startswith(
  132. word, valid_lecturer_shorthands
  133. )
  134. minus_last_char_starts_with_match = matches_startswith(
  135. word[:-1], valid_lecturer_shorthands
  136. )
  137. if word in valid_lecturer_shorthands:
  138. lecturer_shorthands.append(word)
  139. elif is_valid_starts_with_match(exact_starts_with_match):
  140. lecturer_shorthands.append(exact_starts_with_match.shorthand_found)
  141. elif is_valid_starts_with_match(minus_last_char_starts_with_match):
  142. lecturer_shorthands.append(
  143. minus_last_char_starts_with_match.shorthand_found
  144. )
  145. else:
  146. logging.warning("Could not get lecturer shorthand from word: %s", word)
  147. return lecturer_shorthands
  148. def is_valid_starts_with_match(exact_starts_with_match: StartsWithMatch) -> bool:
  149. return (
  150. exact_starts_with_match.shorthand_found != ""
  151. and exact_starts_with_match.num_of_matches == 1
  152. )
  153. def matches_startswith(
  154. word: str, valid_lecturer_shorthands: list[str]
  155. ) -> StartsWithMatch:
  156. shorthand_with_start: str = ""
  157. # catch the number of matches to make sure the matching is unambiguous
  158. num_of_startwith_matches: int = 0
  159. for shorthand in valid_lecturer_shorthands:
  160. if shorthand.startswith(word):
  161. shorthand_with_start = shorthand
  162. num_of_startwith_matches += 1
  163. return StartsWithMatch(shorthand_with_start, num_of_startwith_matches)
  164. def get_module_shorthand(
  165. first_line: str, class_name: str, all_class_names: list[str]
  166. ) -> str:
  167. """
  168. Get the module shorthand based on the first class pdf cell line.
  169. """
  170. words = first_line.split(" ")
  171. if len(words) < 1:
  172. raise RuntimeError("Cannot extract module shorthand")
  173. word = words[0]
  174. if len(words) == 1:
  175. for i in reversed(range(len(class_name) + 1)):
  176. class_name_part = class_name[0:i]
  177. if word.endswith(class_name_part):
  178. word = word[: word.rfind(class_name_part)]
  179. debug_msg = (
  180. f"cut off class name part '{class_name_part}'"
  181. + f" of class name '{class_name}' in line '{first_line}'"
  182. )
  183. logging.debug(debug_msg)
  184. break
  185. for foreign_class_name in all_class_names:
  186. if word.endswith(foreign_class_name):
  187. word = word[: word.rfind(foreign_class_name)]
  188. logging.debug(
  189. "cut off class name '%s' in line '%s'",
  190. foreign_class_name,
  191. first_line,
  192. )
  193. break
  194. if len(word) == 0:
  195. raise RuntimeError("Module shorthand cannot be empty")
  196. return word
  197. def get_id(
  198. class_name: str,
  199. module_shorthand: str,
  200. weekday: Weekday,
  201. start_seconds: int,
  202. end_seconds: int,
  203. ) -> str:
  204. """Calculate the json id of a module."""
  205. return (
  206. f"{class_name}-{module_shorthand}-{weekday.index}-{start_seconds}-{end_seconds}"
  207. )
  208. def get_teaching_type(third_line: str) -> TeachingType:
  209. """
  210. Get the teaching type based on the third class pdf cell line.
  211. """
  212. if "Online" in third_line:
  213. return TeachingType.ONLINE
  214. return TeachingType.ON_SITE
  215. def get_rooms(third_line: str) -> list[str]:
  216. """
  217. Get the rooms based on the third class pdf cell line.
  218. """
  219. if "DSMixe" in third_line:
  220. return []
  221. words = third_line.split(" ")
  222. return words
  223. def get_classes(extraction_data: list[ClassPdfExtractionPageData]) -> list[str]:
  224. """
  225. Get the classes from the class page's metadata.
  226. """
  227. return [page_data.page_metadata.class_name for page_data in extraction_data]