parse_modules.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. from typing import List
  2. import logging
  3. from pydantic import TypeAdapter
  4. from config import LECTURER_SHORTHAND_SIZE
  5. from .models import (
  6. RawExtractedModule,
  7. ClassJsonModule,
  8. ParsedModuleCellTextData,
  9. DegreeProgram,
  10. TeachingType,
  11. Weekday,
  12. StartsWithMatch,
  13. )
  14. def get_modules_for_class_json(
  15. modules: list[RawExtractedModule],
  16. class_name: str,
  17. degree_program: DegreeProgram,
  18. valid_lecturer_shorthands: list[str] | None = None,
  19. ) -> list[ClassJsonModule]:
  20. """
  21. Parses the Raw Extracted Modules from the class pdf into the format to
  22. export them to the classes.json file.
  23. """
  24. output_modules: list[ClassJsonModule] = []
  25. for input_module in modules:
  26. parsed_data: ParsedModuleCellTextData = parse_module_class_pdf_cell_text(
  27. input_module.text, class_name, degree_program, valid_lecturer_shorthands
  28. )
  29. output_modules.append(
  30. ClassJsonModule(
  31. weekday=input_module.weekday,
  32. module_shorthand=parsed_data.module_shorthand, # pyright: ignore
  33. start_seconds=input_module.start_seconds, # pyright: ignore
  34. end_seconds=input_module.end_seconds, # pyright: ignore
  35. degree_program=parsed_data.degree_program, # pyright: ignore
  36. class_name=class_name, # pyright: ignore
  37. rooms=parsed_data.rooms,
  38. pages=[input_module.source_page_number],
  39. part_of_other_classes=parsed_data.part_of_other_classes,
  40. teaching_type=parsed_data.teaching_type,
  41. lecturer_shorthands=parsed_data.lecturer_shortnames, # pyright: ignore
  42. id=get_id(
  43. class_name,
  44. parsed_data.module_shorthand,
  45. input_module.weekday,
  46. input_module.start_seconds,
  47. input_module.end_seconds,
  48. ),
  49. )
  50. )
  51. return output_modules
  52. def deduplicate_modules(modules: list[ClassJsonModule]) -> list[ClassJsonModule]:
  53. """de-duplicate modules based on their id field"""
  54. unique_modules_map: dict[str, ClassJsonModule] = {}
  55. for module in modules:
  56. if module.id in unique_modules_map:
  57. existing_module = unique_modules_map[module.id]
  58. existing_module.pages = sorted(
  59. list(set(existing_module.pages + module.pages))
  60. )
  61. else:
  62. unique_modules_map[module.id] = module
  63. return list(unique_modules_map.values())
  64. def get_modules_json(modules: List[ClassJsonModule]) -> str:
  65. """
  66. Serializes a list of ClassJsonModule objects into a formatted JSON string.
  67. """
  68. adapter = TypeAdapter(List[ClassJsonModule])
  69. return adapter.dump_json(modules, by_alias=True).decode("utf-8")
  70. def parse_mixed_degree_programs(
  71. degree_program: DegreeProgram, module_shorthand: str
  72. ) -> DegreeProgram:
  73. if degree_program == DegreeProgram.MIXED_BWL_GSW_KOMM:
  74. if module_shorthand in ["bplan", "lean"]:
  75. return DegreeProgram.KONTEXT_BWL
  76. if module_shorthand in ["wisa", "aua"]:
  77. return DegreeProgram.KONTEXT_KOMM
  78. return DegreeProgram.KONTEXT_GSW
  79. return degree_program
  80. def parse_module_class_pdf_cell_text(
  81. text: str,
  82. class_name: str,
  83. degree_program: DegreeProgram,
  84. valid_lecturer_shorthands: list[str] | None = None,
  85. ) -> ParsedModuleCellTextData:
  86. """
  87. Parse a single class pdf module cell text.
  88. """
  89. lines = text.split("\n")
  90. logging.debug("Parsing module cell text: \n%s", text)
  91. if len(lines) != 3 and len(lines) != 2:
  92. raise RuntimeError("Invalid Number of Lines in the cell text.")
  93. if len(lines) == 3:
  94. rooms = get_rooms(lines[2])
  95. teaching_type = get_teaching_type(lines[2])
  96. else:
  97. rooms = []
  98. teaching_type = TeachingType.ON_SITE
  99. module_shorthand = get_module_shorthand(lines[0], class_name)
  100. return ParsedModuleCellTextData(
  101. module_shorthand=module_shorthand,
  102. degree_program=parse_mixed_degree_programs(degree_program, module_shorthand),
  103. class_name=class_name,
  104. rooms=rooms,
  105. part_of_other_classes=[],
  106. teaching_type=teaching_type,
  107. lecturer_shortnames=get_lecturer_shortnames(
  108. lines[1], valid_lecturer_shorthands
  109. ),
  110. )
  111. def get_lecturer_shortnames(
  112. second_line: str, valid_lecturer_shorthands: list[str] | None = None
  113. ) -> list[str]:
  114. """
  115. Get the lecturer shorthand based on the second class pdf cell line.
  116. You can provide a list of valid lecturer shorthands for more accurate parsing.
  117. """
  118. lecturer_shorthands: list[str] = []
  119. words = second_line.split(" ")
  120. if valid_lecturer_shorthands is None:
  121. for word in words:
  122. if len(word) == LECTURER_SHORTHAND_SIZE:
  123. lecturer_shorthands.append(word)
  124. else:
  125. logging.warning("Could not get lecturer shorthand from word: %s", word)
  126. else:
  127. for word in words:
  128. exact_starts_with_match = matches_startswith(
  129. word, valid_lecturer_shorthands
  130. )
  131. minus_last_char_starts_with_match = matches_startswith(
  132. word[:-1], valid_lecturer_shorthands
  133. )
  134. if word in valid_lecturer_shorthands:
  135. lecturer_shorthands.append(word)
  136. elif is_valid_starts_with_match(exact_starts_with_match):
  137. lecturer_shorthands.append(exact_starts_with_match.shorthand_found)
  138. elif is_valid_starts_with_match(minus_last_char_starts_with_match):
  139. lecturer_shorthands.append(
  140. minus_last_char_starts_with_match.shorthand_found
  141. )
  142. else:
  143. logging.warning("Could not get lecturer shorthand from word: %s", word)
  144. return lecturer_shorthands
  145. def is_valid_starts_with_match(exact_starts_with_match: StartsWithMatch) -> bool:
  146. return (
  147. exact_starts_with_match.shorthand_found != ""
  148. and exact_starts_with_match.num_of_matches == 1
  149. )
  150. def matches_startswith(
  151. word: str, valid_lecturer_shorthands: list[str]
  152. ) -> StartsWithMatch:
  153. shorthand_with_start: str = ""
  154. # catch the number of matches to make sure the matching is unambiguous
  155. num_of_startwith_matches: int = 0
  156. for shorthand in valid_lecturer_shorthands:
  157. if shorthand.startswith(word):
  158. shorthand_with_start = shorthand
  159. num_of_startwith_matches += 1
  160. return StartsWithMatch(shorthand_with_start, num_of_startwith_matches)
  161. def get_module_shorthand(first_line: str, class_name: str) -> str:
  162. """
  163. Get the module shorthand based on the first class pdf cell line.
  164. """
  165. words = first_line.split(" ")
  166. if len(words) < 1:
  167. raise RuntimeError("Cannot extract module shorthand")
  168. word = words[0]
  169. if len(words) == 1:
  170. for i in reversed(range(len(class_name) + 1)):
  171. if word.endswith(class_name[0:i]):
  172. word = word[: word.rfind(class_name[0:i])]
  173. break
  174. if len(word) == 0:
  175. raise RuntimeError("Module shorthand cannot be empty")
  176. return word
  177. def get_id(
  178. class_name: str,
  179. module_shorthand: str,
  180. weekday: Weekday,
  181. start_seconds: int,
  182. end_seconds: int,
  183. ) -> str:
  184. """Calculate the json id of a module."""
  185. return (
  186. f"{class_name}-{module_shorthand}-{weekday.index}-{start_seconds}-{end_seconds}"
  187. )
  188. def get_teaching_type(third_line: str) -> TeachingType:
  189. """
  190. Get the teaching type based on the third class pdf cell line.
  191. """
  192. if "Online" in third_line:
  193. return TeachingType.ONLINE
  194. return TeachingType.ON_SITE
  195. def get_rooms(third_line: str) -> list[str]:
  196. """
  197. Get the rooms based on the third class pdf cell line.
  198. """
  199. if "DSMixe" in third_line:
  200. return []
  201. words = third_line.split(" ")
  202. return words