# parse_modules.py (5.8 KB)
  1. from typing import List
  2. import logging
  3. from pydantic import TypeAdapter
  4. from config import LECTURER_SHORTHAND_SIZE
  5. from .models import (
  6. RawExtractedModule,
  7. ClassJsonModule,
  8. ParsedModuleCellTextData,
  9. DegreeProgram,
  10. TeachingType,
  11. Weekday,
  12. )
  13. def get_modules_for_class_json(
  14. modules: list[RawExtractedModule],
  15. class_name: str,
  16. degree_program: DegreeProgram,
  17. valid_lecturer_shorthands: list[str] | None = None,
  18. ) -> list[ClassJsonModule]:
  19. output_modules: list[ClassJsonModule] = []
  20. for input_module in modules:
  21. parsed_data: ParsedModuleCellTextData = parse_module_cell_text(
  22. input_module.text, class_name, degree_program, valid_lecturer_shorthands
  23. )
  24. output_modules.append(
  25. ClassJsonModule(
  26. weekday=input_module.weekday,
  27. module_shorthand=parsed_data.module_shorthand, # pyright: ignore
  28. start_seconds=input_module.start_seconds, # pyright: ignore
  29. end_seconds=input_module.end_seconds, # pyright: ignore
  30. degree_program=parsed_data.degree_program, # pyright: ignore
  31. class_name=class_name, # pyright: ignore
  32. rooms=parsed_data.rooms,
  33. pages=[input_module.source_page_number],
  34. part_of_other_classes=parsed_data.part_of_other_classes,
  35. teaching_type=parsed_data.teaching_type,
  36. lecturer_shorthands=parsed_data.lecturer_shortnames, # pyright: ignore
  37. id=get_id(
  38. class_name,
  39. parsed_data.module_shorthand,
  40. input_module.weekday,
  41. input_module.start_seconds,
  42. input_module.end_seconds,
  43. ),
  44. )
  45. )
  46. return output_modules
  47. def deduplicate_modules(modules: list[ClassJsonModule]) -> list[ClassJsonModule]:
  48. """de-duplicate modules based on their id field"""
  49. unique_modules_map: dict[str, ClassJsonModule] = {}
  50. for module in modules:
  51. if module.id in unique_modules_map:
  52. existing_module = unique_modules_map[module.id]
  53. existing_module.pages = sorted(
  54. list(set(existing_module.pages + module.pages))
  55. )
  56. else:
  57. unique_modules_map[module.id] = module
  58. return list(unique_modules_map.values())
  59. def get_modules_json(modules: List[ClassJsonModule]) -> str:
  60. """
  61. Serializes a list of ClassJsonModule objects into a formatted JSON string.
  62. """
  63. adapter = TypeAdapter(List[ClassJsonModule])
  64. return adapter.dump_json(modules, by_alias=True).decode("utf-8")
  65. def parse_mixed_degree_programs(
  66. degree_program: DegreeProgram, module_shorthand: str
  67. ) -> DegreeProgram:
  68. if degree_program == DegreeProgram.MIXED_BWL_GSW_KOMM:
  69. if module_shorthand in ["bplan", "lean"]:
  70. return DegreeProgram.KONTEXT_BWL
  71. if module_shorthand in ["wisa", "aua"]:
  72. return DegreeProgram.KONTEXT_KOMM
  73. return DegreeProgram.KONTEXT_GSW
  74. return degree_program
  75. def parse_module_cell_text(
  76. text: str,
  77. class_name: str,
  78. degree_program: DegreeProgram,
  79. valid_lecturer_shorthands: list[str] | None = None,
  80. ) -> ParsedModuleCellTextData:
  81. lines = text.split("\n")
  82. logging.debug("Parsing module cell text: \n%s", text)
  83. if len(lines) != 3 and len(lines) != 2:
  84. raise RuntimeError("Invalid Number of Lines in the cell text.")
  85. if len(lines) == 3:
  86. rooms = get_rooms(lines[2])
  87. teaching_type = get_teaching_type(lines[2])
  88. else:
  89. rooms = []
  90. teaching_type = TeachingType.ON_SITE
  91. module_shorthand = get_module_shorthand(lines[0], class_name)
  92. return ParsedModuleCellTextData(
  93. module_shorthand=module_shorthand,
  94. degree_program=parse_mixed_degree_programs(degree_program, module_shorthand),
  95. class_name=class_name,
  96. rooms=rooms,
  97. part_of_other_classes=[],
  98. teaching_type=teaching_type,
  99. lecturer_shortnames=get_lecturer_shortnames(
  100. lines[1], valid_lecturer_shorthands
  101. ),
  102. )
  103. def get_lecturer_shortnames(
  104. second_line: str, valid_lecturer_shorthands: list[str] | None = None
  105. ) -> list[str]:
  106. lecturer_shorthands: list[str] = []
  107. words = second_line.split(" ")
  108. if valid_lecturer_shorthands is None:
  109. for word in words:
  110. if len(word) == LECTURER_SHORTHAND_SIZE:
  111. lecturer_shorthands.append(word)
  112. else:
  113. for word in words:
  114. if word in valid_lecturer_shorthands or (
  115. len(word) == LECTURER_SHORTHAND_SIZE and shorthand.startswith(word)
  116. for shorthand in valid_lecturer_shorthands
  117. ):
  118. lecturer_shorthands.append(word)
  119. return lecturer_shorthands
  120. def get_module_shorthand(first_line: str, class_name: str) -> str:
  121. words = first_line.split(" ")
  122. if len(words) < 1:
  123. raise RuntimeError("Cannot extract module shorthand")
  124. word = words[0]
  125. if len(words) == 1:
  126. for i in reversed(range(len(class_name) + 1)):
  127. if word.endswith(class_name[0:i]):
  128. word = word[: word.rfind(class_name[0:i])]
  129. break
  130. if len(word) == 0:
  131. raise RuntimeError("Module shorthand cannot be empty")
  132. return word
  133. def get_id(
  134. class_name: str,
  135. module_shorthand: str,
  136. weekday: Weekday,
  137. start_seconds: int,
  138. end_seconds: int,
  139. ) -> str:
  140. return (
  141. f"{class_name}-{module_shorthand}-{weekday.index}-{start_seconds}-{end_seconds}"
  142. )
  143. def get_teaching_type(third_line: str) -> TeachingType:
  144. if "Online" in third_line:
  145. return TeachingType.ONLINE
  146. return TeachingType.ON_SITE
  147. def get_rooms(third_line: str) -> list[str]:
  148. if "DSMixe" in third_line:
  149. return []
  150. words = third_line.split(" ")
  151. return words