above_table_text.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. from re import search
  2. import logging
  3. from .models import (
  4. PageMetadata,
  5. SemesterType,
  6. Semester,
  7. ExportTimestamp,
  8. DegreeProgram,
  9. Date,
  10. Time,
  11. )
  # Module-wide logger. The dotted name places it below the
  # "modulplaner-backend" logger in the logging hierarchy.
  logger: logging.Logger = logging.getLogger(
      name="modulplaner-backend.above_table_text"
  )
  def parse_above_table_text(
      txt: str, previous_page_metadata: list[PageMetadata]
  ) -> PageMetadata:
      """Build the ``PageMetadata`` of a page from the text above its table.

      ``txt`` must consist of exactly three newline-separated lines. The first
      line is handed to the semester helpers, the second to the export
      timestamp helper and the third to the class name and degree program
      helpers.

      Args:
          txt: Raw text block, lines separated by ``"\\n"``.
          previous_page_metadata: Metadata of other pages; forwarded to
              ``get_degree_program`` for its fallback lookup.

      Returns:
          The assembled ``PageMetadata``.

      Raises:
          RuntimeError: If ``txt`` does not split into exactly three lines.
              Errors raised by the helper functions propagate unchanged.
      """
      parts = txt.split("\n")
      if len(parts) != 3:
          raise RuntimeError("Invalid Number of Lines.")
      header_line, export_line, class_line = parts

      # The helpers run in a fixed order so that, when more than one line is
      # malformed, the first error reported stays the same.
      term: SemesterType = get_semester_value(header_line)
      semester: Semester = Semester(
          yyyy=get_semester_year(header_line), semester_type=term
      )
      name: str = get_class_name(class_line)
      program: DegreeProgram = get_degree_program(
          class_line, name, previous_page_metadata
      )
      exported_at: ExportTimestamp = get_export_timestamp(export_line)

      return PageMetadata(
          semester=semester,
          export_timestamp=exported_at,
          class_name=name,
          degree_program=program,
      )
  def get_export_timestamp(second_line: str) -> ExportTimestamp:
      """Extract the export date and time from the second header line.

      The date is the first ``d.m.yyyy`` / ``dd.mm.yyyy`` pattern in the line;
      the time is the first ``h:mm`` / ``hh:mm`` pattern in the line.

      Args:
          second_line: The line that contains the export date and time.

      Returns:
          An ``ExportTimestamp`` built from the extracted date and time.

      Raises:
          RuntimeError: If the line contains no date or no time.
      """
      date_match = search(r"(\d{1,2})\.(\d{1,2})\.(\d{4})", second_line)
      if date_match is None:
          raise RuntimeError("Could not find date for timestamp extraction")
      date_dd, date_mm, date_yyyy = (int(part) for part in date_match.groups())

      # Search for digits around a colon instead of inspecting only the first
      # colon of the line: a colon that belongs to other text (e.g. a label in
      # front of the date) must not abort the extraction, and a time at the
      # very start of the line must still be found.
      time_match = search(r"(\d{1,2}):(\d{2})", second_line)
      if time_match is None:
          raise RuntimeError("No Time found")
      time_hh, time_mm = (int(part) for part in time_match.groups())

      return ExportTimestamp(
          date=Date(yyyy=date_yyyy, mm=date_mm, dd=date_dd),
          time=Time(hh=time_hh, mm=time_mm),
      )
  def get_class_name(third_line: str) -> str:
      """Extract the class name from the third header line.

      Args:
          third_line: The line that starts with the class name.

      Returns:
          Everything after a leading ``"- "`` marker if the line starts with
          one and has further content; otherwise the text before the first
          space.

      Raises:
          RuntimeError: If the line contains no space character.
      """
      head, separator, _ = third_line.partition(" ")
      if not separator:
          raise RuntimeError("No space character found in third line")

      # Lines introduced by "- " keep the whole remainder, spaces included.
      if third_line.startswith("- ") and len(third_line) > 2:
          return third_line[2:]
      return head
  def get_degree_program(
      third_line: str, class_name: str, previous_page_metadata: list[PageMetadata]
  ) -> DegreeProgram:
      """Determine the degree program for a page.

      The following steps are tried in order; the first hit wins:

      1. ``third_line`` contains all of "Kontext BWL", "Kommunikation" and
         "GSW" -> ``DegreeProgram.MIXED_BWL_GSW_KOMM``.
      2. The value of a ``DegreeProgram`` member occurs in ``third_line``.
      3. A page in ``previous_page_metadata`` has a class name equal to
         ``class_name`` without its last character; or, if the last two
         characters of ``class_name`` are identical, a page whose class name
         contains ``class_name`` without those two characters.
      4. The characters of ``class_name`` from index 1 on: "D", "I", "iC",
         "MSE", or the literal class name "alle".

      Args:
          third_line: The header line that carries class and program info.
          class_name: Class name already extracted from ``third_line``.
          previous_page_metadata: Page metadata searched in step 3.

      Returns:
          The resolved ``DegreeProgram``.

      Raises:
          RuntimeError: If none of the steps yields a degree program.
      """
      logger.debug("class_name: '%s'", class_name)

      # Every marker has to be tested against the line individually. Chaining
      # the string literals with `and` would only test the last one, because
      # non-empty literals are always truthy.
      mixed_markers = ("Kontext BWL", "Kommunikation", "GSW")
      if all(marker in third_line for marker in mixed_markers):
          return DegreeProgram.MIXED_BWL_GSW_KOMM

      for degree_program in DegreeProgram:
          if degree_program.value in third_line:
              return degree_program

      logger.warning("Using heuristics to guess the degree_program in %s", third_line)

      # Heuristic: reuse the program of a page whose class name is this class
      # name without its last character.
      shortened_name = class_name[:-1]
      for page_metadata in previous_page_metadata:
          if page_metadata.class_name == shortened_name:
              return page_metadata.degree_program

      # Heuristic: for a doubled trailing character, look for a page whose
      # class name contains the part in front of the doubled characters.
      if len(class_name) >= 2 and class_name[-1] == class_name[-2]:
          stem = class_name[:-2]
          for page_metadata in previous_page_metadata:
              if stem in page_metadata.class_name:
                  return page_metadata.degree_program

      # Heuristic: derive the program from the characters after the first one.
      # Slices are used so that class names that are too short simply do not
      # match instead of raising IndexError.
      if class_name[1:2] == "D":
          return DegreeProgram.DATASCIENCE
      if class_name[1:2] == "I":
          return DegreeProgram.INFORMATIK
      if class_name[1:3] == "iC":
          return DegreeProgram.ICOMPETENCE
      if class_name == "alle" or class_name[1:4] == "MSE":
          return DegreeProgram.AGNOSTIC

      raise RuntimeError(f"No Valid DegreeProgram found in line {third_line}")
  def get_semester_value(first_line: str) -> SemesterType:
      """Return the semester type named in the first header line.

      Exactly one of the values of ``SemesterType.FS`` and ``SemesterType.HS``
      must occur in the line.

      Args:
          first_line: The header line that names the semester.

      Returns:
          ``SemesterType.FS`` or ``SemesterType.HS``, whichever value occurs.

      Raises:
          RuntimeError: If the line contains neither value or both of them.
      """
      has_fs = SemesterType.FS.value in first_line
      has_hs = SemesterType.HS.value in first_line

      # Neither or both present: the semester type is ambiguous.
      if has_fs == has_hs:
          raise RuntimeError("Could not determine SemesterType")
      return SemesterType.FS if has_fs else SemesterType.HS
  def get_semester_year(first_line: str) -> int:
      """Return the semester year found in the first header line.

      The year is the first run of four consecutive digits in the line.

      Args:
          first_line: The header line that names the semester.

      Returns:
          The four-digit year as an integer.

      Raises:
          RuntimeError: If the line contains no four consecutive digits.
      """
      year_match = search(r"\d{4}", first_line)
      if year_match is None:
          raise RuntimeError("Could not determine Semester year (yyyy)")
      # Only the four matched digits are converted, so the result does not
      # depend on which character precedes the year or on whether the year is
      # at the very start of the line.
      return int(year_match.group())
  107. else:
  108. numeric_char_count = 0
  109. raise RuntimeError("Could not determine Semester year (yyyy)")