above_table_text.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. from re import search
  2. import logging
  3. from .models import (
  4. PageMetadata,
  5. SemesterType,
  6. Semester,
  7. ExportTimestamp,
  8. DegreeProgram,
  9. Date,
  10. Time,
  11. )
  12. def parse_above_table_text(
  13. txt: str, previous_page_metadata: list[PageMetadata]
  14. ) -> PageMetadata:
  15. lines = txt.split("\n")
  16. if len(lines) != 3:
  17. raise RuntimeError("Invalid Number of Lines.")
  18. semester_type: SemesterType = get_semester_value(lines[0])
  19. semester: Semester = Semester(
  20. yyyy=get_semester_year(lines[0]), semester_type=semester_type
  21. )
  22. class_name: str = get_class_name(lines[2])
  23. degree_program: DegreeProgram = get_degree_program(
  24. lines[2], class_name, previous_page_metadata
  25. )
  26. export_timestamp: ExportTimestamp = get_export_timestamp(lines[1])
  27. return PageMetadata(
  28. semester=semester,
  29. export_timestamp=export_timestamp,
  30. class_name=class_name,
  31. degree_program=degree_program,
  32. )
  33. def get_export_timestamp(second_line: str) -> ExportTimestamp:
  34. line_length = len(second_line)
  35. match = search(r"(\d{1,2})\.(\d{1,2})\.(\d{4})", second_line)
  36. if match:
  37. date_dd, date_mm, date_yyyy = [int(entry) for entry in match.groups()]
  38. else:
  39. raise RuntimeError("Could not find date for timestamp extraction")
  40. for index, char in enumerate(second_line):
  41. if char == ":" and index - 2 >= 0 and index + 2 < line_length:
  42. try:
  43. time_hh = int(second_line[index - 2 : index])
  44. time_mm = int(second_line[index + 1 : index + 3])
  45. break
  46. except (TypeError, ValueError) as e:
  47. raise RuntimeError("Timestamp Extraction failed") from e
  48. else:
  49. raise RuntimeError("No Time found")
  50. return ExportTimestamp(
  51. date=Date(yyyy=date_yyyy, mm=date_mm, dd=date_dd),
  52. time=Time(hh=time_hh, mm=time_mm),
  53. )
  54. def get_class_name(third_line: str) -> str:
  55. first_space_index = third_line.find(" ")
  56. if first_space_index == -1:
  57. raise RuntimeError("No space character found in third line")
  58. if len(third_line) > 2 and third_line[0:2] == "- ":
  59. return third_line[2:]
  60. return third_line[0:first_space_index]
  61. def get_degree_program(
  62. third_line: str, class_name: str, previous_page_metadata: list[PageMetadata]
  63. ) -> DegreeProgram:
  64. logging.debug("class_name: '%s'", class_name)
  65. if "Kontext BWL" and "Kommunikation" and "GSW" in third_line:
  66. return DegreeProgram.MIXED_BWL_GSW_KOMM
  67. for degree_program in DegreeProgram:
  68. if degree_program.value in third_line:
  69. return degree_program
  70. logging.warning("Using heuristics to guess the degree_program in %s", third_line)
  71. try:
  72. for page_metadata in previous_page_metadata:
  73. if page_metadata.class_name == class_name[:-1]:
  74. return page_metadata.degree_program
  75. if class_name[-1] == class_name[-2]:
  76. for page_metadata in previous_page_metadata:
  77. if class_name[:-2] in page_metadata.class_name:
  78. return page_metadata.degree_program
  79. except IndexError:
  80. pass
  81. try:
  82. if class_name[1] == "D":
  83. return DegreeProgram.DATASCIENCE
  84. if class_name[1] == "I":
  85. return DegreeProgram.INFORMATIK
  86. if class_name[1:3] == "iC":
  87. return DegreeProgram.ICOMPETENCE
  88. if class_name == "alle" or class_name[1:4] == "MSE":
  89. return DegreeProgram.AGNOSTIC
  90. except IndexError:
  91. pass
  92. raise RuntimeError(f"No Valid DegreeProgram found in line {third_line}")
  93. def get_semester_value(first_line: str) -> SemesterType:
  94. if SemesterType.FS.value in first_line and SemesterType.HS.value not in first_line:
  95. return SemesterType.FS
  96. if SemesterType.HS.value in first_line and SemesterType.FS.value not in first_line:
  97. return SemesterType.HS
  98. raise RuntimeError("Could not determine SemesterType")
  99. def get_semester_year(first_line: str) -> int:
  100. numeric_char_count = 0
  101. for index, char in enumerate(first_line):
  102. if char.isdigit():
  103. numeric_char_count += 1
  104. if numeric_char_count == 4:
  105. return int(first_line[index - 4 : index + 1])
  106. else:
  107. numeric_char_count = 0
  108. raise RuntimeError("Could not determine Semester year (yyyy)")