above_table_text.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. from re import search
  2. import logging
  3. from .models import (
  4. PageMetadata,
  5. SemesterType,
  6. Semester,
  7. ExportTimestamp,
  8. DegreeProgram,
  9. Date,
  10. Time,
  11. )
  def parse_above_table_text(
      txt: str, previous_page_metadata: list[PageMetadata]
  ) -> PageMetadata:
      """Build the ``PageMetadata`` for one page from the text above its table.

      ``txt`` must consist of exactly three newline-separated lines:

      1. the semester line (semester type and four-digit year),
      2. the export timestamp line (date and time),
      3. the class line (class name and degree program).

      Args:
          txt: The raw three-line header text.
          previous_page_metadata: Metadata of pages parsed earlier; handed to
              ``get_degree_program`` for its fallback heuristics.

      Returns:
          The assembled ``PageMetadata``.

      Raises:
          RuntimeError: If ``txt`` does not split into exactly three lines, or
              if one of the helper parsers rejects its line.
      """
      rows = txt.split("\n")
      if len(rows) != 3:
          raise RuntimeError("Invalid Number of Lines.")
      semester_line, timestamp_line, class_line = rows

      # The helpers are called in a fixed order so that the first failing
      # step determines which error the caller sees.
      kind: SemesterType = get_semester_value(semester_line)
      year = get_semester_year(semester_line)
      semester: Semester = Semester(year, kind)

      class_name: str = get_class_name(class_line)
      degree_program: DegreeProgram = get_degree_program(
          class_line, class_name, previous_page_metadata
      )

      export_timestamp: ExportTimestamp = get_export_timestamp(timestamp_line)

      return PageMetadata(semester, export_timestamp, class_name, degree_program)
  26. def get_export_timestamp(second_line: str) -> ExportTimestamp:
  27. line_length = len(second_line)
  28. match = search(r"(\d{1,2})\.(\d{1,2})\.(\d{4})", second_line)
  29. if match:
  30. date_dd, date_mm, date_yyyy = [int(entry) for entry in match.groups()]
  31. else:
  32. raise RuntimeError("Could not find date for timestamp extraction")
  33. for index, char in enumerate(second_line):
  34. if char == ":" and index - 2 >= 0 and index + 2 < line_length:
  35. try:
  36. time_hh = int(second_line[index - 2 : index])
  37. time_mm = int(second_line[index + 1 : index + 3])
  38. break
  39. except (TypeError, ValueError) as e:
  40. raise RuntimeError("Timestamp Extraction failed") from e
  41. else:
  42. raise RuntimeError("No Time found")
  43. return ExportTimestamp(Date(date_yyyy, date_mm, date_dd), Time(time_hh, time_mm))
  def get_class_name(third_line: str) -> str:
      """Return the class name found at the start of the third header line.

      If the line is longer than two characters and starts with ``"- "``,
      everything after that prefix is the class name. Otherwise the class
      name is the text before the first space.

      Args:
          third_line: The header line holding the class name.

      Returns:
          The extracted class name (may be empty if the line starts with a
          space).

      Raises:
          RuntimeError: If the line contains no space character at all.
      """
      if " " not in third_line:
          raise RuntimeError("No space character found in third line")

      has_dash_prefix = third_line.startswith("- ") and len(third_line) > 2
      if has_dash_prefix:
          return third_line[2:]

      head, _, _ = third_line.partition(" ")
      return head
  def get_degree_program(
      third_line: str, class_name: str, previous_page_metadata: list[PageMetadata]
  ) -> DegreeProgram:
      """Determine the degree program for the class described by ``third_line``.

      Resolution order:

      1. If the line mentions "Kontext BWL", "Kommunikation" and "GSW"
         together, the mixed program is returned.
      2. The first ``DegreeProgram`` member whose ``value`` occurs in the line.
      3. Heuristics based on previously parsed pages (a warning is logged):
         a previous page whose class name equals ``class_name`` without its
         last character; or, if the last two characters of ``class_name``
         are identical, a previous page whose class name contains
         ``class_name`` without those two characters.
      4. Heuristics based on the characters following the first character of
         ``class_name`` ("D", "I", "iC", "MSE") or the literal name "alle".

      Args:
          third_line: The header line holding class name and program text.
          class_name: The class name already extracted from ``third_line``.
          previous_page_metadata: Metadata of pages parsed earlier.

      Returns:
          The matching ``DegreeProgram`` member.

      Raises:
          RuntimeError: If none of the strategies yields a degree program.
      """
      logging.debug("class_name: '%s'", class_name)

      # Each marker needs its own membership test. The previous form
      # `"Kontext BWL" and "Kommunikation" and "GSW" in third_line` only
      # tested "GSW", because non-empty string literals are always truthy.
      mixed_markers = ("Kontext BWL", "Kommunikation", "GSW")
      if all(marker in third_line for marker in mixed_markers):
          return DegreeProgram.MIXED_BWL_GSW_KOMM

      for degree_program in DegreeProgram:
          if degree_program.value in third_line:
              return degree_program

      logging.warning("Using heuristics to guess the degree_program in %s", third_line)

      try:
          for page_metadata in previous_page_metadata:
              if page_metadata.class_name == class_name[:-1]:
                  return page_metadata.degree_program
          # Indexing raises IndexError for class names shorter than two
          # characters; that simply means this heuristic does not apply.
          if class_name[-1] == class_name[-2]:
              for page_metadata in previous_page_metadata:
                  if class_name[:-2] in page_metadata.class_name:
                      return page_metadata.degree_program
      except IndexError:
          pass

      try:
          # class_name[1] raises IndexError for names shorter than two
          # characters, which skips all of the checks below.
          if class_name[1] == "D":
              return DegreeProgram.DATASCIENCE
          if class_name[1] == "I":
              return DegreeProgram.INFORMATIK
          if class_name[1:3] == "iC":
              return DegreeProgram.ICOMPETENCE
          if class_name == "alle" or class_name[1:4] == "MSE":
              return DegreeProgram.AGNOSTIC
      except IndexError:
          pass

      raise RuntimeError(f"No Valid DegreeProgram found in line {third_line}")
  def get_semester_value(first_line: str) -> SemesterType:
      """Return the semester type named in the first header line.

      Exactly one of the ``SemesterType.FS`` / ``SemesterType.HS`` values
      must occur in the line.

      Args:
          first_line: The header line holding the semester information.

      Returns:
          ``SemesterType.FS`` or ``SemesterType.HS``.

      Raises:
          RuntimeError: If neither or both semester type values occur.
      """
      mentions_fs = SemesterType.FS.value in first_line
      mentions_hs = SemesterType.HS.value in first_line

      # Neither or both present: the line is ambiguous.
      if mentions_fs == mentions_hs:
          raise RuntimeError("Could not determine SemesterType")

      return SemesterType.FS if mentions_fs else SemesterType.HS
  def get_semester_year(first_line: str) -> int:
      """Return the first run of four consecutive digits in the line as an int.

      The previous implementation sliced five characters (the character
      before the year plus the four digits). It therefore only worked when
      the year was preceded by whitespace: "HS2023" raised ``ValueError``,
      "FS-2023" yielded ``-2023`` and a year at the very start of the line
      produced an empty slice. Matching exactly four digits fixes all three.

      Args:
          first_line: The header line holding the semester information.

      Returns:
          The four-digit year.

      Raises:
          RuntimeError: If the line contains no four consecutive digits.
      """
      year_match = search(r"\d{4}", first_line)
      if year_match is None:
          raise RuntimeError("Could not determine Semester year (yyyy)")
      return int(year_match.group())