| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- from re import search
- import logging
- from .models import (
- PageMetadata,
- SemesterType,
- Semester,
- ExportTimestamp,
- DegreeProgram,
- Date,
- Time,
- )
def parse_above_table_text(
    txt: str, previous_page_metadata: list[PageMetadata]
) -> PageMetadata:
    """Build the ``PageMetadata`` for a page from the text above its table.

    The text must consist of exactly three newline-separated lines:
    the first is passed to the semester helpers, the second to the
    export-timestamp helper and the third to the class-name and
    degree-program helpers.

    Args:
        txt: Raw text block found above the table.
        previous_page_metadata: Metadata of earlier pages, forwarded to
            ``get_degree_program`` for its fallback heuristics.

    Returns:
        The assembled ``PageMetadata``.

    Raises:
        RuntimeError: If ``txt`` does not split into exactly three lines,
            or if one of the helper functions raises it.
    """
    parts = txt.split("\n")
    if len(parts) != 3:
        raise RuntimeError("Invalid Number of Lines.")
    semester_line, timestamp_line, class_line = parts

    # Helper calls keep a fixed order so the same error surfaces first
    # when more than one line is malformed.
    term: SemesterType = get_semester_value(semester_line)
    semester: Semester = Semester(get_semester_year(semester_line), term)

    class_name: str = get_class_name(class_line)
    degree_program: DegreeProgram = get_degree_program(
        class_line, class_name, previous_page_metadata
    )

    export_timestamp: ExportTimestamp = get_export_timestamp(timestamp_line)
    return PageMetadata(semester, export_timestamp, class_name, degree_program)
def get_export_timestamp(second_line: str) -> ExportTimestamp:
    """Extract the export date and time from the second header line.

    The date is the first ``d.m.yyyy`` pattern in the line. The time is
    read around the first ``:`` that has at least two characters before
    it and at least two characters after it: the two characters before
    are parsed as the hour, the two after as the minute.

    Args:
        second_line: Second line of the text above the table.

    Returns:
        An ``ExportTimestamp`` built from the parsed ``Date`` and ``Time``.

    Raises:
        RuntimeError: If no date is found, if no suitably placed ``:``
            exists, or if the characters around that ``:`` are not
            integers.
    """
    date_match = search(r"(\d{1,2})\.(\d{1,2})\.(\d{4})", second_line)
    if date_match is None:
        raise RuntimeError("Could not find date for timestamp extraction")
    day, month, year = map(int, date_match.groups())

    # A colon qualifies only if two characters precede it and two follow it.
    upper_bound = len(second_line) - 2
    colon_at = next(
        (
            position
            for position, character in enumerate(second_line)
            if character == ":" and 2 <= position < upper_bound
        ),
        None,
    )
    if colon_at is None:
        raise RuntimeError("No Time found")

    # Only the first qualifying colon is tried; a parse failure there is final.
    try:
        hours = int(second_line[colon_at - 2 : colon_at])
        minutes = int(second_line[colon_at + 1 : colon_at + 3])
    except (TypeError, ValueError) as e:
        raise RuntimeError("Timestamp Extraction failed") from e

    return ExportTimestamp(Date(year, month, day), Time(hours, minutes))
def get_class_name(third_line: str) -> str:
    """Return the class name, i.e. everything before the first space.

    Args:
        third_line: Third line of the text above the table.

    Returns:
        The substring preceding the first space character (empty if the
        line starts with a space).

    Raises:
        RuntimeError: If the line contains no space character.
    """
    name, separator, _ = third_line.partition(" ")
    if not separator:
        raise RuntimeError("No space character found in third line")
    return name
def get_degree_program(
    third_line: str, class_name: str, previous_page_metadata: list[PageMetadata]
) -> DegreeProgram:
    """Determine the degree program of a page from its third header line.

    Resolution order:

    1. If the line contains all of "Kontext BWL", "Kommunikation" and
       "GSW", the mixed program is returned.
    2. Otherwise the first ``DegreeProgram`` member (in enum iteration
       order) whose value occurs in the line is returned.
    3. Otherwise a warning is logged and heuristics are applied: a
       previous page whose class name equals ``class_name`` without its
       last character; then, if the last two characters of ``class_name``
       are equal, a previous page whose class name contains ``class_name``
       without those two characters; finally fixed letter patterns
       starting at the second character of ``class_name``.

    Args:
        third_line: Third line of the text above the table.
        class_name: Class name already extracted from ``third_line``.
        previous_page_metadata: Metadata of earlier pages used by the
            heuristics.

    Returns:
        The matching ``DegreeProgram`` member.

    Raises:
        RuntimeError: If no rule yields a degree program.
    """
    # Every marker must be present. A chained
    # `"a" and "b" and "c" in line` would only test the last literal,
    # because non-empty string literals are always truthy.
    mixed_markers = ("Kontext BWL", "Kommunikation", "GSW")
    if all(marker in third_line for marker in mixed_markers):
        return DegreeProgram.MIXED_BWL_GSW_KOMM

    # NOTE(review): assumes every DegreeProgram value is a str — confirm.
    for degree_program in DegreeProgram:
        if degree_program.value in third_line:
            return degree_program

    logging.warning("Using heuristics to guess the degree_program in %s", third_line)

    for page_metadata in previous_page_metadata:
        if page_metadata.class_name == class_name[:-1]:
            return page_metadata.degree_program

    # Length guard: names shorter than two characters must fall through to
    # the RuntimeError below instead of raising IndexError.
    if len(class_name) >= 2 and class_name[-1] == class_name[-2]:
        for page_metadata in previous_page_metadata:
            if class_name[:-2] in page_metadata.class_name:
                return page_metadata.degree_program

    # Slices instead of indexing so short names cannot raise IndexError.
    if class_name[1:2] == "D":
        return DegreeProgram.DATASCIENCE
    if class_name[1:2] == "I":
        return DegreeProgram.INFORMATIK
    if class_name[1:3] == "iC":
        return DegreeProgram.ICOMPETENCE
    if class_name == "alle" or class_name[1:4] == "MSE":
        return DegreeProgram.AGNOSTIC

    raise RuntimeError(f"No Valid DegreeProgram found in line {third_line}")
def get_semester_value(first_line: str) -> SemesterType:
    """Determine the semester type named in the first header line.

    Exactly one of the ``SemesterType.FS`` / ``SemesterType.HS`` values
    must occur in the line.

    Args:
        first_line: First line of the text above the table.

    Returns:
        ``SemesterType.FS`` or ``SemesterType.HS``.

    Raises:
        RuntimeError: If neither value or both values occur in the line.
    """
    mentions_fs = SemesterType.FS.value in first_line
    mentions_hs = SemesterType.HS.value in first_line

    # Both present or both absent means the line is ambiguous.
    if mentions_fs == mentions_hs:
        raise RuntimeError("Could not determine SemesterType")
    return SemesterType.FS if mentions_fs else SemesterType.HS
def get_semester_year(first_line: str) -> int:
    """Extract the four-digit semester year from the first header line.

    The first run of four consecutive digits in the line is taken as the
    year, wherever it sits: at the start of the line, after whitespace,
    or directly after a letter.

    Args:
        first_line: First line of the text above the table.

    Returns:
        The year as an integer.

    Raises:
        RuntimeError: If the line contains no four consecutive digits.
    """
    # Match exactly the four digits. Slicing five characters back from the
    # fourth digit would drag in the preceding character and break on
    # inputs such as "FS2024" or "2024 FS".
    year_match = search(r"\d{4}", first_line)
    if year_match is None:
        raise RuntimeError("Could not determine Semester year (yyyy)")
    return int(year_match.group())
|