hace 1 mes · 2c184a3790
--- a/parse/__init__.py
+++ b/parse/__init__.py
@@ -3,5 +3,6 @@ from .parse_modules import (
 
				     get_modules_json,
			
 
				     get_modules_for_class_json,
			
 
				     deduplicate_modules,
			
 
				+    get_classes,
			
 
				 )
			
 
				 from .models import ClassPdfExtractionPageData, ClassJsonModule, RawLecturer, Lecturer
			
--- a/parse/above_table_text.py
+++ b/parse/above_table_text.py
@@ -80,12 +80,12 @@ def get_degree_program(
 
				 
			
 
				     if class_name[1] == "D":
			
 
				         return DegreeProgram.DATASCIENCE
			
 
				-    if class_name[1] == "I" or class_name[1:4] == "MSE":
			
 
				+    if class_name[1] == "I":
			
 
				         return DegreeProgram.INFORMATIK
			
 
				     if class_name[1:3] == "iC":
			
 
				         return DegreeProgram.ICOMPETENCE
			
 
				 
			
 
				-    if class_name == "alle":
			
 
				+    if class_name == "alle" or class_name[1:4] == "MSE":
			
 
				         return DegreeProgram.AGNOSTIC
			
 
				 
			
 
				     raise RuntimeError(f"No Valid DegreeProgram found in line {third_line}")
			
--- a/parse/models.py
+++ b/parse/models.py
@@ -93,7 +93,7 @@ class DegreeProgram(Enum):
 
				     MASCHINENBAU = "Maschinenbau"
			
 
				     SYSTEMTECHNIK = "Systemtechnik"
			
 
				     WIRTSCHAFT_ING = "Wirtschaftsingenieurwesen"
			
 
				-    AGNOSTIC = "agnostic"
			
 
				+    AGNOSTIC = "SG-???"
			
 
				 
			
 
				 
			
 
				 @dataclass
			
--- a/parse/parse_modules.py
+++ b/parse/parse_modules.py
@@ -13,6 +13,7 @@ from .models import (
 
				     TeachingType,
			
 
				     Weekday,
			
 
				     StartsWithMatch,
			
 
				+    ClassPdfExtractionPageData,
			
 
				 )
			
 
				 
			
 
				 
			
@@ -20,6 +21,7 @@ def get_modules_for_class_json(
 
				     modules: list[RawExtractedModule],
			
 
				     class_name: str,
			
 
				     degree_program: DegreeProgram,
			
 
				+    all_class_names: list[str],
			
 
				     valid_lecturer_shorthands: list[str] | None = None,
			
 
				 ) -> list[ClassJsonModule]:
			
 
				     """
			
@@ -30,7 +32,7 @@ def get_modules_for_class_json(
 
				 
			
 
				     for input_module in modules:
			
 
				         parsed_data: ParsedModuleCellTextData = parse_module_class_pdf_cell_text(
			
 
				-            input_module.text, class_name, degree_program, valid_lecturer_shorthands
			
 
				+            input_module.text, class_name, degree_program, all_class_names, valid_lecturer_shorthands
			
 
				         )
			
 
				 
			
 
				         output_modules.append(
			
@@ -97,6 +99,7 @@ def parse_module_class_pdf_cell_text(
 
				     text: str,
			
 
				     class_name: str,
			
 
				     degree_program: DegreeProgram,
			
 
				+    all_class_names: list[str],
			
 
				     valid_lecturer_shorthands: list[str] | None = None,
			
 
				 ) -> ParsedModuleCellTextData:
			
 
				     """
			
@@ -113,7 +116,7 @@ def parse_module_class_pdf_cell_text(
 
				         rooms = []
			
 
				         teaching_type = TeachingType.ON_SITE
			
 
				 
			
 
				-    module_shorthand = get_module_shorthand(lines[0], class_name)
			
 
				+    module_shorthand = get_module_shorthand(lines[0], class_name, all_class_names)
			
 
				 
			
 
				     return ParsedModuleCellTextData(
			
 
				         module_shorthand=module_shorthand,
			
@@ -185,7 +188,9 @@ def matches_startswith(
 
				     return StartsWithMatch(shorthand_with_start, num_of_startwith_matches)
			
 
				 
			
 
				 
			
 
				-def get_module_shorthand(first_line: str, class_name: str) -> str:
			
 
				+def get_module_shorthand(
			
 
				+    first_line: str, class_name: str, all_class_names: list[str]
			
 
				+) -> str:
			
 
				     """
			
 
				     Get the module shorthand based on the first class pdf cell line.
			
 
				     """
			
@@ -195,8 +200,24 @@ def get_module_shorthand(first_line: str, class_name: str) -> str:
 
				     word = words[0]
			
 
				     if len(words) == 1:
			
 
				         for i in reversed(range(len(class_name) + 1)):
			
 
				-            if word.endswith(class_name[0:i]):
			
 
				-                word = word[: word.rfind(class_name[0:i])]
			
 
				+            class_name_part = class_name[0:i]
			
 
				+            if word.endswith(class_name_part):
			
 
				+                word = word[: word.rfind(class_name_part)]
			
 
				+                debug_msg = (
			
 
				+                    f"cut off class name part '{class_name_part}'"
			
 
				+                    + f" of class name '{class_name}' in line '{first_line}'"
			
 
				+                )
			
 
				+                logging.debug(debug_msg)
			
 
				+                break
			
 
				+
			
 
				+        for foreign_class_name in all_class_names:
			
 
				+            if word.endswith(foreign_class_name):
			
 
				+                word = word[: word.rfind(foreign_class_name)]
			
 
				+                logging.debug(
			
 
				+                    "cut off class name '%s' in line '%s'",
			
 
				+                    foreign_class_name,
			
 
				+                    first_line,
			
 
				+                )
			
 
				                 break
			
 
				     if len(word) == 0:
			
 
				         raise RuntimeError("Module shorthand cannot be empty")
			
@@ -234,3 +255,10 @@ def get_rooms(third_line: str) -> list[str]:
 
				 
			
 
				     words = third_line.split(" ")
			
 
				     return words
			
 
				+
			
 
				+
			
 
				def get_classes(extraction_data: list[ClassPdfExtractionPageData]) -> list[str]:
				    """
				    Collect the class name stored in the metadata of each extracted page.
				    The result has one entry per element of `extraction_data`, in the
				    same order; repeated names are kept as they are.
				    """
				    class_names: list[str] = []
				    for page in extraction_data:
				        # Each page carries its class name inside its page metadata.
				        class_names.append(page.page_metadata.class_name)
				    return class_names
			
--- a/parse_class_pdf.py
+++ b/parse_class_pdf.py
@@ -9,6 +9,7 @@ from parse import (
 
				     extract_data_from_class_pdf,
			
 
				     get_modules_for_class_json,
			
 
				     get_modules_json,
			
 
				+    get_classes,
			
 
				     deduplicate_modules,
			
 
				     ClassPdfExtractionPageData,
			
 
				     ClassJsonModule,
			
@@ -93,6 +94,7 @@ def main() -> None:
 
				             data.raw_extracted_modules,
			
 
				             data.page_metadata.class_name,
			
 
				             data.page_metadata.degree_program,
			
 
				+            get_classes(extraction_data),
			
 
				             valid_lecturer_shorthands,
			
 
				         )
			
 
				     ]