преди 1 месец · f81386655f
--- a/config/constants.py
+++ b/config/constants.py
@@ -7,6 +7,7 @@ CLASS_PDF_TABLE_SETTINGS = {
 
				     "join_tolerance": 3,
			
 
				     "edge_min_length": 3,
			
 
				 }
			
 
				+CLASS_PDF_MIN_DIMENSIONS = 150
			
 
				 
			
 
				 TOLERANCE = 3
			
 
				 REQUESTS_TIMEOUT = 4
			
--- a/parse/above_table_text.py
+++ b/parse/above_table_text.py
@@ -58,35 +58,44 @@ def get_class_name(third_line: str) -> str:
 
				     first_space_index = third_line.find(" ")
			
 
				     if first_space_index == -1:
			
 
				         raise RuntimeError("No space character found in third line")
			
 
				+    if len(third_line) > 2 and third_line[0:2] == "- ":
			
 
				+        return third_line[2:]
			
 
				     return third_line[0:first_space_index]
			
 
				 
			
 
				 
			
 
				 def get_degree_program(
			
 
				     third_line: str, class_name: str, previous_page_metadata: list[PageMetadata]
			
 
				 ) -> DegreeProgram:
			
 
				+    logging.debug("class_name: '%s'", class_name)
			
 
				     if "Kontext BWL" and "Kommunikation" and "GSW" in third_line:
			
 
				         return DegreeProgram.MIXED_BWL_GSW_KOMM
			
 
				     for degree_program in DegreeProgram:
			
 
				         if degree_program.value in third_line:
			
 
				             return degree_program
			
 
				     logging.warning("Using heuristics to guess the degree_program in %s", third_line)
			
 
				-    for page_metadata in previous_page_metadata:
			
 
				-        if page_metadata.class_name == class_name[:-1]:
			
 
				-            return page_metadata.degree_program
			
 
				-    if class_name[-1] == class_name[-2]:
			
 
				+    try:
			
 
				         for page_metadata in previous_page_metadata:
			
 
				-            if class_name[:-2] in page_metadata.class_name:
			
 
				+            if page_metadata.class_name == class_name[:-1]:
			
 
				                 return page_metadata.degree_program
			
 
				-
			
 
				-    if class_name[1] == "D":
			
 
				-        return DegreeProgram.DATASCIENCE
			
 
				-    if class_name[1] == "I":
			
 
				-        return DegreeProgram.INFORMATIK
			
 
				-    if class_name[1:3] == "iC":
			
 
				-        return DegreeProgram.ICOMPETENCE
			
 
				-
			
 
				-    if class_name == "alle" or class_name[1:4] == "MSE":
			
 
				-        return DegreeProgram.AGNOSTIC
			
 
				+        if class_name[-1] == class_name[-2]:
			
 
				+            for page_metadata in previous_page_metadata:
			
 
				+                if class_name[:-2] in page_metadata.class_name:
			
 
				+                    return page_metadata.degree_program
			
 
				+    except IndexError:
			
 
				+        pass
			
 
				+
			
 
				+    try:
			
 
				+        if class_name[1] == "D":
			
 
				+            return DegreeProgram.DATASCIENCE
			
 
				+        if class_name[1] == "I":
			
 
				+            return DegreeProgram.INFORMATIK
			
 
				+        if class_name[1:3] == "iC":
			
 
				+            return DegreeProgram.ICOMPETENCE
			
 
				+
			
 
				+        if class_name == "alle" or class_name[1:4] == "MSE":
			
 
				+            return DegreeProgram.AGNOSTIC
			
 
				+    except IndexError:
			
 
				+        pass
			
 
				 
			
 
				     raise RuntimeError(f"No Valid DegreeProgram found in line {third_line}")
			
 
				 
			
--- a/parse/table_extraction.py
+++ b/parse/table_extraction.py
@@ -2,7 +2,7 @@ import logging
 
				 from pdfplumber.page import Page
			
 
				 import pdfplumber
			
 
				 
			
 
				-from config import CLASS_PDF_TABLE_SETTINGS, ALLOWED_TIMESLOTS
			
 
				+from config import CLASS_PDF_TABLE_SETTINGS, ALLOWED_TIMESLOTS, CLASS_PDF_MIN_DIMENSIONS
			
 
				 from .models import (
			
 
				     Weekday,
			
 
				     TimeSlot,
			
@@ -49,7 +49,17 @@ def get_modules_from_weekday(
 
				     """
			
 
				     Extracts the modules (raw text and start/end) of a weekday on a single pdf page
			
 
				     """
			
 
				-    highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
			
 
				+    try:
			
 
				+        highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
			
 
				+    except KeyError:
			
 
				+        logging.warning("Highest allowed timeslot was not found. Trying lower one's.")
			
 
				+        for time_slot in allowed_time_slots[:-1]:
			
 
				+            try:
			
 
				+                highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
			
 
				+            except KeyError:
			
 
				+                continue
			
 
				+            finally:
			
 
				+                break
			
 
				     modules = []
			
 
				     while len(unmerged_time_entries.cells) > 0:
			
 
				         area = unmerged_time_entries.cells.pop(0)
			
@@ -126,7 +136,23 @@ def extract_data_from_class_pdf(
 
				                 page_index + 1,
			
 
				                 len(found_tables),
			
 
				             )
			
 
				-            table = found_tables[0]
			
 
				+            usable_table_index: int = 0
			
 
				+            if len(found_tables) > 1:
			
 
				+                num_of_tables_with_at_least_min_dimensions: int = 0
			
 
				+                for table_index, table in enumerate(found_tables):
			
 
				+                    x0, top, x1, bottom = table.bbox
			
 
				+                    width = x1 - x0
			
 
				+                    height = bottom - top
			
 
				+                    logging.debug(
			
 
				+                        "table num %d: width: %d, height: %d",
			
 
				+                        table_index + 1,
			
 
				+                        width,
			
 
				+                        height,
			
 
				+                    )
			
 
				+                    if width >= CLASS_PDF_MIN_DIMENSIONS and height >= CLASS_PDF_MIN_DIMENSIONS:
			
 
				+                        num_of_tables_with_at_least_min_dimensions += 1
			
 
				+                        usable_table_index = table_index
			
 
				+            table = found_tables[usable_table_index]
			
 
				             table_y1 = table.bbox[1]
			
 
				             text_above_table = get_above_table_text(page, table_y1)