Noah Vogt преди 1 месец
родител
ревизия
f81386655f
променени са 3 файла, в които са добавени 54 реда и са изтрити 18 реда
  1. 1 0
      config/constants.py
  2. 24 15
      parse/above_table_text.py
  3. 29 3
      parse/table_extraction.py

+ 1 - 0
config/constants.py

@@ -7,6 +7,7 @@ CLASS_PDF_TABLE_SETTINGS = {
     "join_tolerance": 3,
     "edge_min_length": 3,
 }
+CLASS_PDF_MIN_DIMENSIONS = 150  # minimum bbox width AND height a detected table needs to be picked as the usable table
 
 TOLERANCE = 3
 REQUESTS_TIMEOUT = 4

+ 24 - 15
parse/above_table_text.py

@@ -58,35 +58,44 @@ def get_class_name(third_line: str) -> str:
     first_space_index = third_line.find(" ")
     if first_space_index == -1:
         raise RuntimeError("No space character found in third line")
+    if len(third_line) > 2 and third_line[0:2] == "- ":
+        return third_line[2:]
     return third_line[0:first_space_index]
 
 
 def get_degree_program(
     third_line: str, class_name: str, previous_page_metadata: list[PageMetadata]
 ) -> DegreeProgram:
+    logging.debug("class_name: '%s'", class_name)
     if all(part in third_line for part in ("Kontext BWL", "Kommunikation", "GSW")):  # every marker must be present
         return DegreeProgram.MIXED_BWL_GSW_KOMM
     for degree_program in DegreeProgram:
         if degree_program.value in third_line:
             return degree_program
     logging.warning("Using heuristics to guess the degree_program in %s", third_line)
-    for page_metadata in previous_page_metadata:
-        if page_metadata.class_name == class_name[:-1]:
-            return page_metadata.degree_program
-    if class_name[-1] == class_name[-2]:
+    try:  # class_name[-1] / class_name[-2] raise IndexError for names shorter than two characters
         for page_metadata in previous_page_metadata:
-            if class_name[:-2] in page_metadata.class_name:
+            if page_metadata.class_name == class_name[:-1]:
                 return page_metadata.degree_program
-
-    if class_name[1] == "D":
-        return DegreeProgram.DATASCIENCE
-    if class_name[1] == "I":
-        return DegreeProgram.INFORMATIK
-    if class_name[1:3] == "iC":
-        return DegreeProgram.ICOMPETENCE
-
-    if class_name == "alle" or class_name[1:4] == "MSE":
-        return DegreeProgram.AGNOSTIC
+        if class_name[-1] == class_name[-2]:
+            for page_metadata in previous_page_metadata:
+                if class_name[:-2] in page_metadata.class_name:
+                    return page_metadata.degree_program
+    except IndexError:
+        pass  # name too short for the suffix heuristics; continue with the prefix checks
+
+    try:  # class_name[1] raises IndexError for names shorter than two characters
+        if class_name[1] == "D":
+            return DegreeProgram.DATASCIENCE
+        if class_name[1] == "I":
+            return DegreeProgram.INFORMATIK
+        if class_name[1:3] == "iC":
+            return DegreeProgram.ICOMPETENCE
+
+        if class_name == "alle" or class_name[1:4] == "MSE":
+            return DegreeProgram.AGNOSTIC
+    except IndexError:
+        pass  # no heuristic applied; fall through to the RuntimeError below
 
     raise RuntimeError(f"No Valid DegreeProgram found in line {third_line}")
 

+ 29 - 3
parse/table_extraction.py

@@ -2,7 +2,7 @@ import logging
 from pdfplumber.page import Page
 import pdfplumber
 
-from config import CLASS_PDF_TABLE_SETTINGS, ALLOWED_TIMESLOTS
+from config import CLASS_PDF_TABLE_SETTINGS, ALLOWED_TIMESLOTS, CLASS_PDF_MIN_DIMENSIONS
 from .models import (
     Weekday,
     TimeSlot,
@@ -49,7 +49,17 @@ def get_modules_from_weekday(
     """
     Extracts the modules (raw text and start/end) of a weekday on a single pdf page
     """
-    highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
+    try:
+        highest_y_level = timeslot_y_levels[allowed_time_slots[-1]].y2
+    except KeyError:
+        logging.warning("Highest allowed timeslot was not found. Trying lower one's.")
+        for time_slot in reversed(allowed_time_slots[:-1]):  # last remaining slot first (assumes ascending order - confirm)
+            try:
+                highest_y_level = timeslot_y_levels[time_slot].y2
+            except KeyError:
+                continue  # this slot is missing as well; try the one before it
+            else:
+                break  # stop only after a slot was actually found
     modules = []
     while len(unmerged_time_entries.cells) > 0:
         area = unmerged_time_entries.cells.pop(0)
@@ -126,7 +136,23 @@ def extract_data_from_class_pdf(
                 page_index + 1,
                 len(found_tables),
             )
-            table = found_tables[0]
+            usable_table_index: int = 0
+            if len(found_tables) > 1:
+                num_of_tables_with_at_least_min_dimensions: int = 0
+                for table_index, table in enumerate(found_tables):
+                    x0, top, x1, bottom = table.bbox
+                    width = x1 - x0
+                    height = bottom - top
+                    logging.debug(
+                        "table num %d: width: %d, height: %d",
+                        table_index + 1,
+                        width,
+                        height,
+                    )
+                    if width >= CLASS_PDF_MIN_DIMENSIONS and height >= CLASS_PDF_MIN_DIMENSIONS:
+                        num_of_tables_with_at_least_min_dimensions += 1
+                        usable_table_index = table_index
+            table = found_tables[usable_table_index]
             table_y1 = table.bbox[1]
             text_above_table = get_above_table_text(page, table_y1)