# noah/modulplaner-backend
							import logging
from pdfplumber.page import Page
import pdfplumber

from config import CLASS_PDF_TABLE_SETTINGS, ALLOWED_TIMESLOTS, CLASS_PDF_MIN_DIMENSIONS
from .models import (
    Weekday,
    TimeSlot,
    YLevel,
    RawExtractedModule,
    UnmergedTimeEntries,
    Area,
    HorizontalLine,
    ClassPdfExtractionPageData,
    PageMetadata,
)
from .above_table_text import parse_above_table_text
from .geometry import (
    get_timeslot_for_area,
    is_line_at_bottom,
    is_area_below,
    is_vertical_match,
)
from .img import is_mostly_white_area

# One TimeSlot per configured entry of ALLOWED_TIMESLOTS, kept in config order.
# Index 0 of an entry is used as the start time, index 1 as the end time.
allowed_time_slots: list[TimeSlot] = [
    TimeSlot(start_time=slot_bounds[0], end_time=slot_bounds[1])
    for slot_bounds in ALLOWED_TIMESLOTS
]


def get_weekday_from_text(text: str) -> Weekday | None:
    """
    Look up the Weekday whose display name equals the given text.

    The comparison is an exact string equality check against
    ``display_name``. The first matching member (in enum iteration order)
    is returned, or None when no member matches.
    """
    return next(
        (candidate for candidate in Weekday if candidate.display_name == text),
        None,
    )


def get_modules_from_weekday(
    weekday: Weekday,
    unmerged_time_entries: UnmergedTimeEntries,
    page: Page,
    timeslot_y_levels: dict[TimeSlot, YLevel],
    page_number: int,
) -> list[RawExtractedModule]:
    """
    Extracts the modules (raw text and start/end) of a weekday on a single pdf page

    Cells are taken one after another from ``unmerged_time_entries.cells``
    (the list is consumed in place). Cells reported as mostly white are
    skipped. A cell that has no horizontal line at its bottom is merged with
    the cell below it, repeatedly, until a bottom line is found or the cell
    ends at the y2 of the last detected timeslot row.

    Args:
        weekday: the weekday the cells belong to
        unmerged_time_entries: cells and horizontal lines of this weekday
        page: the pdf page the cells are located on
        timeslot_y_levels: y levels of the timeslot rows detected on the page
        page_number: stored as ``source_page_number`` on every module

    Returns:
        One RawExtractedModule per (possibly merged) non-white cell.

    Raises:
        RuntimeError: if no timeslot can be matched to a cell area, if no
            cell below is found for a cell that needs merging, or if the
            table end is needed but no timeslot row was detected at all.
    """
    # y2 of the last timeslot row that was actually detected on this page.
    # The last allowed timeslot is preferred; if it is missing, walk
    # backwards so the closest earlier detected timeslot is used instead.
    highest_y_level = None
    for fallback_rank, time_slot in enumerate(reversed(allowed_time_slots)):
        try:
            highest_y_level = timeslot_y_levels[time_slot].y2
        except KeyError:
            if fallback_rank == 0:
                logging.warning(
                    "Highest allowed timeslot was not found. Trying lower one's."
                )
            continue
        break

    modules = []
    while len(unmerged_time_entries.cells) > 0:
        area = unmerged_time_entries.cells.pop(0)
        if is_mostly_white_area(page, area):
            logging.debug("mostly white cell skipped")
            continue
        timeslot = get_timeslot_for_area(area, timeslot_y_levels)
        if timeslot is None:
            raise RuntimeError("Could not match TimeSlot to Cell Area")
        # the start is fixed by the first cell, the end by the merged area
        start_seconds = timeslot.start_seconds()

        while True:
            logging.debug("searching for line at bottom of: %s", area)
            logging.debug("line candidates:")
            for line in unmerged_time_entries.horizontal_lines:
                logging.debug("testing horizontal line: %s", line)
                if is_line_at_bottom(area, line, tolerance=20):
                    logging.debug("candidate line found")
                    break
            else:
                # No separator line closes this cell: it either ends at the
                # last timeslot row or continues in the cell below it.
                if highest_y_level is None:
                    raise RuntimeError(
                        "No timeslot row found to determine the end of the "
                        f"table on {weekday}"
                    )
                if is_vertical_match(area.y2, highest_y_level):
                    logging.debug("highest y level matched")
                    break
                found_matching_next_cell_index = next(
                    (
                        index
                        for index, potential_cell_below in enumerate(
                            unmerged_time_entries.cells
                        )
                        if is_area_below(potential_cell_below, area)
                    ),
                    None,
                )
                if found_matching_next_cell_index is None:
                    raise RuntimeError(
                        f"No matching cell below found to merge with on {weekday}"
                    )
                logging.debug("vertically merging cells for %s", weekday)
                matched_area = unmerged_time_entries.cells.pop(
                    found_matching_next_cell_index
                )
                logging.debug("matched cell area: %s", matched_area)
                area = Area(
                    x1=area.x1, y1=area.y1, x2=matched_area.x2, y2=matched_area.y2
                )
                # search again for a bottom line of the enlarged area
                continue
            # a line at the bottom was found
            break

        text = page.crop((area.x1, area.y1, area.x2, area.y2)).extract_text()
        timeslot = get_timeslot_for_area(area, timeslot_y_levels)
        if timeslot is None:
            raise RuntimeError("Could not match TimeSlot to Cell Area")
        end_seconds = timeslot.end_seconds()
        modules.append(
            RawExtractedModule(
                weekday=weekday,
                start_seconds=start_seconds,
                end_seconds=end_seconds,
                text=text,
                source_page_number=page_number,
            )
        )
    return modules


def extract_data_from_class_pdf(
    input_filename: str, lecturers_file=None
) -> list[ClassPdfExtractionPageData]:
    """
    Extracts all data from class timetable pdf's

    For every page the timetable table is located, the weekday columns and
    timeslot rows are detected, the cells and horizontal lines of every
    weekday are collected and turned into raw modules, and the text above
    the table is parsed into page metadata.

    Args:
        input_filename: path of the pdf, passed to ``pdfplumber.open``
        lecturers_file: not used by this function

    Returns:
        One ClassPdfExtractionPageData per page, in page order.

    Raises:
        RuntimeError: if a page contains no table at all; errors raised by
            get_modules_from_weekday are passed on unchanged.
    """
    extraction_data: list[ClassPdfExtractionPageData] = []
    previous_page_metadata: list[PageMetadata] = []
    with pdfplumber.open(input_filename) as pdf:
        for page_index, page in enumerate(pdf.pages):
            page_number = page_index + 1
            weekday_areas: dict[Weekday, Area] = {
                day: Area(x1=0, y1=0, x2=0, y2=0) for day in Weekday
            }
            timeslot_y_levels: dict[TimeSlot, YLevel] = {}

            found_tables = page.find_tables(CLASS_PDF_TABLE_SETTINGS)
            logging.debug(
                "amount of tables found on page %d: %d",
                page_number,
                len(found_tables),
            )
            if not found_tables:
                # fail with a clear message instead of a bare IndexError
                raise RuntimeError(
                    f"No table found on page {page_number} of '{input_filename}'"
                )

            # With several tables, the last one that reaches the minimum
            # dimensions is used; otherwise the first table is used.
            usable_table_index: int = 0
            if len(found_tables) > 1:
                num_of_tables_with_at_least_min_dimensions: int = 0
                for table_index, candidate_table in enumerate(found_tables):
                    x0, top, x1, bottom = candidate_table.bbox
                    width = x1 - x0
                    height = bottom - top
                    logging.debug(
                        "table num %d: width: %d, height: %d",
                        table_index + 1,
                        width,
                        height,
                    )
                    if (
                        width >= CLASS_PDF_MIN_DIMENSIONS
                        and height >= CLASS_PDF_MIN_DIMENSIONS
                    ):
                        num_of_tables_with_at_least_min_dimensions += 1
                        usable_table_index = table_index
                if num_of_tables_with_at_least_min_dimensions != 1:
                    logging.warning(
                        "expected exactly one table with minimum dimensions "
                        "on page %d, found %d",
                        page_number,
                        num_of_tables_with_at_least_min_dimensions,
                    )
            table = found_tables[usable_table_index]
            table_y1 = table.bbox[1]
            text_above_table = get_above_table_text(page, table_y1)

            empty_start_found = False

            # get weekday and timeslot areas
            expected_timeslot_index = 0
            for row_index, row in enumerate(table.rows):
                if row_index == 0:
                    # header row: the weekday names define the columns
                    for column_index, cell in enumerate(row.cells):
                        logging.debug("row: %d, col: %d", row_index, column_index)
                        logging.debug(cell)
                        if cell is None:
                            logging.debug("None Table Cell Found")
                            continue
                        cell_text = page.crop(
                            (cell[0], cell[1], cell[2], cell[3])
                        ).extract_text()
                        if not empty_start_found and len(cell_text) == 0:
                            logging.debug("empty start found")
                            empty_start_found = True

                        weekday_enum = get_weekday_from_text(cell_text)
                        if weekday_enum:
                            logging.debug("Weekday %s found", cell_text)
                            # y2 stays 0 until the last timeslot row is seen
                            weekday_areas[weekday_enum] = Area(
                                x1=cell[0], y1=cell[3], x2=cell[2], y2=0
                            )
                    continue

                # every other row: the first cell holds the timeslot label
                logging.debug("row: %d, col: %d", row_index, 0)
                cell = row.cells[0]
                if cell is None:
                    logging.warning("Unexpected None Table Cell Found")
                    continue
                cell_text = page.crop(
                    (cell[0], cell[1], cell[2], cell[3])
                ).extract_text()
                if expected_timeslot_index >= len(allowed_time_slots):
                    # more rows than allowed timeslots: nothing left to match
                    logging.warning("Unexpected Timeslot found: '%s'", cell_text)
                    continue
                target_timeslot = allowed_time_slots[expected_timeslot_index]
                if not (
                    target_timeslot.start_time in cell_text
                    and target_timeslot.end_time in cell_text
                ):
                    logging.warning("Unexpected Timeslot found: '%s'", cell_text)
                    continue
                # assumes this is the last timeslot ever
                # NOTE(review): hard-coded; presumably meant to equal the last
                # entry of allowed_time_slots - confirm against the config
                if target_timeslot == TimeSlot(start_time="20:30", end_time="21:15"):
                    for weekday in Weekday:
                        weekday_areas[weekday] = Area(
                            x1=weekday_areas[weekday].x1,
                            y1=weekday_areas[weekday].y1,
                            x2=weekday_areas[weekday].x2,
                            y2=cell[3],
                        )
                timeslot_y_levels[target_timeslot] = YLevel(y1=cell[1], y2=cell[3])
                expected_timeslot_index += 1

            # Only horizontal lines are relevant; filter them once per page
            # instead of once per weekday.
            horizontal_page_lines = [
                line_found
                for line_found in page.lines
                if line_found["y0"] == line_found["y1"]
            ]

            unmerged_time_entries_by_weekday: dict[Weekday, UnmergedTimeEntries] = {}
            for weekday in Weekday:
                weekday_entries = UnmergedTimeEntries(cells=[], horizontal_lines=[])
                unmerged_time_entries_by_weekday[weekday] = weekday_entries
                target_area = weekday_areas[weekday]
                logging.debug("target_area: %s", target_area)

                # collect every table cell that lies fully inside the column
                for row_index, row in enumerate(table.rows):
                    for column_index, cell in enumerate(row.cells):
                        if cell is None:
                            logging.debug("None table cell found")
                            continue
                        logging.debug("row: %d, col: %d", row_index, column_index)
                        logging.debug("cell: %s", cell)
                        if (
                            target_area.x1 <= cell[0]
                            and target_area.y1 <= cell[1]
                            and target_area.x2 >= cell[2]
                            and target_area.y2 >= cell[3]
                        ):
                            weekday_entries.cells.append(
                                Area(x1=cell[0], y1=cell[1], x2=cell[2], y2=cell[3])
                            )
                            logging.debug("%s cell found", weekday)

                # collect the horizontal lines within the column's x range
                for line_found in horizontal_page_lines:
                    line_x1 = line_found["x0"]
                    line_x2 = line_found["x1"]
                    if target_area.x1 <= line_x1 and target_area.x2 >= line_x2:
                        logging.debug("%s timeslot seperator line found", weekday)
                        weekday_entries.horizontal_lines.append(
                            HorizontalLine(
                                x1=line_x1, x2=line_x2, y=line_found["bottom"]
                            )
                        )

            all_modules: list[RawExtractedModule] = []
            for weekday in Weekday:
                all_modules.extend(
                    get_modules_from_weekday(
                        weekday,
                        unmerged_time_entries_by_weekday[weekday],
                        page,
                        timeslot_y_levels,
                        page_number,
                    )
                )
            page_metadata = parse_above_table_text(
                text_above_table, previous_page_metadata
            )
            previous_page_metadata.append(page_metadata)
            extraction_data.append(
                ClassPdfExtractionPageData(
                    raw_extracted_modules=all_modules, page_metadata=page_metadata
                )
            )
        return extraction_data


def get_above_table_text(page: Page, table_y1: float) -> str:
    """
    Return the text of the page region above the timetable.

    The region spans the full page width from y=0 down to ``table_y1``.
    The extracted text is also written to the debug log; it is used for
    metadata parsing.
    """
    region_above_table = (0, 0, page.width, table_y1)
    header_text = page.crop(region_above_table).extract_text()

    logging.debug("Text found above the table:")
    logging.debug(header_text)

    return header_text