models.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. from enum import Enum, unique
  2. from typing import Annotated, Any
  3. from dataclasses import dataclass
  4. from pydantic import BaseModel, PlainSerializer, Field, ConfigDict, BeforeValidator
  5. from pdfplumber.page import Page
  6. from pdfplumber.table import Table
  7. class XLevel(BaseModel):
  8. x1: float
  9. x2: float
  10. class YLevel(BaseModel):
  11. y1: float
  12. y2: float
  13. class HorizontalLine(BaseModel):
  14. x1: float
  15. x2: float
  16. y: float
  17. class Area(BaseModel):
  18. x1: float
  19. y1: float
  20. x2: float
  21. y2: float
  22. @unique
  23. class Weekday(Enum):
  24. MONTAG = ("Montag", 0)
  25. DIENSTAG = ("Dienstag", 1)
  26. MITTWOCH = ("Mittwoch", 2)
  27. DONNERSTAG = ("Donnerstag", 3)
  28. FREITAG = ("Freitag", 4)
  29. SAMSTAG = ("Samstag", 5)
  30. SONNTAG = ("Sonntag", 6)
  31. def __init__(self, display_name, index):
  32. self.display_name = display_name
  33. self.index = index
  34. @unique
  35. class SemesterType(Enum):
  36. HS = "Herbstsemester"
  37. FS = "Frühlingssemester"
  38. class Semester(BaseModel):
  39. model_config = ConfigDict(frozen=True)
  40. yyyy: int
  41. semester_type: SemesterType
  42. class Date(BaseModel):
  43. yyyy: int
  44. mm: int
  45. dd: int
  46. class Time(BaseModel):
  47. hh: int
  48. mm: int
  49. class ExportTimestamp(BaseModel):
  50. date: Date
  51. time: Time
  52. @unique
  53. class DegreeProgram(Enum):
  54. DATASCIENCE = "Data Science"
  55. ELEK_U_INFO = "Elektro- und Informationstechnik"
  56. ENER_U_UMWELT = "Energie- und Umwelttechnik"
  57. ICOMPETENCE = "iCompetence"
  58. INFORMATIK = "Informatik"
  59. KONTEXT_BWL = "Kontext BWL"
  60. KONTEXT_ENGLISCH = "Kontext Englisch"
  61. KONTEXT_GSW = "Kontext GSW"
  62. KONTEXT_KOMM = "Kontext Kommunikation"
  63. MIXED_BWL_GSW_KOMM = "Mixed BWL, GSW, Kommunikation"
  64. MASCHINENBAU = "Maschinenbau"
  65. SYSTEMTECHNIK = "Systemtechnik"
  66. WIRTSCHAFT_ING = "Wirtschaftsingenieurwesen"
  67. AGNOSTIC = "SG-???"
  68. class PageMetadata(BaseModel):
  69. semester: Semester
  70. export_timestamp: ExportTimestamp
  71. class_name: str
  72. degree_program: DegreeProgram
  73. class UnmergedTimeEntries(BaseModel):
  74. cells: list[Area]
  75. horizontal_lines: list[HorizontalLine]
  76. class TimeSlot(BaseModel):
  77. model_config = ConfigDict(frozen=True)
  78. start_time: str
  79. end_time: str
  80. def start_seconds(self) -> int:
  81. hours, minutes = map(int, self.start_time.split(":"))
  82. return hours * 3600 + minutes * 60
  83. def end_seconds(self) -> int:
  84. hours, minutes = map(int, self.end_time.split(":"))
  85. return hours * 3600 + minutes * 60
  86. def to_tuple_if_list(v: Any) -> Any:
  87. if isinstance(v, list):
  88. return tuple(v)
  89. return v
  90. # needed for pydantic to correctly parse the custom Weekday Enum
  91. TolerantWeekday = Annotated[Weekday, BeforeValidator(to_tuple_if_list)]
  92. class RawExtractedModule(BaseModel):
  93. weekday: TolerantWeekday
  94. start_seconds: int
  95. end_seconds: int
  96. text: str
  97. source_page_number: int
  98. @unique
  99. class TeachingType(Enum):
  100. ON_SITE = "on_site"
  101. ONLINE = "online"
  102. HYBRID = "hybrid"
  103. BLOCK = "blockmodule"
  104. class RawLecturer(BaseModel):
  105. """
  106. Basic representation of an extracted lecturer from a pdf that needs to be parsed.
  107. """
  108. shorthand: str
  109. firstname: str
  110. surname: str
  111. class Lecturer(BaseModel):
  112. """
  113. JSON-serializable representation of a parsed lecturer ready to be exported.
  114. """
  115. short: str
  116. surname: str
  117. firstname: str
  118. # tells pydantic to to use the index field for the special Weekday Enum
  119. CustomWeekday = Annotated[Weekday, PlainSerializer(lambda v: v.index, return_type=int)]
  120. class ParsedModuleCellTextData(BaseModel):
  121. module_shorthand: str
  122. degree_program: DegreeProgram
  123. class_name: str
  124. rooms: list[str]
  125. part_of_other_classes: list[str]
  126. teaching_type: TeachingType
  127. lecturer_shorthands: list[str]
  128. class ClassJsonModule(BaseModel):
  129. model_config = ConfigDict(populate_by_name=True)
  130. weekday: CustomWeekday
  131. module_shorthand: str = Field(..., alias="name")
  132. start_seconds: int = Field(..., alias="from")
  133. end_seconds: int = Field(..., alias="to")
  134. degree_program: DegreeProgram = Field(..., alias="degree_prg")
  135. class_name: str = Field(..., alias="class")
  136. rooms: list[str]
  137. pages: list[int]
  138. part_of_other_classes: list[str]
  139. id: str
  140. teaching_type: TeachingType
  141. lecturer_shorthands: list[str] = Field(..., alias="teachers")
  142. class ClassPdfExtractionPageData(BaseModel):
  143. raw_extracted_modules: list[RawExtractedModule]
  144. page_metadata: PageMetadata
  145. class RawClassPdfExtractionPageData(BaseModel):
  146. raw_extracted_modules: list[RawExtractedModule]
  147. above_table_text: str
  148. class StartsWithMatch(BaseModel):
  149. shorthand_found: str
  150. num_of_matches: int
  151. @dataclass
  152. class TimeSlotYLevelsCollectionData:
  153. row_index: int
  154. page: Page
  155. table: Table
  156. expected_timeslot_index: int
  157. last_timeslot: TimeSlot
  158. weekday_areas: dict[Weekday, Area]