img.py 996 B

123456789101112131415161718192021222324252627282930313233343536
  1. import logging
  2. import numpy
  3. from pdfplumber.page import Page
  4. from .models import Area
  5. def is_mostly_white_area(page: Page, area: Area) -> bool:
  6. """
  7. Checks wether an Area can be considered mostly white.
  8. Intended for detecting empty timetable cells.
  9. """
  10. img = (
  11. page.crop((area.x1, area.y1, area.x2, area.y2))
  12. .to_image(resolution=150)
  13. .original.convert("RGB")
  14. )
  15. arr = numpy.array(img)
  16. total_pixels = arr.shape[0] * arr.shape[1]
  17. r = arr[:, :, 0].astype(int)
  18. g = arr[:, :, 1].astype(int)
  19. b = arr[:, :, 2].astype(int)
  20. min_rgb = numpy.minimum(numpy.minimum(r, g), b)
  21. max_rgb = numpy.maximum(numpy.maximum(r, g), b)
  22. channel_spread = max_rgb - min_rgb
  23. is_whitish = (min_rgb >= 250) & (channel_spread <= 25)
  24. total_pixels = arr.shape[0] * arr.shape[1]
  25. whitish_percentage = is_whitish.sum() / total_pixels
  26. logging.debug("whitish: %.2f%%", whitish_percentage * 100)
  27. return whitish_percentage > 0.9