Source code for glasses_detector.detector

from dataclasses import dataclass, field
from typing import Callable, ClassVar, Collection, overload, override

import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from torchvision.models.detection import (
    fasterrcnn_mobilenet_v3_large_320_fpn,
    fasterrcnn_mobilenet_v3_large_fpn,
    ssdlite320_mobilenet_v3_large,
)
from torchvision.transforms.v2.functional import to_image, to_pil_image
from torchvision.utils import draw_bounding_boxes

from .architectures import TinyBinaryDetector
from .components import BaseGlassesModel
from .components.pred_type import Default
from .utils import FilePath, copy_signature


@dataclass
class GlassesDetector(BaseGlassesModel):
    r"""**Binary** detector to check where the glasses are in the image.

    This class allows performing binary glasses and eye-area detection.
    *Binary* means that only a single class is detected. It is possible
    to specify a particular kind of detection to perform, e.g.,
    standalone glasses, worn glasses, or just the eye area.

    Important:
        The detector cannot determine whether or not the glasses are
        present in the image, i.e., it will always try to predict a
        bounding box. If you are not sure whether the glasses may be
        present, please additionally use
        :class:`~glasses_detector.classifier.GlassesClassifier`.

    Warning:
        The pre-trained models are trained on datasets that contain
        just a single bounding box per image. For this reason, the
        number of predicted bounding boxes will always be 1. If you
        want to detect multiple objects in the image, please train
        custom models on custom datasets or share those datasets with
        me :).

    Note:
        If you want to use a custom inner :attr:`model`, e.g., by
        instantiating through :meth:`from_model`, please ensure that
        during inference in evaluation mode it outputs a list of
        dictionaries (one for each image in the batch) with at least
        one key being ``"boxes"`` which corresponds to the bounding
        boxes of the detected objects.

    ----

    .. dropdown:: Performance of the Pre-trained Detectors
        :icon: graph
        :color: info
        :animate: fade-in-slide-down
        :name: Performance of the Pre-trained Detectors

        +----------+------------+-------------------------+---------------------+---------------------+----------------------+
        | Kind     | Size       | MSLE :math:`\downarrow` | F1 :math:`\uparrow` | R2 :math:`\uparrow` | IoU :math:`\uparrow` |
        +==========+============+=========================+=====================+=====================+======================+
        |          | ``small``  | 0.0531                  | 1.0                 | 0.9360              | 0.6188               |
        |          +------------+-------------------------+---------------------+---------------------+----------------------+
        | ``eyes`` | ``medium`` | 0.0523                  | 1.0                 | 0.9519              | 0.6272               |
        |          +------------+-------------------------+---------------------+---------------------+----------------------+
        |          | ``large``  | TBA                     | 1.0                 | TBA                 | TBA                  |
        +----------+------------+-------------------------+---------------------+---------------------+----------------------+
        |          | ``small``  | 0.4787                  | 1.0                 | 0.8328              | 0.6485               |
        |          +------------+-------------------------+---------------------+---------------------+----------------------+
        | ``solo`` | ``medium`` | 0.8282                  | 1.0                 | 0.9267              | 0.7731               |
        |          +------------+-------------------------+---------------------+---------------------+----------------------+
        |          | ``large``  | TBA                     | 1.0                 | TBA                 | TBA                  |
        +----------+------------+-------------------------+---------------------+---------------------+----------------------+
        |          | ``small``  | 0.2585                  | 1.0                 | 0.8128              | 0.5427               |
        |          +------------+-------------------------+---------------------+---------------------+----------------------+
        | ``worn`` | ``medium`` | 0.1352                  | 1.0                 | 0.9432              | 0.7568               |
        |          +------------+-------------------------+---------------------+---------------------+----------------------+
        |          | ``large``  | TBA                     | 1.0                 | TBA                 | TBA                  |
        +----------+------------+-------------------------+---------------------+---------------------+----------------------+

        **NB**: the **F1 score** is uninformative because there is only
        one class, but it is kept here to emphasize this fact.
        Not even the background is considered a class: a bbox
        prediction will always happen.

    .. dropdown:: Size Information of the Pre-trained Detectors
        :icon: info
        :color: info
        :animate: fade-in-slide-down
        :name: Size Information of the Pre-trained Detectors

        .. list-table::
            :header-rows: 1

            * - Size
              - Architecture
              - Params
              - GFLOPs
              - Memory (MB)
              - Filesize (MB)
            * - ``small``
              - :class:`Tiny Detector <.architectures.tiny_binary_detector.TinyBinaryDetector>`
              - 0.23M
              - 0.001
              - 33.99
              - 0.91
            * - ``medium``
              - :func:`SSD Lite <torchvision.models.detection.ssdlite320_mobilenet_v3_large>` :cite:p:`liu2016ssd,howard2019searching`
              - 3.71M
              - 0.51
              - 316.84
              - 14.46
            * - ``large``
              - :func:`Faster R-CNN <torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn>` :cite:p:`ren2015faster,howard2019searching`
              - TBA
              - TBA
              - TBA
              - TBA

    Examples
    --------

    Let's instantiate the detector with default parameters:

    .. code-block:: python

        >>> from glasses_detector import GlassesDetector
        >>> det = GlassesDetector()

    First, we can perform a raw prediction on an image expressed as
    either a path, a :class:`PIL Image<PIL.Image.Image>` or a
    :class:`numpy array<numpy.ndarray>`. See :meth:`predict` for more
    details.

    .. code-block:: python

        >>> det(np.random.randint(0, 256, size=(16, 16, 3), dtype=np.uint8), format="int")
        [[0, 0, 1, 1]]
        >>> det(["path/to/image1.jpg", "path/to/image2.jpg"], format="str")
        ['BBoxes: 12 34 56 78', 'BBoxes: 90 12 34 56']

    We can also use a more specific method :meth:`process_file` which
    allows saving the results to a file:

    .. code-block:: python

        >>> det.process_file("path/to/img.jpg", "path/to/pred.jpg", show=True)
        ... # opens a new image window with drawn bboxes
        >>> det.process_file(["img1.jpg", "img2.jpg"], "preds.npy", format="bool")
        >>> np.load("preds.npy").shape
        (2, 256, 256)

    Finally, we can also use :meth:`process_dir` to process all images
    in a directory and save the predictions to a file or a directory:

    .. code-block:: python

        >>> det.process_dir("path/to/dir", "path/to/preds.json", format="float")
        >>> subprocess.run(["cat", "path/to/preds.json"])
        {
            "img1.jpg": [[0.1, 0.2, 0.3, 0.4]],
            "img2.jpg": [[0.5, 0.6, 0.7, 0.8], [0.2, 0.8, 0.4, 0.9]],
            ...
        }
        >>> det.process_dir("path/to/dir", "path/to/pred_dir", ext=".jpg")
        >>> subprocess.run(["ls", "path/to/pred_dir"])
        img1.jpg img2.jpg ...

    Args:
        kind (str, optional): The kind of objects to perform the
            detection for. Available options are:

            +-------------------+-------------------------------------+
            |                   |                                     |
            +-------------------+-------------------------------------+
            | ``"eyes"``        | No glasses, just the eye area       |
            +-------------------+-------------------------------------+
            | ``"solo"``        | Any standalone glasses in the wild  |
            +-------------------+-------------------------------------+
            | ``"worn"``        | Any glasses that are worn by people |
            +-------------------+-------------------------------------+

            Categories are not very strict, for example, ``"worn"`` may
            also detect glasses on the table. Defaults to ``"worn"``.
        size (str, optional): The size of the model to use (check
            :attr:`.ALLOWED_SIZE_ALIASES` for size aliases). Available
            options are:

            +-------------------------+-------------------------------------------------------------+
            |                         |                                                             |
            +-------------------------+-------------------------------------------------------------+
            | ``"small"`` or ``"s"``  | Very few parameters but lower accuracy                      |
            +-------------------------+-------------------------------------------------------------+
            | ``"medium"`` or ``"m"`` | A balance between the number of parameters and the accuracy |
            +-------------------------+-------------------------------------------------------------+
            | ``"large"`` or ``"l"``  | Large number of parameters but higher accuracy              |
            +-------------------------+-------------------------------------------------------------+

            Please check:

            * `Performance of the Pre-trained Detectors`_: to see the
              results of the pre-trained models for each size depending
              on :attr:`kind`.
            * `Size Information of the Pre-trained Detectors`_: to see
              which architecture each size maps to and the details
              about the number of parameters.

            Defaults to ``"medium"``.
        weights (bool | str | None, optional): Whether to load weights
            from a custom URL (or a local file if they're already
            downloaded) which will be inferred based on the model's
            :attr:`kind` and :attr:`size`. If a string is provided, it
            will be used as a custom path or a URL (determined
            automatically) to the model weights. Defaults to
            :data:`True`.
        device (str | torch.device | None, optional): Device to cast
            the model to (once it is loaded). If specified as
            :data:`None`, it will be automatically checked if
            `CUDA <https://developer.nvidia.com/cuda-toolkit>`_ or
            `MPS <https://developer.apple.com/documentation/metalperformanceshaders>`_
            is supported. Defaults to :data:`None`.
""" kind: str = "worn" size: str = "medium" weights: bool | str | None = True task: str = field(default="detection", init=False) DEFAULT_SIZE_MAP: ClassVar[dict[str, int]] = { "small": {"name": "tinydetnet_v1", "version": "v1.0.0"}, "medium": {"name": "ssdlite320_mobilenet_v3_large", "version": "v1.0.0"}, "large": {"name": "fasterrcnn_mobilenet_v3_large_fpn", "version": "v1.1.0"}, } DEFAULT_KIND_MAP: ClassVar[dict[str, str]] = { "eyes": DEFAULT_SIZE_MAP, "solo": DEFAULT_SIZE_MAP, "worn": DEFAULT_SIZE_MAP, } @staticmethod @override def create_model(model_name: str) -> nn.Module: match model_name: case "tinydetnet_v1": m = TinyBinaryDetector() case "ssdlite320_mobilenet_v3_large": m = ssdlite320_mobilenet_v3_large( num_classes=2, detections_per_img=1, topk_candidates=10, ) case "fasterrcnn_mobilenet_v3_large_fpn": m = fasterrcnn_mobilenet_v3_large_fpn( num_classes=2, box_detections_per_img=1, box_batch_size_per_image=10, ) case _: raise ValueError(f"{model_name} is not a valid choice!") return m
    @staticmethod
    def draw_boxes(
        image: Image.Image | np.ndarray | torch.Tensor,
        boxes: list[list[int | float]] | np.ndarray | torch.Tensor,
        labels: list[str] | None = None,
        colors: (
            str | tuple[int, int, int] | list[str | tuple[int, int, int]] | None
        ) = "red",
        fill: bool = False,
        width: int = 3,
        font: str | None = None,
        font_size: int | None = None,
    ) -> Image.Image:
        """Draws bounding boxes on the image.

        Takes the original image and the bounding boxes and draws them
        on the image. Optionally, labels can be provided to write the
        label next to each bounding box.

        See Also:

            * :func:`~torchvision.utils.draw_bounding_boxes` for more
              details about how the bounding boxes are drawn.
            * :func:`~torchvision.transforms.v2.functional.to_image`
              for more details about the expected formats if the input
              image is of type :class:`PIL.Image.Image` or
              :class:`numpy.ndarray`.

        Args:
            image (PIL.Image.Image | numpy.ndarray | torch.Tensor): The
                original image. It can be either a *PIL*
                :class:`~PIL.Image.Image`, a *numpy*
                :class:`~numpy.ndarray` of shape ``(H, W, 3)`` or
                ``(H, W)`` and type :attr:`~numpy.uint8`, or a *torch*
                :class:`~torch.Tensor` of shape ``(3, H, W)`` or
                ``(H, W)`` and type :attr:`~torch.uint8`.
            boxes (list[list[int | float]] | numpy.ndarray | torch.Tensor):
                The bounding boxes to draw. The expected shape is
                ``(N, 4)`` where ``N`` is the number of bounding boxes
                and the last dimension corresponds to the coordinates
                of the bounding box in the following order: ``x_min``,
                ``y_min``, ``x_max``, ``y_max``.
            labels (list[str] | None, optional): The labels
                corresponding to the ``N`` bounding boxes. If
                :data:`None`, no labels will be written next to the
                drawn bounding boxes. Defaults to :data:`None`.
            colors (list[str | tuple[int, int, int]] | str | tuple[int, int, int] | None, optional):
                List containing the colors of the boxes or a single
                color for all boxes. A color can be represented as a
                PIL string, e.g., ``"red"`` or ``"#FF00FF"``, or as an
                RGB tuple, e.g., ``(240, 10, 157)``. If :data:`None`,
                random colors are generated for the boxes. Defaults to
                ``"red"``.
            fill (bool, optional): If :data:`True`, fills the bounding
                box with the specified color. Defaults to :data:`False`.
            width (int, optional): Width of the bounding box outline
                used when calling :meth:`~PIL.ImageDraw.rectangle`.
                Defaults to ``3``.
            font (str | None, optional): A filename containing a
                *TrueType* font. If the file is not found under this
                filename, the loader may also search in other
                directories, such as the ``fonts/`` directory on
                Windows or ``/Library/Fonts/``,
                ``/System/Library/Fonts/`` and ``~/Library/Fonts/`` on
                macOS. Defaults to :data:`None`.
            font_size (int | None, optional): The requested font size
                in points used when calling
                :meth:`~PIL.ImageFont.truetype`. Defaults to
                :data:`None`.

        Returns:
            PIL.Image.Image: The image with the bounding boxes drawn on
            it.
        """
        if not isinstance(boxes, torch.Tensor):
            # Convert bboxes to a torch tensor
            boxes = torch.tensor(boxes, dtype=torch.float32)

        # Draw the bounding boxes on the image
        new_image = draw_bounding_boxes(
            image=to_image(image),
            boxes=boxes,
            labels=labels,
            colors=colors,
            fill=fill,
            width=width,
            font=font,
            font_size=font_size,
        )

        return to_pil_image(new_image)

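    # Illustrative only (the box coordinates below are made up):
    # ``draw_boxes`` can be used as a standalone utility, e.g.::
    #
    #     >>> img = Image.new("RGB", (256, 256))
    #     >>> GlassesDetector.draw_boxes(img, [[32, 96, 224, 160]], labels=["glasses"])
    #     <PIL.Image.Image image mode=RGB size=256x256 ...>
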
    @overload
    def predict(
        self,
        image: FilePath | Image.Image | np.ndarray,
        format: (
            str
            | Callable[[torch.Tensor], Default]
            | Callable[[Image.Image, torch.Tensor], Default]
        ) = "img",
        output_size: tuple[int, int] | None = None,
        input_size: tuple[int, int] | None = (256, 256),
    ) -> Default: ...

    @overload
    def predict(
        self,
        image: Collection[FilePath | Image.Image | np.ndarray],
        format: (
            str
            | Callable[[torch.Tensor], Default]
            | Callable[[Image.Image, torch.Tensor], Default]
        ) = "img",
        output_size: tuple[int, int] | None = None,
        input_size: tuple[int, int] | None = (256, 256),
    ) -> list[Default]: ...

    @override
    def predict(
        self,
        image: (
            FilePath
            | Image.Image
            | np.ndarray
            | Collection[FilePath | Image.Image | np.ndarray]
        ),
        format: (
            str
            | Callable[[torch.Tensor], Default]
            | Callable[[Image.Image, torch.Tensor], Default]
        ) = "img",
        output_size: tuple[int, int] | None = None,
        input_size: tuple[int, int] | None = (256, 256),
    ) -> Default | list[Default]:
        """Predicts the bounding box(-es).

        Takes a path or multiple paths to image files, or the loaded
        images themselves, and outputs a formatted prediction for each
        image indicating the location of the object (typically,
        glasses). The prediction type is
        :data:`~glasses_detector.components.pred_type.Default`, which
        corresponds to :attr:`~.PredType.DEFAULT`.

        Warning:
            If the image is provided as :class:`numpy.ndarray`, make
            sure the last dimension specifies the channels, i.e., the
            last dimension should be of size ``1`` or ``3``. If it is
            anything else, e.g., if the shape is ``(3, H, W)``, where
            ``W`` is neither ``1`` nor ``3``, this would be interpreted
            as 3 grayscale images.

        Args:
            image (FilePath | PIL.Image.Image | numpy.ndarray | typing.Collection[FilePath | PIL.Image.Image | numpy.ndarray]):
                The path(-s) to the image to generate the prediction
                for or the image(-s) itself represented as
                :class:`~PIL.Image.Image` or as :class:`~numpy.ndarray`.
                Note that the image should have values between 0 and
                255 and be of RGB format. Normalization is not needed
                as the channels will be automatically normalized before
                passing through the network.
            format (str | typing.Callable[[torch.Tensor], Default] | typing.Callable[[PIL.Image.Image, torch.Tensor], Default], optional):
                The string specifying the way to map the predictions to
                outputs of a specific format. These are the following
                options (if ``image`` is a :class:`~typing.Collection`,
                then the output will be a :class:`list` of
                corresponding items of **output type**):

                .. list-table::
                    :widths: 10 10 80
                    :header-rows: 1

                    * - **format**
                      - **output type**
                      - **prediction mapping**
                    * - ``"bool"``
                      - :class:`numpy.ndarray` of type :class:`numpy.bool_` of shape ``(H, W)``
                      - A :class:`numpy array<numpy.ndarray>` of shape ``(H, W)`` (i.e., ``output_size``) with :data:`True` values for pixels that fall in any of the bounding boxes
                    * - ``"int"``
                      - :class:`list` of :class:`list` of :class:`int`
                      - Bounding boxes with integer coordinates w.r.t.
                        the original ``image`` size:
                        ``[[x_min, y_min, x_max, y_max], ...]``
                    * - ``"float"``
                      - :class:`list` of :class:`list` of :class:`float`
                      - Bounding boxes with float coordinates normalized between 0 and 1: ``[[x_min, y_min, x_max, y_max], ...]``
                    * - ``"str"``
                      - :class:`str`
                      - A string of the form ``"BBoxes: x_min y_min x_max y_max; ..."``
                    * - ``"img"``
                      - :class:`PIL.Image.Image`
                      - The original image with bounding boxes drawn on it using default values in :meth:`.draw_boxes`

                A custom callback function is also possible. It
                specifies how to map the original image
                (:class:`~PIL.Image.Image`) and the bounding box
                predictions (:class:`~torch.Tensor` of type
                :data:`torch.float32` of shape ``(K, 4)`` with ``K``
                being the number of detected bboxes), or just the
                predictions, to a formatted
                :data:`~glasses_detector.components.pred_type.Default`
                output. Defaults to ``"img"``.
            output_size (tuple[int, int] | None, optional): The size
                (width, height), or ``(W, H)``, that the prediction
                (either the bbox coordinates or the image itself)
                should correspond to. If :data:`None`, the prediction
                will correspond to the same size as the input image.
                Defaults to :data:`None`.
            input_size (tuple[int, int] | None, optional): The size
                (width, height), or ``(W, H)``, to resize the image to
                before passing it through the network. If :data:`None`,
                the image will not be resized. It is recommended to
                resize it to the size the model was trained on, which
                by default is ``(256, 256)``. Defaults to
                ``(256, 256)``.

        Returns:
            Default | list[Default]: The formatted prediction or a list
            of formatted predictions if multiple images were provided.

        Raises:
            ValueError: If the specified ``format`` as a string is not
                recognized.
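
        Example (an informal sketch; the path and the returned numbers
        are placeholders, actual values depend on the image and the
        loaded weights):

        .. code-block:: python

            >>> det = GlassesDetector()
            >>> det.predict("path/to/img.jpg", format="str")
            'BBoxes: 10 20 60 80'
            >>> det.predict("path/to/img.jpg", format="float")
            [[0.04, 0.08, 0.23, 0.31]]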
""" def verify_bboxes(ori: Image.Image, boxes: torch.Tensor): # Set output size to original size if not specified w, h = output_size if output_size else ori.size if input_size is not None and input_size != (w, h): # Convert bboxes to output size boxes[:, 0] = boxes[:, 0] * w / input_size[0] boxes[:, 1] = boxes[:, 1] * h / input_size[1] boxes[:, 2] = boxes[:, 2] * w / input_size[0] boxes[:, 3] = boxes[:, 3] * h / input_size[1] return boxes if isinstance(format, str): match format: case "bool": # NumPy array of shape (H, W) with True values for # pixels that fall in any of the bounding boxes def format_fn(ori, pred): pred = verify_bboxes(ori, pred).numpy(force=True) w, h = output_size if output_size else ori.size fn = lambda b, x, y: b[0] <= x <= b[2] and b[1] <= y <= b[3] pred = np.any( [ [[fn(b, x, y) for x in range(w)] for y in range(h)] for b in pred ], axis=0, ) return pred case "int": # Bounding boxes with integer coordinates w.r.t. the # original image size: [[x_min, y_min, x_max, y_max], ...] def format_fn(ori, pred): pred = verify_bboxes(ori, pred) return [[int(p.item()) for p in b] for b in pred] case "float": # Bounding boxes with float coordinates normalized # between 0 and 1: [[x_min, y_min, x_max, y_max], ...] def format_fn(ori, pred): w, h = ori.size if input_size is None else input_size pred = pred / torch.tensor([w, h, w, h], device=pred.device) return [[float(p.item()) for p in b] for b in pred] case "str": # A string of the form # "BBoxes: x_min y_min x_max y_max; ..." def format_fn(ori, pred): pred = verify_bboxes(ori, pred) return "BBoxes: " + "; ".join( [" ".join(map(str, map(int, b))) for b in pred] ) case "img": # The original image with bounding boxes drawn on it # using default values in draw_boxes def format_fn(ori, pred): pred = verify_bboxes(ori, pred) ori = ori.resize(output_size) if output_size else ori img = self.draw_boxes(ori, pred) return img case _: raise ValueError(f"{format} is not a valid choice!") # Convert to function format = format_fn return super().predict(image, format, input_size)
    @override
    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
        return [pred["boxes"] for pred in self.model([*x])]

    @override
    @copy_signature(predict)
    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs)
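
if __name__ == "__main__":
    # Minimal usage sketch of the API documented above; the random array below
    # stands in for a real RGB image, so the printed boxes are meaningless.
    det = GlassesDetector(kind="worn", size="medium")

    # "int" returns [[x_min, y_min, x_max, y_max]] w.r.t. the original image
    # size (a single box when using the pre-trained weights).
    dummy = np.random.randint(0, 256, size=(256, 256, 3), dtype=np.uint8)
    print(det(dummy, format="int"))

    # The same prediction rendered onto the image with default draw_boxes
    # settings.
    det(dummy, format="img").show()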