# Postpone annotation evaluation so the PEP 604 ``X | Y`` hints below
# also work on Python versions older than 3.10
from __future__ import annotations

import torch
import torch.nn as nn
class TinyBinaryDetector(nn.Module):
"""Tiny binary detector.
This is a custom detector created with the aim to contain very few
parameters while maintaining a reasonable accuracy. It only has
several sequential convolutional and pooling blocks (with
batch-norm in between).
Note:
I tried varying the architecture, including activations,
convolution behavior (groups and stride), pooling, and layer
structure. This also includes residual and dense connections,
as well as combinations. Turns out, they do not perform as well
as the current architecture which is just a bunch of
CONV-RELU-BN-MAXPOOL blocks with no paddings.
"""

    def __init__(self):
        super().__init__()

        # Several convolutional blocks
        self.features = nn.Sequential(
            self._create_block(3, 6, 15),
            self._create_block(6, 12, 7),
            self._create_block(12, 24, 5),
            self._create_block(24, 48, 3),
            self._create_block(48, 96, 3),
            self._create_block(96, 192, 3),
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
        )

        # Fully connected layer
        self.fc = nn.Linear(192, 4)

    def _create_block(self, num_in, num_out, filter_size):
        return nn.Sequential(
            nn.Conv2d(num_in, num_out, filter_size, 1, "valid", bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(num_out),
            nn.MaxPool2d(2, 2),
        )
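
    # For reference (a quick manual count/trace, not from the original
    # source): the six blocks above plus the linear head hold roughly
    # 234k parameters, and for an assumed 224x224 input the spatial size
    # evolves as 224 -> 105 -> 49 -> 22 -> 10 -> 4 -> 1 (each block is a
    # "valid" convolution followed by a stride-2 max pool), after which
    # AdaptiveAvgPool2d reduces any remaining grid to 1x1.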

    def forward(
        self,
        imgs: list[torch.Tensor],
        targets: list[dict[str, torch.Tensor]] | None = None,
    ) -> dict[str, torch.Tensor] | list[dict[str, torch.Tensor]]:
"""Forward pass through the network.
This takes a list of images and returns a list of predictions
for each image or a loss dictionary if the targets are provided.
This is to match the API of the PyTorch *torchvision* models,
which specify that:
"During training, returns a dictionary containing the
classification and regression losses for each image in the
batch. During inference, returns a list of dictionaries, one
for each input image. Each dictionary contains the predicted
boxes, labels, and scores for all detections in the image."
Args:
imgs (list[torch.Tensor]): A list of images.
annotations (list[dict[str, torch.Tensor]], optional): A
list of annotations for each image. Each annotation is a
dictionary that contains:
1. ``"boxes"``: the bounding boxes for each object
2. ``"labels"``: labels
for all objects in the image. If ``None``, the
network is in inference mode.
Returns:
dict[str, torch.Tensor] | list[dict[str, torch.Tensor]]:
A dictionary with only a single "regression" loss entry if
``targets`` were specified. Otherwise, a list of
dictionaries with the predicted bounding boxes, labels, and
scores for all detections in each image.
"""
        # Forward pass through the backbone and the regression head
        preds = self.fc(self.features(torch.stack(imgs)))

        # Get the height and width of the (equally-sized) images
        h, w = imgs[0].shape[-2:]

        # Convert the normalized (x_min, y_min, width, height) outputs
        # to absolute (x_min, y_min, x_max, y_max) coordinates
        preds[:, 0] = preds[:, 0] * w
        preds[:, 1] = preds[:, 1] * h
        preds[:, 2] = preds[:, 0] + preds[:, 2] * w
        preds[:, 3] = preds[:, 1] + preds[:, 3] * h

        if targets is None:
            # Clamp the coordinates to the image size
            preds[:, 0] = torch.clamp(preds[:, 0], 0, w)
            preds[:, 1] = torch.clamp(preds[:, 1], 0, h)
            preds[:, 2] = torch.clamp(preds[:, 2], 0, w)
            preds[:, 3] = torch.clamp(preds[:, 3], 0, h)

        # Insert a new dimension so each image has a single (1, 4) box
        preds = [*preds[:, None, :]]

        if targets is not None:
            return self.compute_loss(preds, targets, (h, w))

        return [
            {
                "boxes": pred,
                "labels": torch.ones(1, dtype=torch.int64, device=pred.device),
                "scores": torch.ones(1, device=pred.device),
            }
            for pred in preds
        ]

    def compute_loss(
        self,
        preds: list[torch.Tensor],
        targets: list[dict[str, torch.Tensor]],
        size: tuple[int, int],
    ) -> dict[str, torch.Tensor]:
"""Compute the loss for the predicted bounding boxes.
This computes the MSE loss between the predicted bounding boxes
and the target bounding boxes. The returned dictionary contains
only one key: "regression".
Args:
preds (list[torch.Tensor]): A list of predicted bounding
boxes for each image.
targets (list[dict[str, torch.Tensor]]): A list of targets
for each image.
Returns:
dict[str, torch.Tensor]: A dictionary with only one key:
"regression" which contains the regression MSE loss.
"""
        # Initialize the criterion and infer the device
        criterion, device = nn.MSELoss(), preds[0].device

        # Used to divide (x_min, y_min, x_max, y_max) by (w, h, w, h)
        size = torch.tensor([[*size[::-1], *size[::-1]]], device=device)

        # Normalize the coordinates, compute the MSE loss against the
        # first (and only) ground-truth box, and average over all images
        loss = torch.stack([
            criterion(pred / size, targets[i]["boxes"][:1] / size)
            for i, pred in enumerate(preds)
        ]).mean()

        return {"regression": loss}
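

# A minimal usage sketch (not part of the original module). The 224x224
# input size and the two-image batch are illustrative assumptions; any
# equally-sized images large enough to survive the six blocks work.
if __name__ == "__main__":
    model = TinyBinaryDetector()

    # Inference mode: a list of images and no targets yields a list of
    # prediction dictionaries, one per image
    imgs = [torch.rand(3, 224, 224) for _ in range(2)]
    model.eval()
    with torch.no_grad():
        outs = model(imgs)
    print(outs[0]["boxes"].shape)  # torch.Size([1, 4])

    # Training mode: providing targets yields {"regression": loss}
    targets = [
        {
            "boxes": torch.tensor([[10.0, 20.0, 50.0, 80.0]]),
            "labels": torch.ones(1, dtype=torch.int64),
        }
        for _ in imgs
    ]
    model.train()
    loss_dict = model(imgs, targets)
    loss_dict["regression"].backward()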