Source code for glasses_detector.architectures.tiny_binary_detector

import torch
import torch.nn as nn


[docs] class TinyBinaryDetector(nn.Module): """Tiny binary detector. This is a custom detector created with the aim to contain very few parameters while maintaining a reasonable accuracy. It only has several sequential convolutional and pooling blocks (with batch-norm in between). Note: I tried varying the architecture, including activations, convolution behavior (groups and stride), pooling, and layer structure. This also includes residual and dense connections, as well as combinations. Turns out, they do not perform as well as the current architecture which is just a bunch of CONV-RELU-BN-MAXPOOL blocks with no paddings. """ def __init__(self): super().__init__() # Several convolutional blocks self.features = nn.Sequential( self._create_block(3, 6, 15), self._create_block(6, 12, 7), self._create_block(12, 24, 5), self._create_block(24, 48, 3), self._create_block(48, 96, 3), self._create_block(96, 192, 3), nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten(), ) # Fully connected layer self.fc = nn.Linear(192, 4) def _create_block(self, num_in, num_out, filter_size): return nn.Sequential( nn.Conv2d(num_in, num_out, filter_size, 1, "valid", bias=False), nn.ReLU(), nn.BatchNorm2d(num_out), nn.MaxPool2d(2, 2), )
[docs] def forward( self, imgs: list[torch.Tensor], targets: list[dict[str, torch.Tensor]] | None = None, ) -> dict[str, torch.Tensor] | list[dict[str, torch.Tensor]]: """Forward pass through the network. This takes a list of images and returns a list of predictions for each image or a loss dictionary if the targets are provided. This is to match the API of the PyTorch *torchvision* models, which specify that: "During training, returns a dictionary containing the classification and regression losses for each image in the batch. During inference, returns a list of dictionaries, one for each input image. Each dictionary contains the predicted boxes, labels, and scores for all detections in the image." Args: imgs (list[torch.Tensor]): A list of images. annotations (list[dict[str, torch.Tensor]], optional): A list of annotations for each image. Each annotation is a dictionary that contains: 1. ``"boxes"``: the bounding boxes for each object 2. ``"labels"``: labels for all objects in the image. If ``None``, the network is in inference mode. Returns: dict[str, torch.Tensor] | list[dict[str, torch.Tensor]]: A dictionary with only a single "regression" loss entry if ``targets`` were specified. Otherwise, a list of dictionaries with the predicted bounding boxes, labels, and scores for all detections in each image. """ # Forward pass; insert a new dimension to indicate a single bbox preds = self.fc(self.features(torch.stack(imgs))) # Get width and height h, w = imgs[0].shape[-2:] # Convert to (x_min, y_min, x_max, y_max) preds[:, 0] = preds[:, 0] * w preds[:, 1] = preds[:, 1] * h preds[:, 2] = preds[:, 0] + preds[:, 2] * w preds[:, 3] = preds[:, 1] + preds[:, 3] * h if targets is None: # Clamp the coordinates to the image size preds[:, 0] = torch.clamp(preds[:, 0], 0, w) preds[:, 1] = torch.clamp(preds[:, 1], 0, h) preds[:, 2] = torch.clamp(preds[:, 2], 0, w) preds[:, 3] = torch.clamp(preds[:, 3], 0, h) # Convert to shape (N, 1, 4) preds = [*preds[:, None, :]] if targets is not None: return self.compute_loss(preds, targets, imgs[0].size()[-2:]) else: return [ { "boxes": pred, "labels": torch.ones(1, dtype=torch.int64, device=pred.device), "scores": torch.ones(1, device=pred.device), } for pred in preds ]
[docs] def compute_loss( self, preds: list[torch.Tensor], targets: list[dict[str, torch.Tensor]], size: tuple[int, int], ) -> dict[str, torch.Tensor]: """Compute the loss for the predicted bounding boxes. This computes the MSE loss between the predicted bounding boxes and the target bounding boxes. The returned dictionary contains only one key: "regression". Args: preds (list[torch.Tensor]): A list of predicted bounding boxes for each image. targets (list[dict[str, torch.Tensor]]): A list of targets for each image. Returns: dict[str, torch.Tensor]: A dictionary with only one key: "regression" which contains the regression MSE loss. """ # Initialize criterion, loss dictionary, and device criterion, loss_dict, device = nn.MSELoss(), {}, preds[0].device # Use to divide (x_min, y_min, x_max, y_max) by (w, h, w, h) size = torch.tensor([[*size[::-1], *size[::-1]]], device=device) for i, pred in enumerate(preds): # Compute the loss (normalize the coordinates before that) loss = criterion(pred / size, targets[i]["boxes"][:1] / size) loss_dict[i] = loss return loss_dict