Source code for face_crop_plus.models.retinaface

import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.models._utils as _utils
from ._layers import LoadMixin, PriorBox, SSH, FPN, Head


[docs]class RetinaFace(nn.Module, LoadMixin):
    """RetinaFace face detector and 5-point landmark predictor.

    This class is capable of predicting 5-point landmarks from a batch 
    of images and filter them based on strategy, e.g., "all landmarks in 
    the image", "a single set of landmarks per image of the largest
    face". For more information, see the main method of this class
    :meth:`predict`. For main attributes, see :meth:`__init__`.

    This class also inherits ``load`` method from ``LoadMixin`` class. 
    The method takes a device on which to load the model and loads the 
    model with a default state dictionary loaded from 
    ``WEIGHTS_FILENAME`` file. It sets this model to eval mode and 
    disables gradients.

    For more information on how RetinaFace model works, see this repo:
    `PyTorch Retina Face <https://github.com/biubug6/Pytorch_Retinaface>`_. 
    Most of the code was taken from that repository.

    Note:
        Whenever an input shape is mentioned, N corresponds to batch 
        size, C corresponds to the number of channels, H - to input
        height, and W - to input width. ``out_dim`` corresponds to the
        total guesses (the number of priors) the model made about each
        sample. Within those guesses, there typically exists at least 1 
        face but can be more. By default, it should be 43,008.
    
    Be default, this class initializes the following attributes which 
    can be changed after initialization of the class (but, typically, 
    should not be changed):

    Attributes:
        nms_threshold (float): The threshold, based on which 
            multiple bounding box or landmark predictions for the same 
            face are merged into one. Defaults to 0.4.
        variance (list[int]): The variance of the bounding boxes 
            used to undo the encoding of coordinates of raw  bounding 
            box and landmark predictions.
    """
    #: WEIGHTS_FILENAME (str): The constant specifying the name of 
    #: ``.pth`` file from which the weights for this model should be 
    #: loaded. Defaults to "retinaface_detector.pth".
    WEIGHTS_FILENAME = "retinaface_detector.pth"

[docs]    def __init__(self, strategy: str = "all", vis: float = 0.6):
        """Initializes RetinaFace model.            
        
        This method initializes ResNet-50 backbone and further 
        layers required for face detection and bbox/landm predictions.

        Args:
            strategy: The strategy used to retrieve the landmarks when
                :meth:`predict` is called. The available options are:

                    * "all" - landmarks for all faces per single image
                      (single batch entry) will be considered.
                    * "best" - landmarks for a single face with the
                      highest confidence score per image will be 
                      considered.
                    * "largest" - landmarks for a single largest face
                      per image will be considered.

                The most efficient option is 'best' and the least
                efficient is "largest". Defaults to "all".
            vis: The visual threshold, i.e., minimum confidence score,
                for a face to be considered an actual face. Lower
                scores will allow the detection of more faces per image
                but can result in non-actual faces, e.g., random
                surfaces somewhat representing faces. Higher scores will 
                prevent detecting faulty faces but may result in only a
                few faces detected, whereas there can be more, e.g., 
                higher will prevent the detection of blurry faces. 
                Defaults to 0.6.
        """
        super().__init__()

        # Initialize attributes
        self.strategy = strategy
        self.vis_threshold = vis
        self.nms_threshold = 0.4
        self.variance = [0.1, 0.2]

        # Set up backbone and config
        backbone = models.resnet50()
        in_channels, out_channels = 256, 256
        in_channels_list = [in_channels * x for x in [2, 4, 8]]
        return_layers = {'layer2': 1, 'layer3': 2, 'layer4': 3}

        # Construct the backbone by retrieving intermediate layers
        self.body = _utils.IntermediateLayerGetter(backbone, return_layers)

        # Construct sub-layers to extract features for heads
        self.fpn = FPN(in_channels_list, out_channels)
        self.ssh1 = SSH(out_channels, out_channels)
        self.ssh2 = SSH(out_channels, out_channels)
        self.ssh3 = SSH(out_channels, out_channels)

        # Construct 3 heads - score, bboxes & landms
        self.ClassHead = Head.make(2, out_channels)
        self.BboxHead = Head.make(4, out_channels)
        self.LandmarkHead = Head.make(10, out_channels)

[docs]    def forward(
        self,
        x: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Performs forward pass.

        Takes an input batch and performs inference based on the modules 
        it has. Returns an unfiltered tuple of scores, bounding boxes 
        and landmarks for all the possible detected faces. The 
        predictions are encoded to comfortably compute the loss during 
        training and thus should be decoded to coordinates.

        Args:
            x: The input tensor of shape (N, 3, H, W).

        Returns:
            A tuple of torch tensors where the first element is
            confidence scores for each prediction of shape
            (N, out_dim, 2) with values between 0 and 1 representing
            probabilities, the second element is bounding boxes of shape 
            (N, out_dim, 4) with unbounded values and the last element 
            is landmarks of shape (N, ``out_dim``, 10) with unbounded
            values.
        """
        # Extract FPN + SSH features
        fpn = self.fpn(self.body(x))
        fts = [self.ssh1(fpn[0]), self.ssh2(fpn[1]), self.ssh3(fpn[2])]

        # Create head list and use each to process feature list
        hs = [self.ClassHead, self.BboxHead, self.LandmarkHead]
        pred = [torch.cat([h[i](f) for i, f in enumerate(fts)], 1) for h in hs]
        
        return F.softmax(pred[0], dim=-1), pred[1], pred[2]
    
[docs]    def decode_bboxes(
        self,
        loc: torch.Tensor,
        priors: torch.Tensor,
    ) -> torch.Tensor:
        """Decodes bounding boxes from predictions.

        Takes the predicted bounding boxes (locations) and undoes the 
        encoding for offset regression used at training time.

        Args:
            loc: Bounding box (location) predictions for loc layers of
                shape (N, out_dim, 4). 
            priors: Prior boxes in center-offset form of shape
                (out_dim, 4).

        Returns:
            A tensor of shape (N, out_dim, 4) representing decoded
            bounding box predictions where the last dim can be
            interpreted as x1, y1, x2, y2 coordinates - the start and
            the end corners defining the face box.
        """
        # Concatenate priors
        boxes = torch.cat((
            priors[:, :2] + loc[..., :2] * self.variance[0] * priors[:, 2:],
            priors[:, 2:] * torch.exp(loc[..., 2:] * self.variance[1])
        ), 2)
        
        # Adjust values for proper xy coords
        boxes[..., :2] -= boxes[..., 2:] / 2
        boxes[..., 2:] += boxes[..., :2]

        return boxes

[docs]    def decode_landms(
        self,
        pre: torch.Tensor,
        priors: torch.Tensor,
    ) -> torch.Tensor:
        """Decodes landmarks from predictions.

        Takes the predicted landmarks (pre) and undoes the encoding for
        offset regression used at training time.

        Args:
            pre: Landmark predictions for loc layers of shape
                (N, out_dim, 10).
            priors: Prior boxes in center-offset form of shape
                (out_dim, 4).

        Returns:
            A tensor of shape (N, out_dim, 10) representing decoded
            landmark predictions where the last dim can be
            interpreted as x1, y1, ..., x10, y10 coordinates - one for 
            each of the 5 landmarks.
        """
        # Concatenate priors
        var = self.variance
        landms = torch.cat((
            priors[..., :2] + pre[..., :2] * var[0] * priors[..., 2:],
            priors[..., :2] + pre[..., 2:4] * var[0] * priors[..., 2:],
            priors[..., :2] + pre[..., 4:6] * var[0] * priors[..., 2:],
            priors[..., :2] + pre[..., 6:8] * var[0] * priors[..., 2:],
            priors[..., :2] + pre[..., 8:10] * var[0] * priors[..., 2:],
        ), dim=2)

        return landms
    
[docs]    def filter_preds(
        self,
        scores: torch.Tensor,
        bboxes: torch.Tensor,
        landms: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, list[int]]:
        """Filters predictions for identified faces for each sample.
        
        This method works as follows:

            1. First, it filters out bad predictions based on
               ``self.vis_threshold``.
            2. Then it gathers all the remaining predictions across the
               batch dimension, i.e., the batch dimension becomes not
               the number of samples but the number of filtered out
               predictions.
            3. It loops for each set of filtered predictions per sample
               sorting each set of confidence scores from best to worst.
            4. For each set of confidence scores, it identifies distinct 
               faces and keeps the record of which indices to keep. At 
               this stage it uses ``self.nms_threshold`` to remove the 
               duplicate face predictions.
            5. Finally, it applies the kept indices for each person
               (each face) to select corresponding bounding boxes and
               landmarks.

        Args:
            scores: The confidence score predictions of shape
                (N, out_dim).
            bboxes: The bounding boxes for each face of shape
                (N, out_dim, 4) where the last 4 numbers correspond to
                start and end coordinates - x1, y1, x2, y2.
            landms: The landmarks for each face of shape
                (N, out_dim, num_landmarks * 2) where the last dim 
                corresponds to landmark coordinates x1, y1, ... . By
                default, num_landmarks is 5.

        Returns:
            A tuple where the first element is a torch tensor of shape
            (``num_faces``, 4), the second element is a torch tensor of
            shape (``num_faces``, ``num_landmarks`` * 2) and the third 
            element is a list of length ``num_faces``. First and second 
            elements correspond to bounding boxes and landmarks for each 
            face across all samples and the third element provides an 
            index for each bounding box/set of landmarks that identifies
            which sample that box/set (or that face) is extracted from
            (because each sample can have multiple faces).
        """
        # Init variables, identify masks to filter best faces
        cumsum, people_indices, sample_indices = 0, [], []
        masks = scores > self.vis_threshold

        # Flatten across batch filtered predictions, compute face areas
        scores, bboxes, landms = scores[masks], bboxes[masks], landms[masks]
        areas = (bboxes[:, 2]-bboxes[:, 0]+1) * (bboxes[:, 3]-bboxes[:, 1]+1)

        for i, num_valid in enumerate(masks.sum(dim=1)):
            # Extract all face preds for a single sample
            start, end, keep = cumsum, cumsum+num_valid, []
            bbox, area = bboxes[start:end], areas[start:end]
            scores_sorted = scores[start:end].argsort(descending=True)

            while scores_sorted.numel() > 0:
                # Append best face's index to keep
                keep.append(j := scores_sorted[0])
                
                # Find coordinates that at least bound the current face
                xy1 = torch.maximum(bbox[j, :2], bbox[scores_sorted[1:], :2])
                xy2 = torch.minimum(bbox[j, 2:], bbox[scores_sorted[1:], 2:])

                # Compute width and height for the current minimal face
                w = torch.maximum(torch.tensor(0.0), xy2[:, 0] - xy1[:, 0] + 1)
                h = torch.maximum(torch.tensor(0.0), xy2[:, 1] - xy1[:, 1] + 1)

                # Compute nms for identifying areas for the current face
                ovr = (a := w * h) / (area[j] + area[scores_sorted[1:]] - a)
                
                # Filter out current face, keep next best scores
                inds = torch.where(ovr <= self.nms_threshold)[0]
                scores_sorted = scores_sorted[inds + 1]
            
            # Update people and sample indices, increment cumsum
            people_indices.extend([cumsum + k for k in keep])
            sample_indices.extend([i] * len(keep))
            cumsum += num_valid
        
        # Select the final landms and bboxes
        bboxes = bboxes[people_indices, :]
        landms = landms[people_indices, :]
        
        return landms, bboxes, sample_indices
    
[docs]    def take_by_strategy(
        self,
        landms: torch.Tensor,
        bboxes: torch.Tensor,
        idx: list[int],
    ) -> tuple[torch.Tensor, list[int]]:
        """Filters landmarks according to strategy.

        This method takes a batch of landmarks and bounding boxes (one
        for each face) filters only specific landmarks by a specific
        strategy. Here are the following cases of strategy:

            * "all" - effectively, nothing is done and simply the
              already passed values `landms` and `idx` are returned 
              without any changes.
            * "best" - the very first set of landmarks for each image 
              image is returned (the first set is the best set because
              the landmarks were sorted when duplicates were filtered
              out in :meth:`filter_preds`). This means
              the returned indices list is unique, e.g., goes from 
              ``[0, 0, 0, 1, 1, 2, 3, 3]`` to ``[0, 1, 2, 3]``.
            * "largest" - similar to 'best', except that this strategy
              requires performing additional computation to find out the 
              largest face based on the area of bounding boxes. Thus the 
              length of the `idx` list (which is equal to the number of 
              sets of landmarks) is the same as for 'best' strategy,
              except not the first (best) faces (actually, their
              landmarks) for each image but selected faces are returned.

        Note:
            Strategy "best" is most memory efficient, strategy "largest" 
            is least time efficient. Strategy "all" is as fast as "best" 
            but takes up more space.

        Args:
            landms: Landmarks batch of shape 
                (``num_faces``, ``num_landm`` * 2).
            bboxes: Bounding boxes batch of shape (``num_faces``, 4).
            idx: Indices where each index maps to an image from
                which some face prediction (landmarks and bounding box) 
                was retrieved. For instance if the 2nd element of idx is 
                1, that means that the 2nd element of ``landms`` and the
                2nd element of ``bboxes`` correspond to the 1st image. 
                This list is ascending, meaning the elements are
                grouped and increase, for example, the list may look
                like this: ``[0, 0, 1, 2, 3, 3, 3, 3, 4, 4, 5, 6, 6]``.

        Raises:
            ValueError: If the strategy is not supported.

        Returns:
            A tuple where the first element is torch tensor of shape 
            (``num_faces``, ``num_landm`` * 2) representing the selected 
            sets of landmarks and the second element is a list of 
            indices where each index maps a corresponding set of 
            landmarks (face) to an image identified by that index.
        """
        if len(idx) == 0:
            # If no predicted landmarks, return empty lists
            return torch.tensor([], device=landms.device), []
        
        # Init helper variables
        landmarks, indices = [], []
        cache = {"idx": [], "bboxes": [], "landms": []}

        for i in range(len(idx)):
            # Apend everything to cache
            cache["idx"].append(idx[i])
            cache["bboxes"].append(bboxes[i])
            cache["landms"].append(landms[i])

            if i != len(idx) - 1 and cache["idx"][-1] == idx[i + 1]:
                # No operations until cache for current idx is full
                continue

            match self.strategy:
                case "all":
                    # Append all landmarks and indices
                    landmarks.extend(cache["landms"])
                    indices.extend(cache["idx"])
                case "best":
                    # Append the first set of landmarks
                    landmarks.append(cache["landms"][0])
                    indices.append(cache["idx"][0])
                case "largest":
                    # Compute bounding box areas
                    bbs = torch.stack(cache["bboxes"])
                    areas = (bbs[:, 2] - bbs[:, 0] + 1) *\
                            (bbs[:, 3] - bbs[:, 1] + 1)

                    # Append only the largest face landmarks and its idx
                    landmarks.append(cache["landms"][areas.argmax()])
                    indices.append(cache["idx"][0])
                case _:
                    raise ValueError(f"Unsupported startegy: {self.strategy}")
            
            # Clear cache (reinitialize empty lists)
            cache = {k: [] for k in cache.keys()}

        # Stack landmarks across batch dim
        landmarks = torch.stack(landmarks)
    
        return landmarks, indices
    
[docs]    @torch.no_grad()
    def predict(self, images: torch.Tensor) -> tuple[np.ndarray, list[int]]:
        """Predict the sets of landmarks from the image batch.

        This method takes a batch of images, detect all visible faces, 
        predicts bounding boxes and landmarks for each face and then 
        filters those faces according to a specific strategy - see
        :meth:`take_by_strategy` for more info. Finally, it returns 
        those selected sets of landmarks and corresponding indices that 
        map each set to a specific image where the face was originally 
        detected.

        The predicted sets of landmarks are 5-point coordinates (they  
        are specified from an observer's viewpoint, meaning that, for 
        instance, left eye is the eye on the left hand-side of the image 
        rather than the left eye from the person's to whom the eye 
        belongs perspective):

            1. **(x1, y1)** - coordinate of the left eye
            2. **(x2, y2)** - coordinate of the right eye
            3. **(x3, y3)** - coordinate of the nose tip
            4. **(x4, y4)** - coordinate of the left mouth corner
            5. **(x5, y5)** - coordinate of the right mouth corner

        The coordinates are with respect to the sizes of the images 
        (typically padded) provided as an input to this method.

        Args:
            images: Image batch of shape (N, 3, H, W) in RGB form with 
                float values from 0.0 to 255.0. It must be on the same 
                device as this model.

        Returns:
            A tuple where the first element is a numpy array of shape 
            (``num_faces``, 5, 2) representing the selected sets of 
            landmark coordinates and the second element is a list of
            corresponding indices mapping each face to an image it comes
            from.
        """
        # Convert images to appropriate input and perform inference
        x, offset = images[:, [2, 1, 0]], torch.tensor([104, 117, 123])
        scores, bboxes, landms = self(x - offset.view(3, 1, 1).to(x.device))

        # Create prior boxes and scale factors to decode bboxes & landms
        priors = PriorBox((x.size(2), x.size(3))).forward().to(x.device)
        scale_b = torch.tensor([x.size(3), x.size(2)] * 2, device=x.device)
        scale_l = torch.tensor([x.size(3), x.size(2)] * 5, device=x.device)

        # Decode the predictions
        scores = scores[..., 1]
        bboxes = self.decode_bboxes(bboxes, priors) * scale_b
        landms = self.decode_landms(landms, priors) * scale_l

        # Filter out bad predictions, then filter by strategy
        filtered = self.filter_preds(scores, bboxes, landms)
        landmarks, indices = self.take_by_strategy(*filtered)

        # Stack landmarks across batch dim and reshape as coords
        landmarks = landmarks.view(-1, 5, 2).cpu().numpy()

        return landmarks, indices