import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from ._layers import LoadMixin, ContextPath, FeatureFusionModule, BiSeNetOutput

class BiSeNet(nn.Module, LoadMixin):
    """Face attribute parser.

    This class is capable of predicting scores for 19 attributes for
    face images. After it identifies the closest attribute for each
    pixel, it can also put the whole face image to a corresponding
    attribute or mask group.

    The 19 attributes are as follows (attributes are indicated from a
    person's face perspective, meaning, for instance, left eye is the
    eye on the right hand-side of the picture, however, sides are not
    always accurate):

        * 0 - neutral
        * 1 - skin
        * 2 - left eyebrow
        * 3 - right eyebrow
        * 4 - left eye
        * 5 - right eye
        * 6 - eyeglasses
        * 7 - left ear
        * 8 - right ear
        * 9 - earring
        * 10 - nose
        * 11 - mouth
        * 12 - upper lip
        * 13 - lower lip
        * 14 - neck
        * 15 - necklace
        * 16 - clothes
        * 17 - hair
        * 18 - hat

    Some examples of grouping by attributes:

        * ``'glasses': [6]`` - this will put each face image that
          contains pixels labeled as 6 to a category called 'glasses'.
        * ``'earings_and_necklace': [9, 15]`` - this will put each
          image that contains pixels labeled as 9 and also contains
          pixels labeled as 15 to a category called
          'earings_and_necklace'.
        * ``'no_accessories': [-6, -9, -15, -18]`` - this will put each
          face image that does not contain pixels labeled as either 6,
          9, 15, or 18 to a category called 'no_accessories'.

    Some examples of grouping by mask:

        * ``'nose': [10]`` - this will put each face image that
          contains pixels labeled as 10 to a category called 'nose'
          and generate a corresponding mask.
        * ``'eyes_and_eyebrows': [2, 3, 4, 5]`` - this will put each
          image that contains pixels labeled as either 2, 3, 4, or 5
          (or any combination of them) to a category called
          'eyes_and_eyebrows' and generate a corresponding mask.

    This class also inherits ``load`` method from ``LoadMixin`` class.
    The method takes a device on which to load the model and loads the
    model with a default state dictionary loaded from
    ``WEIGHTS_FILENAME`` file. It sets this model to eval mode and
    disables gradients.

    For more information on how BiSeNet model works, see the
    "Face Parsing PyTorch" repository. Most of the code was taken from
    that repository.

    Note:
        Whenever an input shape is mentioned, N corresponds to batch
        size, C corresponds to the number of channels, H - to input
        height, and W - to input width.

    By default, this class initializes the following attributes which
    can be changed after initialization of the class (but, typically,
    should not be changed):

    Attributes:
        attr_join_by_and (bool): Whether to add a face image to an
            attribute group if the face meets all the specified
            attributes in a list (joined by and) or at least one of
            the attributes (joined by or). Please read the definition
            of `attr_groups` to get a clearer picture. In most cases,
            this should be set True - if the attributes in a group
            list are negative, this will ensure the selected face will
            match none of the specified attributes. Also, if you want
            to join the attributes by or (any), then separate
            single-attribute groups can be created and manually merged
            into one. Defaults to True.
        attr_threshold (int): Threshold, based on which the attribute
            is determined as present in the face image. For instance,
            if the threshold is 5, then at least 6 pixels must be
            labeled of the same kind of attribute for that attribute
            to be considered present in the face image. Defaults to 5.
        mask_threshold (int): Threshold, based on which the mask is
            considered to be a proper mask. For instance, if the
            threshold is 15, then face images for which the number of
            pixels with the values corresponding to a specified mask
            group (face attributes) is less than or equal to 15 will
            be ignored and image-mask pair for that mask category will
            not be generated. Defaults to 15.
        mean (list[float]): The list of mean values for each input
            channel. The pixel values should be shifted by those
            quantities during inference since this normalization was
            applied during training. Defaults to
            [0.485, 0.456, 0.406].
        std (list[float]): The list of standard deviation values for
            each input channel. The pixel values should be scaled by
            those quantities during inference since this normalization
            was applied during training. Defaults to
            [0.229, 0.224, 0.225].
    """

    #: WEIGHTS_FILENAME (str): The constant specifying the name of
    #: ``.pth`` file from which the weights for this model should be
    #: loaded. Defaults to "bise_parser.pth".
    WEIGHTS_FILENAME = "bise_parser.pth"

    def __init__(
        self,
        attr_groups: dict[str, list[int]] | None = None,
        mask_groups: dict[str, list[int]] | None = None,
        max_batch_size: int = 8,
    ):
        """Initializes BiSeNet model.

        First it assigns the passed values as attributes. Then this
        method initializes BiSeNet layers required for face parsing,
        i.e., labeling face parts.

        Note:
            Check class definition for the possible face attribute
            values and examples of groups. Also note that all the
            specified variables here are mainly relevant only for
            :meth:`predict`.

        Args:
            attr_groups: Dictionary specifying attribute groups, based
                on which the face images should be grouped. Each key
                represents an attribute group name, e.g., 'glasses',
                'earings_and_necklace', 'no_accessories', and each
                value represents attribute indices, e.g., `[6]`,
                `[9, 15]`, `[-6, -9, -15, -18]`, each index mapping to
                some attribute. Since this model labels face image
                pixels, if there is enough pixels with the specified
                values in the list, the whole face image will be put
                into that attribute category. For negative values, it
                will be checked that the labeled face image does not
                contain those (absolute) values. If it is None, then
                there will be no grouping according to attributes.
                Defaults to None.
            mask_groups: Dictionary specifying mask groups, based on
                which the face images and their masks should be
                grouped. Each key represents a mask group name, e.g.,
                'nose', 'eyes_and_eyebrows', and each value represents
                attribute indices, e.g., `[10]`, `[2, 3, 4, 5]`, each
                index mapping to some attribute. Since this model
                labels face image pixels, a mask will be created with
                ones at pixels that match the specified attributes and
                zeros elsewhere. Note that negative values make no
                sense here: they are compared to the predicted labels
                as-is (no absolute value is taken), so they should not
                be used. If it is None, then there will be no grouping
                according to mask groups. Defaults to None.
            max_batch_size: The maximum batch size used when
                performing inference. There may be a lot of faces in a
                single batch, thus splitting to sub-batches for
                inference and then merging back predictions is a way
                to deal with memory errors. This is a convenience
                variable because batch size typically corresponds to
                the number of images for a single inference, but the
                input given in :meth:`predict` might have a larger
                batch size because it represents the number of faces,
                many of which can come from just a single image.
                Defaults to 8.
        """
        super().__init__()

        # Initialize class attributes
        self.attr_groups = attr_groups
        self.mask_groups = mask_groups
        self.batch_size = max_batch_size

        # Tunable defaults (kept in sync with the class docstring)
        self.attr_join_by_and = True
        self.attr_threshold = 5
        self.mask_threshold = 15
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]

        # Init model layers
        self.cp = ContextPath()
        self.ffm = FeatureFusionModule(256, 256)
        self.conv_out = BiSeNetOutput(256, 256, 19)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Performs forward pass.

        Takes an input batch and performs inference based on the
        modules it has. The input is a batch of face images and the
        output is a corresponding batch of pixel-wise attribute
        scores.

        Args:
            x: The input tensor of shape (N, 3, H, W).

        Returns:
            An output tensor of shape (N, 19, H, W) where each channel
            corresponds to a specific attribute and each value at
            (H, W) is an unbounded confidence score.
        """
        # Generate final features from layers
        feat_out = self.conv_out(self.ffm(*self.cp(x)))

        # Upscale the scores back to the spatial size of the input
        return F.interpolate(
            feat_out,
            size=x.shape[2:],
            mode="bilinear",
            align_corners=True,
        )

    def group_by_attributes(
        self,
        parse_preds: torch.Tensor,
        attr_groups: dict[str, list[int]],
        offset: int,
    ) -> dict[str, list[int]]:
        """Groups parse predictions by face attributes.

        Takes parse predictions for each face where each pixel
        corresponds to some attribute group (the integer value
        indicates that group) and extends the groups in attribute
        dictionary to include more samples that match the group.

        The group specifications are read from ``self.attr_groups``. A
        non-negative index (including 0) requires the attribute to be
        present, i.e., more than ``self.attr_threshold`` pixels carry
        that label; a negative index requires the attribute with the
        absolute value of that index to be absent. The per-attribute
        checks are combined with "all" or "any" depending on
        ``self.attr_join_by_and``.

        Args:
            parse_preds: Face parsing predictions of shape (N, H, W)
                with integer values indicating pixel categories.
            attr_groups: The dictionary with keys corresponding to
                attribute group names (they match ``self.attr_groups``
                keys) and values corresponding to indices that map
                face images from other batches of ``parse_preds`` to
                the corresponding group. This is the dictionary that
                is extended (in place) and returned.
            offset: The offset to add to each index. Originally, the
                indices will correspond only to the face parsings in
                the current ``parse_preds`` batch and the offset
                allows to generalize each index by offsetting it by
                the previous number of processed face parsings, i.e.,
                the offset is the number of previous batches
                (``parse_preds``) times the batch size.

        Returns:
            Similar to ``attr_groups``, it is the dictionary with the
            same keys but values (which are lists of indices) may be
            extended with additional indices.
        """
        # Specify function/criteria to join the attributes in a list
        att_join = torch.all if self.attr_join_by_and else torch.any
        device = parse_preds.device

        for k, v in self.attr_groups.items():
            # Count the pixels of every attribute of the group per face
            attr = torch.tensor(v, device=device).view(1, -1, 1, 1)
            counts = (parse_preds.unsqueeze(1) == attr.abs()).sum(dim=(2, 3))

            # An attribute is present if enough pixels carry its label
            present = counts > self.attr_threshold

            # Non-negative index -> must be present, negative -> absent
            wanted = torch.tensor(
                [a >= 0 for a in v],
                dtype=torch.bool,
                device=device,
            )

            # Compare each face against each attribute in a group
            matches = att_join(present == wanted, dim=1)

            # Add indices of those faces which match the group attribute
            inds = (torch.nonzero(matches).flatten() + offset).tolist()
            attr_groups[k].extend(inds)

        return attr_groups

    def group_by_masks(
        self,
        parse_preds: torch.Tensor,
        mask_groups: dict[str, tuple[list[int], list[np.ndarray]]],
        offset: int,
    ) -> dict[str, tuple[list[int], list[np.ndarray]]]:
        """Groups parse predictions by face mask attributes.

        Takes parse predictions for each face where each pixel
        corresponds to some parse/mask group (the integer value
        indicates that group) and extends the groups in mask
        dictionary to include more samples that match the group.

        The group specifications are read from ``self.mask_groups``. A
        face is kept for a group only if more than
        ``self.mask_threshold`` of its pixels match the group.

        Args:
            parse_preds: Face parsing predictions of shape (N, H, W)
                with integer values indicating pixel categories.
            mask_groups: The dictionary with keys corresponding to
                mask group names (they match ``self.mask_groups``
                keys) and values corresponding to tuples where the
                first value is a list of indices that map face images
                from other batches of ``parse_preds`` to the
                corresponding group and the second is a list of
                corresponding masks as numpy arrays of shape (H, W) of
                type :attr:`numpy.uint8` with 255 at pixels that match
                the mask group specification and 0 elsewhere. This is
                the dictionary that is extended (in place) and
                returned.
            offset: The offset to add to each index. Originally, the
                indices will correspond only to the face parsings in
                the current ``parse_preds`` batch and the offset
                allows to generalize each index by offsetting it by
                the previous number of processed face parsings, i.e.,
                the offset is the number of previous batches
                (``parse_preds``) times the batch size.

        Returns:
            Similar to ``mask_groups``, it is the dictionary with the
            same keys but values (which are tuples of a list of
            indices and a list of masks) may be extended with
            additional indices and masks.
        """
        # Retrieve threshold (shorter name)
        threshold = self.mask_threshold

        for k, v in self.mask_groups.items():
            # Get the list of face attributes per group and check match
            attr = torch.tensor(v, device=parse_preds.device).view(1, -1, 1, 1)
            mask = (parse_preds.unsqueeze(1) == attr).any(dim=1)

            # Identify proper masks with a single reduction over all faces
            keep = mask.sum(dim=(1, 2)) > threshold
            inds = torch.nonzero(keep).flatten().tolist()

            # Convert the kept masks to numpy image type (0 or 255)
            masks = mask[keep].mul(255).cpu().numpy().astype(np.uint8)

            # Extend the lists of indices and masks for k group
            mask_groups[k][0].extend(i + offset for i in inds)
            mask_groups[k][1].extend(masks)

        return mask_groups

    @torch.no_grad()
    def predict(
        self,
        images: torch.Tensor | list[torch.Tensor],
    ) -> tuple[dict[str, list[int]] | None,
               dict[str, tuple[list[int], list[np.ndarray]]] | None]:
        """Predicts attribute and mask groups for face images.

        This method takes a batch of face images and groups them
        according to the specifications in ``self.attr_groups`` and
        ``self.mask_groups``. For more information on how it works,
        see this class' specification :class:`BiSeNet`. It returns 2
        group maps - one for grouping face images to different
        attribute categories, e.g., 'with glasses', 'no accessories'
        and the other for grouping images to different mask groups,
        e.g., 'nose', 'lips and mouth'.

        Args:
            images: Image batch of shape (N, 3, H, W) in RGB form with
                float values from 0.0 to 255.0. It must be on the same
                device as this model. A list of tensors can also be
                provided, however, they all must have the same spatial
                dimensions to be stack-able to a single batch.

        Returns:
            A tuple of 2 dictionaries (either can be None):

                1. ``attr_groups`` - each key represents attribute
                   category and each value is a list of indices
                   indicating which samples from ``images`` batch
                   belong to that category. It can be None if
                   ``self.attr_groups`` is None.
                2. ``mask_groups`` - each key represents attribute
                   (mask) category and each value is a tuple where the
                   first element is a list of indices indicating which
                   samples from ``images`` batch belong to that mask
                   group and the second element is a corresponding
                   batch of masks of shape (N, H, W) of type
                   :attr:`numpy.uint8` with values of either 0 or 255.
                   The masks are presented in that order as the
                   indices indicate which face images to take for that
                   mask group. It can be None if ``self.mask_groups``
                   is None.
        """
        # Initialize groups as None, a helper offset
        attr_groups, mask_groups, offset = None, None, 0

        if self.attr_groups is not None:
            # Initialize an empty dictionary of attribute groups
            attr_groups = {k: [] for k in self.attr_groups}

        if self.mask_groups is not None:
            # Initialize an empty dictionary of mask groups
            mask_groups = {k: ([], []) for k in self.mask_groups}

        if isinstance(images, list):
            # Stack the list of tensors
            images = torch.stack(images)

        # Convert mean and std to tensors and reshape, resize images
        mean = torch.tensor(self.mean, device=images.device).view(1, 3, 1, 1)
        std = torch.tensor(self.std, device=images.device).view(1, 3, 1, 1)
        x = F.interpolate(images.div(255), (512, 512), mode="bilinear")

        # Predictions are resized back to the original spatial size
        out_size = images.shape[2:]

        for sub_x in torch.split(x, self.batch_size):
            # Inference and resize back
            o = self((sub_x - mean) / std)
            o = F.interpolate(o, out_size, mode="nearest").argmax(1)

            if self.attr_groups is not None:
                # Extend each attribute group based on predictions
                attr_groups = self.group_by_attributes(o, attr_groups, offset)

            if self.mask_groups is not None:
                # Extend each mask group based on predictions
                mask_groups = self.group_by_masks(o, mask_groups, offset)

            # Increment offset
            offset += len(sub_x)

        if attr_groups is not None:
            # Filter out groups for which the list of indices is empty
            attr_groups = {k: v for k, v in attr_groups.items() if len(v) > 0}

        if mask_groups is not None:
            # Filter out empty groups, stack the rest to (N, H, W) arrays
            mask_groups = {
                k: (v[0], np.stack(v[1]))
                for k, v in mask_groups.items()
                if len(v[1]) > 0
            }

        return attr_groups, mask_groups