Source code for face_crop_plus.cropper

import os
import cv2
import tqdm
import torch
import numpy as np

from functools import partial
from collections import defaultdict
from multiprocessing.pool import ThreadPool

from .models import BiSeNet
from .models import RRDBNet
from .models import RetinaFace

from .utils import (
    STANDARD_LANDMARKS_5, 
    parse_landmarks_file, 
    get_ldm_slices, 
    as_numpy, 
    as_tensor,
    read_images,
    as_batch,
)


class Cropper:
    """Face cropper class with bonus features.

    This class is capable of automatically aligning and center-cropping
    faces, enhancing image quality and grouping the extracted faces
    according to specified face attributes, as well as generating masks
    for those attributes.

    Capabilities
    ------------
    This class has the following 3 main features:

        1. **Face cropping** - automatic face alignment and cropping
           based on landmarks. The landmarks can either be predicted
           via a face detection model (see :class:`.RetinaFace`) or
           they can be provided in a txt, csv, json etc. file. It is
           possible to control the face factor in the extracted images
           and the strategy of extraction (e.g., largest face, all
           faces per image).
        2. **Face enhancement** - automatic quality enhancement of
           images where the relative face area is small. For instance,
           there may be images with many faces, but the quality of
           those faces, if zoomed in, is low. The quality enhancement
           feature allows to remove the blurriness. It can also enhance
           the quality of every image, if desired (see
           :class:`.RRDBNet`).
        3. **Face parsing** - automatic face attribute parsing and
           grouping to sub-directories according to selected
           attributes. Attributes can indicate to group faces that
           contain specific properties, e.g., "earrings and necklace",
           "glasses". They can also indicate what properties the faces
           should not include to form a group, e.g., a "no accessories"
           group would indicate to include faces without hats, glasses,
           earrings, necklaces etc. It is also possible to generate
           masks for selected face attributes, e.g., "glasses", "eyes
           and eyebrows". For more intuition on how grouping works, see
           :class:`.BiSeNet` and :meth:`save_groups`.

    The class is designed to perform all or some combination of the
    functions in one go, however, each feature is independent of the
    others and can be applied on its own. For example, it is possible
    to first extract all the faces to some output directory, then apply
    quality enhancement for every face to produce better quality faces
    in another output directory, and then apply face parsing to group
    the faces into different sub-folders according to some common
    attributes in a final output directory. It is possible to configure
    the number of processing units and the batch size for significant
    speedups, if the hardware allows.

    Examples
    --------
    Command line example:

        >>> python face_crop_plus -i path/to/images -o path/to/out/dir

    Auto face cropping (with face factor) and quality enhancement:

        >>> cropper = Cropper(face_factor=0.7, enh_threshold=0.01)
        >>> cropper.process_dir(input_dir="path/to/images")

    Very fast cropping with already known landmarks (no enhancement):

        >>> cropper = Cropper(
        ...     landmarks="path/to/landmarks.txt",
        ...     num_processes=24,
        ...     enh_threshold=None,
        ... )
        >>> cropper.process_dir(input_dir="path/to/images")

    Face cropping to attribute groups to a custom output dir:

        >>> attr_groups = {"glasses": [6], "no_glasses_hats": [-6, -18]}
        >>> cropper = Cropper(attr_groups=attr_groups)
        >>> inp, out = "path/to/images", "path/to/parent/out/dir"
        >>> cropper.process_dir(input_dir=inp, output_dir=out)

    Face cropping and grouping by face attributes (+ generating masks):

        >>> groups = {"glasses": [6], "eyes_and_eyebrows": [2, 3, 4, 5]}
        >>> cropper = Cropper(output_format="png", mask_groups=groups)
        >>> cropper.process_dir("path/to/images")

    For grouping by face attributes, see the documented face attribute
    indices in :class:`.BiSeNet`.
    Class Attributes
    ----------------
    For how to initialize the class and to understand its functionality
    better, please refer to the class attributes initialized via
    :meth:`__init__`. Here, the further class attributes, automatically
    initialized via :meth:`_init_models` and
    :meth:`_init_landmarks_target`, are described.

    Attributes:
        det_model (RetinaFace): Face detection model
            (:class:`torch.nn.Module`) that is capable of detecting
            faces and predicting landmarks used for face alignment.
            See :class:`.RetinaFace`.
        enh_model (RRDBNet): Image quality enhancement model
            (torch.nn.Module) that is capable of enhancing the quality
            of images with faces. It can automatically detect which
            faces to enhance based on the average face area in the
            image, compared to the whole image area. See
            :class:`.RRDBNet`.
        par_model (BiSeNet): Face parsing model (torch.nn.Module) that
            is capable of classifying pixels according to specific face
            attributes, e.g., "left_eye", "earring". It is able to
            group faces to different groups and generate attribute
            masks. See :class:`.BiSeNet`.
        landmarks_target (numpy.ndarray): Standard normalized landmarks
            of shape (``self.num_std_landmarks``, 2). These are scaled
            by ``self.face_factor`` and used as ideal landmark
            coordinates for the extracted faces. In other words, they
            are reference landmarks used to estimate the transformation
            of an image based on some actual set of face landmarks for
            that image.
    """
    def __init__(
        self,
        output_size: int | tuple[int, int] | list[int] = 256,
        output_format: str | None = None,
        resize_size: int | tuple[int, int] | list[int] = 1024,
        face_factor: float = 0.65,
        strategy: str = "largest",
        padding: str = "constant",
        allow_skew: bool = False,
        landmarks: str | tuple[np.ndarray, np.ndarray] | None = None,
        attr_groups: dict[str, list[int]] | None = None,
        mask_groups: dict[str, list[int]] | None = None,
        det_threshold: float | None = 0.6,
        enh_threshold: float | None = None,
        batch_size: int = 8,
        num_processes: int = 1,
        device: str | torch.device = "cpu",
    ):
        """Initializes the cropper.

        Initializes class attributes.

        Args:
            output_size: The output size (width, height) of cropped
                image faces. If provided as a single number, the same
                value is used for both width and height. Defaults to
                256.
            output_format: The output format of the saved face images.
                For available options, see `OpenCV imread
                <https://docs.opencv.org/4.x/d4/da8/group__imgcodecs.html#ga288b8b3da0892bd651fce07b3bbd3a56>`_.
                If not specified, the image extension is not changed,
                i.e., face images will be of the same format as the
                images from which they are extracted. Defaults to None.
            resize_size: The interim size (width, height) each image
                should be resized to before processing. This is used to
                bring the images to a common size so that they can form
                a batch. It should ideally be the mean width and height
                of all the images to be processed (but can simply be a
                square). Images will be resized to the specified size
                while maintaining the aspect ratio (one of the
                dimensions will always match either the specified width
                or height). The shorter dimension is afterwards padded
                - for more information on how it works, see
                :func:`.utils.create_batch_from_files`. Defaults to
                1024.
            face_factor: The fraction of the face area relative to the
                output image. Defaults to 0.65.
            strategy: The strategy to use to extract faces from each
                image. The available options are:

                    * "all" - all faces will be extracted from each
                      image.
                    * "best" - one face with the largest confidence
                      score will be extracted from each image.
                    * "largest" - one face with the largest face area
                      will be extracted from each image.

                For more info, see :meth:`.RetinaFace.__init__`.
                Defaults to "largest".
            padding: The padding type (border mode) to apply when
                cropping out faces. If a face is near an edge, some
                part of the resulting center-cropped face image may be
                blank, in which case it can be padded with specific
                values. For available options, see `OpenCV BorderTypes
                <https://docs.opencv.org/3.4/d2/de8/group__core__array.html#ga209f2f4869e304c82d07739337eae7c5>`_.
                If specified as "constant", the value of 0 will be
                used. Defaults to "constant".
            allow_skew: Whether to allow skewing when aligning the face
                according to its landmarks. If True, then the facial
                points will be matched very closely to the ideal
                standard landmark points (which is a set of reference
                points created internally when performing the
                transformation). If all the faces face forward, i.e.,
                in a portrait-like manner, then this could be set to
                True, which results in minimal perspective changes.
                However, most of the time this should be set to False
                to preserve the face perspective. For more details, see
                :meth:`crop_align`. Defaults to False.
            landmarks: If the landmarks are already known, they should
                be specified via this variable. If specified, landmark
                estimation will not be performed. There are 2 ways to
                specify landmarks:

                    1. As a path to a landmarks file, in which case a
                       str should be provided.
                       The specified file should contain file (image)
                       names and the corresponding landmark
                       coordinates. Duplicate file names are allowed
                       (in case multiple faces are present in the same
                       image). For instance, it could be a .txt file
                       where each row contains space-separated values:
                       the first value is the file name and the other
                       136 values represent landmark coordinates in
                       x1, y1, x2, y2, ... format. For more details
                       about the possible file formats and how they are
                       parsed, see
                       :func:`~.utils.parse_landmarks_file`.
                    2. As a tuple of 2 numpy arrays. The first one is
                       of shape (``num_faces``, ``num_landm``, 2) of
                       type :attr:`numpy.float32` and represents the
                       landmarks of every face that is going to be
                       extracted from the images. The second one is a
                       numpy array of shape (``num_faces``,) of type
                       :class:`numpy.str_` where each value specifies a
                       file name to which a corresponding set of
                       landmarks belongs.

                If not specified, 5 landmark coordinates will be
                estimated for each face automatically. Defaults to
                None.
            attr_groups: Attribute groups dictionary that specifies how
                to group the output face images according to some
                common attributes. The keys are names describing some
                common attribute, e.g., "glasses", "no_accessories",
                and the values specify which attribute indices belong
                (or don't belong, if negative) to that group, e.g.,
                [6], [-6, -9, -15]. For more information, see
                :class:`.BiSeNet` and :meth:`save_groups`. If not
                provided, the output images will not be grouped by
                attributes and no attribute sub-folders will be created
                in the desired output directory. Defaults to None.
            mask_groups: Mask groups dictionary that specifies how to
                group the output face images according to some face
                attributes that make up a segmentation mask. The keys
                are mask type names, e.g., "eyes", and the values
                specify which attribute indices should be considered
                for that mask, e.g., [4, 5]. For every group, not only
                the face images will be saved in a corresponding
                sub-directory, but also the black and white face
                attribute masks (white pixels indicating the presence
                of a mask attribute). For more details, see
                :class:`.BiSeNet` and :meth:`save_groups`. If not
                provided, no grouping is applied. Defaults to None.
            det_threshold: The detection threshold, i.e., the minimum
                confidence score, for a detected face to be considered
                an actual face. See :meth:`.RetinaFace.__init__` for
                more details. If None, no face detection will be
                performed. Defaults to 0.6.
            enh_threshold: Quality enhancement threshold that tells
                when the image quality should be enhanced (it is an
                expensive operation). It is the minimum average face
                factor, i.e., face area relative to the image, below
                which the whole image is enhanced. It is advised to set
                this to a low number, like 0.001 - very high fractions
                might cause the image quality to be improved
                unnecessarily. Defaults to None.
            batch_size: The batch size. It is the maximum number of
                images that can be processed by every processor at a
                single time-step. Large values may result in memory
                errors, especially when GPU acceleration is used.
                Increase this if fewer models (i.e., landmark
                detection, quality enhancement, face parsing models)
                are used and decrease otherwise. Defaults to 8.
            num_processes: The number of processes to launch to perform
                image processing. Each process works in parallel on
                multiple threads, significantly increasing the
                performance speed. Increase if fewer prediction models
                are used and decrease otherwise. Defaults to 1.
            device: The device on which to perform the predictions,
                i.e., landmark detection, quality enhancement and face
                parsing. If landmarks are provided and no enhancement
                or parsing is desired, then this has no effect.
                Defaults to "cpu".
        """
        # Init specified attributes
        self.output_size = output_size
        self.output_format = output_format
        self.resize_size = resize_size
        self.face_factor = face_factor
        self.strategy = strategy
        self.padding = padding
        self.allow_skew = allow_skew
        self.landmarks = landmarks
        self.attr_groups = attr_groups
        self.mask_groups = mask_groups
        self.det_threshold = det_threshold
        self.enh_threshold = enh_threshold
        self.batch_size = batch_size
        self.num_processes = num_processes
        self.device = device

        # The only option for STD
        self.num_std_landmarks = 5

        # Modify attributes to have proper type
        if isinstance(self.output_size, int):
            self.output_size = (self.output_size, self.output_size)

        if len(self.output_size) == 1:
            self.output_size = (self.output_size[0], self.output_size[0])

        if isinstance(self.resize_size, int):
            self.resize_size = (self.resize_size, self.resize_size)

        if len(self.resize_size) == 1:
            self.resize_size = (self.resize_size[0], self.resize_size[0])

        if isinstance(self.device, str):
            self.device = torch.device(device)

        if isinstance(self.landmarks, str):
            self.landmarks = parse_landmarks_file(self.landmarks)

        # Further attributes
        self._init_models()
        self._init_landmarks_target()
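    # A minimal sketch (hypothetical values, not part of the class) of
    # supplying pre-computed landmarks as a tuple of numpy arrays
    # instead of a file path - here, two faces in the same image with
    # 5 points each:
    #
    #     points = np.zeros((2, 5, 2), dtype=np.float32)  # (num_faces, num_landm, 2)
    #     names = np.array(["img_0.jpg", "img_0.jpg"])    # (num_faces,)
    #     cropper = Cropper(landmarks=(points, names), enh_threshold=None)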
    def _init_models(self):
        """Initializes detection, enhancement and parsing models.

        The method initializes 3 models:

            1. If ``self.det_threshold`` is provided and no landmarks
               are known in advance, the detection model is initialized
               to estimate 5-point landmark coordinates. For more info,
               see :class:`.RetinaFace`.
            2. If ``self.enh_threshold`` is provided, the quality
               enhancement model is initialized. For more info, see
               :class:`.RRDBNet`.
            3. If ``self.attr_groups`` or ``self.mask_groups`` is
               provided, the face parsing model is initialized. For
               more info, see :class:`.BiSeNet`.

        Note:
            This is a useful initializer function if multiprocessing is
            used, in which case copies of all the models can be created
            on separate cores.
        """
        # Init models as None
        self.det_model = None
        self.enh_model = None
        self.par_model = None

        if torch.cuda.is_available() and self.device.index is not None:
            # Helps to prevent CUDA memory errors
            torch.cuda.set_device(self.device.index)
            torch.cuda.empty_cache()

        if self.det_threshold is not None and self.landmarks is None:
            # If detection threshold is set, we will predict landmarks
            self.det_model = RetinaFace(self.strategy, self.det_threshold)
            self.det_model.load(device=self.device)

        if self.enh_threshold is not None:
            # If enhancement threshold is set, we might enhance quality
            self.enh_model = RRDBNet(self.enh_threshold)
            self.enh_model.load(device=self.device)

        if self.attr_groups is not None or self.mask_groups is not None:
            # If grouping by attributes or masks is set, use parse model
            args = (self.attr_groups, self.mask_groups, self.batch_size)
            self.par_model = BiSeNet(*args)
            self.par_model.load(device=self.device)
    def _init_landmarks_target(self):
        """Initializes the target landmarks set.

        This method initializes a set of standard landmarks. Standard,
        or target, landmarks refer to an average set of landmarks with
        ideal normalized coordinates for each facial point. The source
        facial points will be rotated, scaled and translated to match
        the standard landmarks as closely as possible. Both the source
        (computed separately for each image) and the target landmarks
        must semantically match, e.g., the left eye coordinate in the
        target landmarks also corresponds to the left eye coordinate in
        the source landmarks.

        There should be a standard landmarks set defined for the
        desired number of landmarks. Each coordinate in that set is
        normalized, i.e., x and y values are between 0 and 1. These
        values are then scaled based on the face factor and resized to
        match the desired output size as defined by
        ``self.output_size``.

        Note:
            Currently, only 5 standard landmarks are supported.

        Raises:
            ValueError: If the number of standard landmarks is not
                supported. The number of standard landmarks is
                ``self.num_std_landmarks``.
        """
        match self.num_std_landmarks:
            case 5:
                # If the number of std landmarks is 5
                std_landmarks = STANDARD_LANDMARKS_5.copy()
            case _:
                # Otherwise the number of STD landmarks is not supported
                raise ValueError(f"Unsupported number of standard landmarks "
                                 f"for estimating alignment transform matrix: "
                                 f"{self.num_std_landmarks}.")

        # Apply appropriate scaling based on face factor and out size
        std_landmarks[:, 0] *= self.output_size[0] * self.face_factor
        std_landmarks[:, 1] *= self.output_size[1] * self.face_factor

        # Add an offset to standard landmarks to center the cropped face
        std_landmarks[:, 0] += (1 - self.face_factor) * self.output_size[0] / 2
        std_landmarks[:, 1] += (1 - self.face_factor) * self.output_size[1] / 2

        # Pass STD landmarks as target landms
        self.landmarks_target = std_landmarks
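    # A worked example of the scaling above (a sketch, assuming
    # output_size=(256, 256) and face_factor=0.65): a normalized
    # x-coordinate of 0.5 maps to
    # 0.5 * 256 * 0.65 + (1 - 0.65) * 256 / 2 = 83.2 + 44.8 = 128.0,
    # i.e., the face center stays at the image center while the face
    # spans 65% of the output width.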
    def crop_align(
        self,
        images: np.ndarray | list[np.ndarray],
        padding: np.ndarray | None,
        indices: list[int],
        landmarks_source: np.ndarray,
    ) -> np.ndarray:
        """Aligns and center-crops faces based on the given landmarks.

        This method takes a batch of images (which can be padded) and
        loops through each image (represented as a numpy array),
        performing the following actions:

            1. Removes the padding.
            2. Estimates the affine transformation from the source
               landmarks to the standard landmarks.
            3. Applies the transformation to align and center-crop the
               face based on the face factor.
            4. Returns a batch of face images represented as numpy
               arrays of the same length as the number of sets of
               landmarks.

        A crucial role in this method is played by
        ``self.landmarks_target``, which is the standard set of
        landmarks used as a reference for the source landmarks. The
        target and source landmark sets are used to estimate the
        transformations of images - each image to which a set of
        landmarks (from the source landmarks batch) belongs is
        transformed such that its landmarks match the standard (target)
        landmark set as closely as possible. For more details about the
        target landmarks, check :meth:`_init_landmarks_target`.

        Note:
            If ``self.allow_skew`` is set to True, then the facial
            points will also be skewed to match
            ``self.landmarks_target`` as closely as possible (resulting
            in, e.g., longer/flatter faces than in the original
            images).

        Args:
            images: Image batch of shape (N, H, W, 3) of type
                :attr:`numpy.uint8` (doesn't matter if RGB or BGR)
                where each nth image is transformed to extract
                face(-s). (H, W) should be ``self.resize_size``. It can
                also be a list of :attr:`numpy.uint8` numpy arrays of
                different shapes.
            padding: Padding of shape (N, 4) where the integer values
                correspond to the number of pixels padded from each
                side: top, bottom, left, right. Padding was originally
                applied to each image, e.g., to make the image square,
                so that all the images could be stacked as a batch.
                Therefore, it is needed here to remove the padding. If
                specified as None, it is assumed that the images are
                un-padded.
            indices: Indices list of length ``num_faces`` where each
                index specifies which image is used to extract faces
                for each set of landmarks in ``landmarks_source``.
            landmarks_source: Landmarks batch of shape (``num_faces``,
                ``self.num_std_landmarks``, 2). These are the landmark
                sets of all the desired faces to extract from the given
                batch of N images.

        Returns:
            A batch of aligned and center-cropped faces where the
            factor of the area of a face relative to the whole face
            image area is ``self.face_factor``. The output is a numpy
            array of shape (``num_faces``, H, W, 3) of type
            :attr:`numpy.uint8` (same channel structure as for the
            input images). (H, W) is defined by ``self.output_size``.
""" # Init list, border mode transformed_images = [] border_mode = getattr(cv2, f"BORDER_{self.padding.upper()}") for landmarks_idx, image_idx in enumerate(indices): if self.allow_skew: # Perform full perspective transformation transform_function = cv2.estimateAffine2D else: # Preform only rotation, scaling and translation transform_function = cv2.estimateAffinePartial2D # Estimate transformation matrix to apply transform_matrix = transform_function( landmarks_source[landmarks_idx], self.landmarks_target, ransacReprojThreshold=np.inf, )[0] if transform_matrix is None: # Could not estimate continue # Retrieve current image image = images[image_idx] if padding is not None: # Crop out the un-padded area [t, b, l, r] = padding[image_idx] image = image[t:image.shape[0]-b, l:image.shape[1]-r] # Apply affine transformation to the image transformed_images.append(cv2.warpAffine( image, transform_matrix, self.output_size, borderMode=border_mode )) # Normally stacking would be applied unless the list is empty numpy_fn = np.stack if len(transformed_images) > 0 else np.array return numpy_fn(transformed_images)
    def save_group(
        self,
        faces: np.ndarray,
        file_names: list[str],
        output_dir: str,
    ):
        """Saves a group of images to the output directory.

        Takes in a batch of faces or masks, as well as the
        corresponding file names from which the faces were extracted,
        and saves the faces/masks to the specified output directory
        with the same names as those image files (appending counter
        suffixes if multiple faces come from the same file). If the
        batch of face images/masks is empty, then the output directory
        is not created either.

        Args:
            faces: Face images (cropped and aligned) represented as a
                numpy array of shape (N, H, W, 3) with values of type
                :attr:`numpy.uint8` ranging from 0 to 255. It may also
                be a batch of face masks of shape (N, H, W) with values
                of 255 where some face attribute is present and 0
                elsewhere.
            file_names: The list of file names of length N. Each face
                comes from a specific file whose name is also used to
                save the extracted face. If ``self.strategy`` allows
                multiple faces to be extracted from the same file, such
                as "all", counters are added at the end of the file
                names.
            output_dir: The output directory to save ``faces`` to.
        """
        if len(faces) == 0:
            # Just return
            return

        # Create output directory, name counts
        os.makedirs(output_dir, exist_ok=True)
        file_name_counts = defaultdict(lambda: -1)

        for face, file_name in zip(faces, file_names):
            # Split each file name to base name, ext
            name, ext = os.path.splitext(file_name)

            if self.output_format is not None:
                # If specific img format given
                ext = '.' + self.output_format

            if self.strategy == "all":
                # Attach numbering to file names
                file_name_counts[file_name] += 1
                name += f"_{file_name_counts[file_name]}"

            if face.ndim == 3:
                # If it's a colored img (not a mask), to BGR
                face = cv2.cvtColor(face, cv2.COLOR_RGB2BGR)

            # Make image path based on file format and save
            file_path = os.path.join(output_dir, name + ext)
            cv2.imwrite(file_path, face)
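    # A sketch of the naming behavior above (hypothetical data),
    # assuming strategy="all" and two faces coming from the same file:
    #
    #     faces = np.zeros((2, 256, 256, 3), dtype=np.uint8)
    #     cropper.save_group(faces, ["selfie.jpg", "selfie.jpg"], "out")
    #     # -> saves out/selfie_0.jpg and out/selfie_1.jpg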
    def save_groups(
        self,
        faces: np.ndarray,
        file_names: np.ndarray,
        output_dir: str,
        attr_groups: dict[str, list[int]] | None,
        mask_groups: dict[str, tuple[list[int], np.ndarray]] | None,
    ):
        """Saves images (and masks) group-wise.

        This method takes a batch of face images of equal dimensions, a
        batch of file names identifying which image each face comes
        from, and, optionally, attribute and/or mask groups telling how
        to split the face images (and masks) across different folders.
        This method then loops through all the groups and saves the
        images accordingly.

        Example 1:
            If neither ``attr_groups`` nor ``mask_groups`` are
            provided, the face images will be saved according to this
            structure::

                ├── output_dir
                |    ├── face_image_0.jpg
                |    ├── face_image_1.png
                |    ...

        Example 2:
            If only ``attr_groups`` is provided (the keys are names
            describing common attributes across the faces in that group
            and they are also sub-directories of ``output_dir``), the
            structure is as follows::

                ├── output_dir
                |    ├── attribute_group_1
                |    |    ├── face_image_0.jpg
                |    |    ├── face_image_1.png
                |    |    ...
                |    ├── attribute_group_2
                |    ...

        Example 3:
            If only ``mask_groups`` is provided (the keys are names
            describing the mask type and they are also sub-directories
            of ``output_dir``), the structure is as follows::

                ├── output_dir
                |    ├── group_1
                |    |    ├── face_image_0.jpg
                |    |    ├── face_image_1.png
                |    |    ...
                |    ├── group_1_mask
                |    |    ├── face_image_0.jpg
                |    |    ├── face_image_1.png
                |    |    ...
                |    ├── group_2
                |    |    ...
                |    ├── group_2_mask
                |    |    ...
                |    ...

        Example 4:
            If both ``attr_groups`` and ``mask_groups`` are provided,
            then all the images and masks will first be grouped by
            attributes and then by mask groups. The structure is then
            as follows::

                ├── output_dir
                |    ├── attribute_group_1
                |    |    ├── group_1
                |    |    |    ├── face_image_0.jpg
                |    |    |    ├── face_image_1.png
                |    |    |    ...
                |    |    ├── group_1_mask
                |    |    |    ├── face_image_0.jpg
                |    |    |    ├── face_image_1.png
                |    |    |    ...
                |    |    ├── group_2
                |    |    |    ...
                |    |    ├── group_2_mask
                |    |    |    ...
                |    |    ...
                |    ├── attribute_group_2
                |    |    ...
                |    ...

        Args:
            faces: Face images (cropped and aligned) represented as a
                numpy array of shape (N, H, W, 3) with values of type
                :attr:`numpy.uint8` ranging from 0 to 255.
            file_names: File names of the images from which the faces
                were extracted. This value is a numpy array of shape
                (N,) with values of type :class:`numpy.str_`. Each nth
                face in ``faces`` maps to exactly one nth file name in
                this array, thus there may be duplicate file names
                (because different faces may come from the same file).
            output_dir: The output directory where the faces or the
                folders of faces will be saved to.
            attr_groups: Face groups by attributes. Each key represents
                the group name (describing the common attributes across
                the faces in that group) and each value is a list of
                indices identifying the faces (from ``faces``) that
                should go to that group.
            mask_groups: Face groups by extracted masks. Each key
                represents the group name (describing the mask type)
                and each value is a tuple where the first element is a
                list of indices identifying the faces (from ``faces``)
                that should go to that group and the second element is
                a batch of masks corresponding to the indexed faces,
                represented as a numpy array of shape (N, H, W) with
                values of type :attr:`numpy.uint8` being either 0
                (negative) or 255 (positive).
""" if attr_groups is None: # No-name group of idx mapping to all faces attr_groups = {'': list(range(len(faces)))} if mask_groups is None: # No-name group mapping to all faces, with no masks mask_groups = {'': (list(range(len(faces))), None)} for attr_name, attr_indices in attr_groups.items(): for mask_name, (mask_indices, masks) in mask_groups.items(): # Make mask group values that fall under attribute group group_idx = list(set(attr_indices) & set(mask_indices)) group_dir = os.path.join(output_dir, attr_name, mask_name) # Retrieve group values & save face_group = [faces[idx] for idx in group_idx] file_name_group = file_names[group_idx] self.save_group(face_group, file_name_group, group_dir) if masks is not None: # Save to masks dir group_dir += "_mask" masks = masks[[mask_indices.index(i) for i in group_idx]] self.save_group(masks, file_name_group, group_dir)
    def process_batch(
        self,
        file_names: list[str],
        input_dir: str,
        output_dir: str,
    ):
        """Extracts faces from a batch of images and saves them.

        Takes file names and an input directory, reads the images,
        extracts the faces and saves them to the output directory. This
        method works as follows:

            1. *Batch generation* - a batch of images from the given
               file names is generated. Each image is padded and
               resized to ``self.resize_size`` while keeping the same
               aspect ratio.
            2. *Landmark detection* - the detection model is used to
               predict 5 landmarks for each face in each image, unless
               the landmarks were already initialized or face alignment
               + cropping is not needed.
            3. *Image enhancement* - some images are enhanced if the
               faces, compared with the image size, are small. If the
               landmarks are None, i.e., if no alignment + cropping was
               desired, all the images are enhanced. Enhancement is not
               done if ``self.enh_threshold`` is None.
            4. *Image grouping* - each face image is parsed, i.e., a
               map of face attributes is generated. Based on those
               attributes, each face image is put to a corresponding
               group. There may also be mask groups, in which case
               masks for each image in that group are also generated.
               Faces are not parsed if ``self.attr_groups`` and
               ``self.mask_groups`` are both None.
            5. *Image saving* - each face image (and a potential mask)
               is saved according to the group structure (if there is
               any).

        Note:
            If the detection model is not used, then the batch is just
            a list of loaded images of different dimensions.

        Args:
            file_names: The list of image file names (not full paths).
                All the images should be in the same directory.
            input_dir: Path to the input directory with image files.
            output_dir: Path to the output directory to save the
                extracted face images to.
        """
        # Read images and filter valid corresponding file names
        images, file_names = read_images(file_names, input_dir)

        if self.landmarks is None and self.det_model is None:
            # One-to-one image to index mapping and no landmarks
            indices, landmarks = list(range(len(file_names))), None
        elif self.landmarks is not None:
            # Initialize empty idx lists and None paddings
            indices, indices_ldm, paddings = [], [], None

            for i, file_name in enumerate(file_names):
                # Check the indices of landmark sets in landmarks file
                indices_i = np.where(file_name == self.landmarks[1])[0]

                if len(indices_i) == 0:
                    # Has no landmarks
                    continue

                # Update img & ldm file name indices
                indices.extend([i] * len(indices_i))
                indices_ldm.extend(indices_i.tolist())

            # Set landmarks according to the indices
            landmarks = self.landmarks[0][indices_ldm]
        elif self.det_model is not None:
            # Create a batch of images (with faces) and their paddings
            images, _, paddings = as_batch(images, self.resize_size)
            images = as_tensor(images, self.device)

            # If landmarks were not given, predict, undo padding
            landmarks, indices = self.det_model.predict(images)
            landmarks -= paddings[indices][:, None, [2, 0]]

        if landmarks is not None and len(landmarks) == 0:
            # Nothing to save
            return

        if (landmarks is not None
                and landmarks.shape[1] != self.num_std_landmarks):
            # Compute the mean landmark coordinates from retrieved slices
            slices = get_ldm_slices(self.num_std_landmarks, landmarks.shape[1])
            landmarks = np.stack([landmarks[:, s].mean(1) for s in slices], 1)

        if self.enh_model is not None:
            # Enhance some images
            images = as_tensor(images, self.device)
            images = self.enh_model.predict(images, landmarks, indices)

        # Convert to numpy images, initialize groups
        images, groups = as_numpy(images), (None, None)

        if landmarks is not None:
            # Generate source & target landmarks, estimate & apply transform
            images = self.crop_align(images, paddings, indices, landmarks)

        if self.par_model is not None:
            # Predict attribute and mask groups if face parsing desired
            groups = self.par_model.predict(as_tensor(images, self.device))

        # Pick file names for each face, save faces (by groups if exist)
        self.save_groups(images, file_names[indices], output_dir, *groups)
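    # A minimal sketch of calling this method directly for a single
    # batch (hypothetical paths; process_dir below is the usual entry
    # point and handles batching and multiprocessing):
    #
    #     cropper = Cropper(det_threshold=0.6, device="cpu")
    #     cropper.process_batch(
    #         ["img_0.jpg", "img_1.jpg"],  # file names inside input_dir
    #         input_dir="path/to/images",
    #         output_dir="path/to/faces",
    #     )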
    def process_dir(
        self,
        input_dir: str,
        output_dir: str | None = None,
        desc: str | None = "Processing",
    ):
        """Processes images in the specified input directory.

        Splits all the file names in the input directory into batches
        and processes the batches on multiple cores. For every file
        name batch, images are loaded, some are optionally enhanced,
        landmarks are generated and used to optionally align and
        center-crop the faces, and grouping is optionally applied based
        on face attributes. For more details, check
        :meth:`process_batch`.

        Note:
            There might be a few seconds delay before the actual
            processing starts if there are a lot of files in the
            directory - it takes some time to split all the file names
            into batches.

        Args:
            input_dir: Path to the input directory with image files.
            output_dir: Path to the output directory to save the
                extracted (and optionally grouped to sub-directories)
                face images to. If None, then the same path as for
                ``input_dir`` is used, with a "_faces" suffix
                additionally added to the name. Defaults to None.
            desc: The description to use for the progress bar. If
                specified as None, no progress bar is shown. Defaults
                to "Processing".
        """
        if output_dir is None:
            # Create a default output dir name
            output_dir = input_dir + "_faces"

        # Create batches of image file names in input dir
        files, bs = os.listdir(input_dir), self.batch_size
        file_batches = [files[i:i+bs] for i in range(0, len(files), bs)]

        if len(file_batches) == 0:
            # Empty
            return

        # Define worker function and its additional arguments
        kwargs = {"input_dir": input_dir, "output_dir": output_dir}
        worker = partial(self.process_batch, **kwargs)

        with ThreadPool(self.num_processes, self._init_models) as pool:
            # Create imap object and apply workers to it
            imap = pool.imap_unordered(worker, file_batches)

            if desc is not None:
                # If description is provided, wrap progress bar around
                imap = tqdm.tqdm(imap, total=len(file_batches), desc=desc)

            # Process
            list(imap)
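# A minimal end-to-end usage sketch (hypothetical paths), assuming the
# face_crop_plus package is installed and this module is importable:
#
#     if __name__ == "__main__":
#         cropper = Cropper(output_size=512, strategy="all")
#         cropper.process_dir("path/to/images")  # -> path/to/images_faces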