1. support FaceDetectionPipeline inference
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9470723
master
| @@ -121,6 +121,7 @@ source.sh | |||
| tensorboard.sh | |||
| .DS_Store | |||
| replace.sh | |||
| result.png | |||
| # Pytorch | |||
| *.pth | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:aa3963d1c54e6d3d46e9a59872a99ed955d4050092f5cfe5f591e03d740b7042 | |||
| size 653006 | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:48e541daeb2692907efef47018e41abb5ae6bcd88eb5ff58290d7fe5dc8b2a13 | |||
| size 462584 | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:e9565b43d9f65361b9bad6553b327c2c6f02fd063a4c8dc0f461e88ea461989d | |||
| size 357166 | |||
| @@ -10,6 +10,7 @@ class Models(object): | |||
| Model name should only contain model info but not task info. | |||
| """ | |||
| # vision models | |||
| scrfd = 'scrfd' | |||
| classification_model = 'ClassificationModel' | |||
| nafnet = 'nafnet' | |||
| csrnet = 'csrnet' | |||
| @@ -67,6 +68,7 @@ class Pipelines(object): | |||
| action_recognition = 'TAdaConv_action-recognition' | |||
| animal_recognation = 'resnet101-animal_recog' | |||
| cmdssl_video_embedding = 'cmdssl-r2p1d_video_embedding' | |||
| face_detection = 'resnet-face-detection-scrfd10gkps' | |||
| live_category = 'live-category' | |||
| general_image_classification = 'vit-base_image-classification_ImageNet-labels' | |||
| daily_image_classification = 'vit-base_image-classification_Dailylife-labels' | |||
| @@ -76,6 +78,7 @@ class Pipelines(object): | |||
| image_super_resolution = 'rrdb-image-super-resolution' | |||
| face_image_generation = 'gan-face-image-generation' | |||
| style_transfer = 'AAMS-style-transfer' | |||
| face_recognition = 'ir101-face-recognition-cfglint' | |||
| image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation' | |||
| image2image_translation = 'image-to-image-translation' | |||
| live_category = 'live-category' | |||
| @@ -1,5 +1,6 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from . import (action_recognition, animal_recognition, cartoon, | |||
| cmdssl_video_embedding, face_generation, image_classification, | |||
| image_color_enhance, image_colorization, image_denoise, | |||
| image_instance_segmentation, super_resolution, virual_tryon) | |||
| cmdssl_video_embedding, face_detection, face_generation, | |||
| image_classification, image_color_enhance, image_colorization, | |||
| image_denoise, image_instance_segmentation, | |||
| image_to_image_translation, super_resolution, virual_tryon) | |||
| @@ -0,0 +1,5 @@ | |||
| """ | |||
| mmdet_patch is based on | |||
| https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet, | |||
| all duplicate functions from official mmdetection are removed. | |||
| """ | |||
| @@ -0,0 +1,3 @@ | |||
| from .transforms import bbox2result, distance2kps, kps2distance | |||
| __all__ = ['bbox2result', 'distance2kps', 'kps2distance'] | |||
| @@ -0,0 +1,86 @@ | |||
| """ | |||
| based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox/transforms.py | |||
| """ | |||
| import numpy as np | |||
| import torch | |||
| def bbox2result(bboxes, labels, num_classes, kps=None): | |||
| """Convert detection results to a list of numpy arrays. | |||
| Args: | |||
| bboxes (torch.Tensor | np.ndarray): shape (n, 5) | |||
| labels (torch.Tensor | np.ndarray): shape (n, ) | |||
| num_classes (int): class number, including background class | |||
| kps (torch.Tensor | np.ndarray, optional): face keypoints of shape | |||
| (n, 10); when given they are appended to each bbox row | |||
| Returns: | |||
| list(ndarray): bbox results of each class | |||
| """ | |||
| bbox_len = 5 if kps is None else 5 + 10  # with kps, append the 10 keypoint values to each bbox | |||
| if bboxes.shape[0] == 0: | |||
| return [ | |||
| np.zeros((0, bbox_len), dtype=np.float32) | |||
| for i in range(num_classes) | |||
| ] | |||
| else: | |||
| if isinstance(bboxes, torch.Tensor): | |||
| bboxes = bboxes.detach().cpu().numpy() | |||
| labels = labels.detach().cpu().numpy() | |||
| if kps is None: | |||
| return [bboxes[labels == i, :] for i in range(num_classes)] | |||
| else: # with kps | |||
| if isinstance(kps, torch.Tensor): | |||
| kps = kps.detach().cpu().numpy() | |||
| return [ | |||
| np.hstack([bboxes[labels == i, :], kps[labels == i, :]]) | |||
| for i in range(num_classes) | |||
| ] | |||
| def distance2kps(points, distance, max_shape=None): | |||
| """Decode distance prediction to bounding box. | |||
| Args: | |||
| points (Tensor): Shape (n, 2), [x, y]. | |||
| distance (Tensor): Distance from the given point to 4 | |||
| boundaries (left, top, right, bottom). | |||
| max_shape (tuple): Shape of the image. | |||
| Returns: | |||
| Tensor: Decoded kps. | |||
| """ | |||
| preds = [] | |||
| for i in range(0, distance.shape[1], 2): | |||
| px = points[:, i % 2] + distance[:, i] | |||
| py = points[:, i % 2 + 1] + distance[:, i + 1] | |||
| if max_shape is not None: | |||
| px = px.clamp(min=0, max=max_shape[1]) | |||
| py = py.clamp(min=0, max=max_shape[0]) | |||
| preds.append(px) | |||
| preds.append(py) | |||
| return torch.stack(preds, -1) | |||
| def kps2distance(points, kps, max_dis=None, eps=0.1): | |||
| """Decode bounding box based on distances. | |||
| Args: | |||
| points (Tensor): Shape (n, 2), [x, y]. | |||
| kps (Tensor): Shape (n, K), "xyxy" format | |||
| max_dis (float): Upper bound of the distance. | |||
| eps (float): a small value to ensure target < max_dis, instead <= | |||
| Returns: | |||
| Tensor: Decoded distances. | |||
| """ | |||
| preds = [] | |||
| for i in range(0, kps.shape[1], 2): | |||
| px = kps[:, i] - points[:, i % 2] | |||
| py = kps[:, i + 1] - points[:, i % 2 + 1] | |||
| if max_dis is not None: | |||
| px = px.clamp(min=0, max=max_dis - eps) | |||
| py = py.clamp(min=0, max=max_dis - eps) | |||
| preds.append(px) | |||
| preds.append(py) | |||
| return torch.stack(preds, -1) | |||
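A minimal round-trip sketch of the two keypoint helpers above (the import path is illustrative; with no clamping applied, encode/decode is exact):

import torch
from transforms import distance2kps, kps2distance  # illustrative import path

# two anchor points and 5 facial keypoints per point (K = 10 values per row)
points = torch.tensor([[32., 32.], [64., 64.]])
kps = torch.rand(2, 10) * 96
offsets = kps2distance(points, kps)      # (2, 10) per-point offsets
decoded = distance2kps(points, offsets)  # back to absolute coordinates
assert torch.allclose(decoded, kps, atol=1e-5)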
| @@ -0,0 +1,3 @@ | |||
| from .bbox_nms import multiclass_nms | |||
| __all__ = ['multiclass_nms'] | |||
| @@ -0,0 +1,85 @@ | |||
| """ | |||
| based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py | |||
| """ | |||
| import torch | |||
| def multiclass_nms(multi_bboxes, | |||
| multi_scores, | |||
| score_thr, | |||
| nms_cfg, | |||
| max_num=-1, | |||
| score_factors=None, | |||
| return_inds=False, | |||
| multi_kps=None): | |||
| """NMS for multi-class bboxes. | |||
| Args: | |||
| multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) | |||
| multi_scores (Tensor): shape (n, #class), where the last column | |||
| contains scores of the background class, but this will be ignored. | |||
| score_thr (float): bbox threshold, bboxes with scores lower than it | |||
| will not be considered. | |||
| nms_cfg (dict): NMS config passed to mmcv.ops.batched_nms. | |||
| max_num (int, optional): if there are more than max_num bboxes after | |||
| NMS, only top max_num will be kept. Default to -1. | |||
| score_factors (Tensor, optional): The factors multiplied to scores | |||
| before applying NMS. Default to None. | |||
| return_inds (bool, optional): Whether to return the indices of kept | |||
| bboxes. Default to False. | |||
| multi_kps (Tensor, optional): shape (n, #class*10) or (n, 10), | |||
| face keypoint coordinates. Default to None. | |||
| Returns: | |||
| tuple: (dets, labels, kps[, inds]); dets have shape (k, 5), labels | |||
| shape (k, ), kps shape (k, 10) or None. Labels are 0-based. | |||
| """ | |||
| num_classes = multi_scores.size(1) - 1 | |||
| # exclude background category | |||
| kps = None | |||
| if multi_bboxes.shape[1] > 4: | |||
| bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) | |||
| if multi_kps is not None: | |||
| kps = multi_kps.view(multi_scores.size(0), -1, 10) | |||
| else: | |||
| bboxes = multi_bboxes[:, None].expand( | |||
| multi_scores.size(0), num_classes, 4) | |||
| if multi_kps is not None: | |||
| kps = multi_kps[:, None].expand( | |||
| multi_scores.size(0), num_classes, 10) | |||
| scores = multi_scores[:, :-1] | |||
| if score_factors is not None: | |||
| scores = scores * score_factors[:, None] | |||
| labels = torch.arange(num_classes, dtype=torch.long) | |||
| labels = labels.view(1, -1).expand_as(scores) | |||
| bboxes = bboxes.reshape(-1, 4) | |||
| if kps is not None: | |||
| kps = kps.reshape(-1, 10) | |||
| scores = scores.reshape(-1) | |||
| labels = labels.reshape(-1) | |||
| # remove low scoring boxes | |||
| valid_mask = scores > score_thr | |||
| inds = valid_mask.nonzero(as_tuple=False).squeeze(1) | |||
| bboxes, scores, labels = bboxes[inds], scores[inds], labels[inds] | |||
| if kps is not None: | |||
| kps = kps[inds] | |||
| if inds.numel() == 0: | |||
| if torch.onnx.is_in_onnx_export(): | |||
| raise RuntimeError('[ONNX Error] Can not record NMS ' | |||
| 'as it has not been executed this time') | |||
| return bboxes, labels, kps | |||
| # TODO: add size check before feed into batched_nms | |||
| from mmcv.ops.nms import batched_nms | |||
| dets, keep = batched_nms(bboxes, scores, labels, nms_cfg) | |||
| if max_num > 0: | |||
| dets = dets[:max_num] | |||
| keep = keep[:max_num] | |||
| if return_inds: | |||
| # kps stays None when multi_kps was not provided | |||
| return dets, labels[keep], kps[keep] if kps is not None else None, keep | |||
| else: | |||
| return dets, labels[keep], kps[keep] if kps is not None else None | |||
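An illustrative call of multiclass_nms with keypoints, assuming mmcv with compiled NMS ops is available; shapes follow the docstring (one foreground class plus background) and all values are synthetic:

import torch
from bbox_nms import multiclass_nms  # illustrative import path

n = 100
xy = torch.rand(n, 2) * 180
wh = torch.rand(n, 2) * 20 + 1
multi_bboxes = torch.cat([xy, xy + wh], dim=1)   # (n, 4), shared by all classes
multi_scores = torch.rand(n, 2)                  # (n, 2): 1 fg class + background
multi_kps = torch.rand(n, 10) * 200              # (n, 10): 5 (x, y) keypoints
dets, labels, kps = multiclass_nms(
    multi_bboxes, multi_scores, score_thr=0.3,
    nms_cfg=dict(type='nms', iou_threshold=0.45),
    max_num=50, multi_kps=multi_kps)
# dets: (k, 5) boxes with scores, labels: (k,), kps: (k, 10)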
| @@ -0,0 +1,3 @@ | |||
| from .retinaface import RetinaFaceDataset | |||
| __all__ = ['RetinaFaceDataset'] | |||
| @@ -0,0 +1,3 @@ | |||
| from .transforms import RandomSquareCrop | |||
| __all__ = ['RandomSquareCrop'] | |||
| @@ -0,0 +1,188 @@ | |||
| """ | |||
| based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py | |||
| """ | |||
| import numpy as np | |||
| from mmdet.datasets.builder import PIPELINES | |||
| from numpy import random | |||
| @PIPELINES.register_module() | |||
| class RandomSquareCrop(object): | |||
| """Random crop the image & bboxes, the cropped patches have minimum IoU | |||
| requirement with original image & bboxes, the IoU threshold is randomly | |||
| selected from min_ious. | |||
| Args: | |||
| min_ious (tuple): minimum IoU threshold for all intersections with | |||
| bounding boxes | |||
| min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, | |||
| where a >= min_crop_size). | |||
| Note: | |||
| The keys for bboxes, labels and masks should be paired. That is, \ | |||
| `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ | |||
| `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. | |||
| """ | |||
| def __init__(self, | |||
| crop_ratio_range=None, | |||
| crop_choice=None, | |||
| bbox_clip_border=True): | |||
| self.crop_ratio_range = crop_ratio_range | |||
| self.crop_choice = crop_choice | |||
| self.bbox_clip_border = bbox_clip_border | |||
| assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) | |||
| if self.crop_ratio_range is not None: | |||
| self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range | |||
| self.bbox2label = { | |||
| 'gt_bboxes': 'gt_labels', | |||
| 'gt_bboxes_ignore': 'gt_labels_ignore' | |||
| } | |||
| self.bbox2mask = { | |||
| 'gt_bboxes': 'gt_masks', | |||
| 'gt_bboxes_ignore': 'gt_masks_ignore' | |||
| } | |||
| def __call__(self, results): | |||
| """Call function to crop images and bounding boxes with minimum IoU | |||
| constraint. | |||
| Args: | |||
| results (dict): Result dict from loading pipeline. | |||
| Returns: | |||
| dict: Result dict with images and bounding boxes cropped, \ | |||
| 'img_shape' key is updated. | |||
| """ | |||
| if 'img_fields' in results: | |||
| assert results['img_fields'] == ['img'], \ | |||
| 'Only single img_fields is allowed' | |||
| img = results['img'] | |||
| assert 'bbox_fields' in results | |||
| assert 'gt_bboxes' in results | |||
| boxes = results['gt_bboxes'] | |||
| h, w, c = img.shape | |||
| scale_retry = 0 | |||
| if self.crop_ratio_range is not None: | |||
| max_scale = self.crop_ratio_max | |||
| else: | |||
| max_scale = np.amax(self.crop_choice) | |||
| while True: | |||
| scale_retry += 1 | |||
| if scale_retry == 1 or max_scale > 1.0: | |||
| if self.crop_ratio_range is not None: | |||
| scale = np.random.uniform(self.crop_ratio_min, | |||
| self.crop_ratio_max) | |||
| elif self.crop_choice is not None: | |||
| scale = np.random.choice(self.crop_choice) | |||
| else: | |||
| scale = scale * 1.2 | |||
| for i in range(250): | |||
| short_side = min(w, h) | |||
| cw = int(scale * short_side) | |||
| ch = cw | |||
| # TODO +1 | |||
| if w == cw: | |||
| left = 0 | |||
| elif w > cw: | |||
| left = random.randint(0, w - cw) | |||
| else: | |||
| left = random.randint(w - cw, 0) | |||
| if h == ch: | |||
| top = 0 | |||
| elif h > ch: | |||
| top = random.randint(0, h - ch) | |||
| else: | |||
| top = random.randint(h - ch, 0) | |||
| patch = np.array( | |||
| (int(left), int(top), int(left + cw), int(top + ch)), | |||
| dtype=np.int64)  # np.int is deprecated in recent NumPy | |||
| # center of boxes should inside the crop img | |||
| # only adjust boxes and instance masks when the gt is not empty | |||
| # adjust boxes | |||
| def is_center_of_bboxes_in_patch(boxes, patch): | |||
| # TODO >= | |||
| center = (boxes[:, :2] + boxes[:, 2:]) / 2 | |||
| mask = \ | |||
| ((center[:, 0] > patch[0]) | |||
| * (center[:, 1] > patch[1]) | |||
| * (center[:, 0] < patch[2]) | |||
| * (center[:, 1] < patch[3])) | |||
| return mask | |||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||
| if not mask.any(): | |||
| continue | |||
| for key in results.get('bbox_fields', []): | |||
| boxes = results[key].copy() | |||
| mask = is_center_of_bboxes_in_patch(boxes, patch) | |||
| boxes = boxes[mask] | |||
| if self.bbox_clip_border: | |||
| boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) | |||
| boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) | |||
| boxes -= np.tile(patch[:2], 2) | |||
| results[key] = boxes | |||
| # labels | |||
| label_key = self.bbox2label.get(key) | |||
| if label_key in results: | |||
| results[label_key] = results[label_key][mask] | |||
| # keypoints field | |||
| if key == 'gt_bboxes': | |||
| for kps_key in results.get('keypoints_fields', []): | |||
| keypointss = results[kps_key].copy() | |||
| keypointss = keypointss[mask, :, :] | |||
| if self.bbox_clip_border: | |||
| keypointss[:, :, :2] = keypointss[:, :, :2].clip(max=patch[2:]) | |||
| keypointss[:, :, :2] = keypointss[:, :, :2].clip(min=patch[:2]) | |||
| keypointss[:, :, 0] -= patch[0] | |||
| keypointss[:, :, 1] -= patch[1] | |||
| results[kps_key] = keypointss | |||
| # mask fields | |||
| mask_key = self.bbox2mask.get(key) | |||
| if mask_key in results: | |||
| results[mask_key] = results[mask_key][mask.nonzero() | |||
| [0]].crop(patch) | |||
| # adjust the img no matter whether the gt is empty before crop | |||
| rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128 | |||
| patch_from = patch.copy() | |||
| patch_from[0] = max(0, patch_from[0]) | |||
| patch_from[1] = max(0, patch_from[1]) | |||
| patch_from[2] = min(img.shape[1], patch_from[2]) | |||
| patch_from[3] = min(img.shape[0], patch_from[3]) | |||
| patch_to = patch.copy() | |||
| patch_to[0] = max(0, patch_to[0] * -1) | |||
| patch_to[1] = max(0, patch_to[1] * -1) | |||
| patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) | |||
| patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) | |||
| rimg[patch_to[1]:patch_to[3], | |||
| patch_to[0]:patch_to[2], :] = img[ | |||
| patch_from[1]:patch_from[3], | |||
| patch_from[0]:patch_from[2], :] | |||
| img = rimg | |||
| results['img'] = img | |||
| results['img_shape'] = img.shape | |||
| return results | |||
| def __repr__(self): | |||
| repr_str = self.__class__.__name__ | |||
| repr_str += f'(crop_ratio_range={self.crop_ratio_range}, ' | |||
| repr_str += f'crop_choice={self.crop_choice}, ' | |||
| repr_str += f'bbox_clip_border={self.bbox_clip_border})' | |||
| return repr_str | |||
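For context, RandomSquareCrop is registered into mmdet's PIPELINES, so it is driven from a data pipeline config. A fragment of how such a config could look (transform order and ratio values are illustrative, not copied from the shipped SCRFD config):

train_pipeline = [
    dict(type='LoadImageFromFile'),
    # square crop whose side is a ratio of the short image side
    dict(type='RandomSquareCrop', crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0]),
    dict(type='Resize', img_scale=(640, 640), keep_ratio=False),
]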
| @@ -0,0 +1,151 @@ | |||
| """ | |||
| based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/retinaface.py | |||
| """ | |||
| import numpy as np | |||
| from mmdet.datasets.builder import DATASETS | |||
| from mmdet.datasets.custom import CustomDataset | |||
| @DATASETS.register_module() | |||
| class RetinaFaceDataset(CustomDataset): | |||
| CLASSES = ('FG', ) | |||
| def __init__(self, min_size=None, **kwargs): | |||
| self.NK = 5 | |||
| self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)} | |||
| self.min_size = min_size | |||
| self.gt_path = kwargs.get('gt_path') | |||
| super(RetinaFaceDataset, self).__init__(**kwargs) | |||
| def _parse_ann_line(self, line): | |||
| values = [float(x) for x in line.strip().split()] | |||
| bbox = np.array(values[0:4], dtype=np.float32) | |||
| kps = np.zeros((self.NK, 3), dtype=np.float32) | |||
| ignore = False | |||
| if self.min_size is not None: | |||
| assert not self.test_mode | |||
| w = bbox[2] - bbox[0] | |||
| h = bbox[3] - bbox[1] | |||
| if w < self.min_size or h < self.min_size: | |||
| ignore = True | |||
| if len(values) > 4: | |||
| if len(values) > 5: | |||
| kps = np.array( | |||
| values[4:19], dtype=np.float32).reshape((self.NK, 3)) | |||
| for li in range(kps.shape[0]): | |||
| if (kps[li, :] == -1).all(): | |||
| kps[li][2] = 0.0 # weight = 0, ignore | |||
| else: | |||
| assert kps[li][2] >= 0 | |||
| kps[li][2] = 1.0 # weight | |||
| else: # len(values)==5 | |||
| if not ignore: | |||
| ignore = (values[4] == 1) | |||
| else: | |||
| assert self.test_mode | |||
| return dict(bbox=bbox, kps=kps, ignore=ignore, cat='FG') | |||
| def load_annotations(self, ann_file): | |||
| """Load annotation from COCO style annotation file. | |||
| Args: | |||
| ann_file (str): Path of annotation file. | |||
| 20220711@tyx: ann_file is list of img paths is supported | |||
| Returns: | |||
| list[dict]: Annotation info from COCO api. | |||
| """ | |||
| if isinstance(ann_file, list): | |||
| data_infos = [] | |||
| for line in ann_file: | |||
| name = line | |||
| objs = [0, 0, 0, 0] | |||
| data_infos.append( | |||
| dict(filename=name, width=0, height=0, objs=objs)) | |||
| else: | |||
| name = None | |||
| bbox_map = {} | |||
| for line in open(ann_file, 'r'): | |||
| line = line.strip() | |||
| if line.startswith('#'): | |||
| value = line[1:].strip().split() | |||
| name = value[0] | |||
| width = int(value[1]) | |||
| height = int(value[2]) | |||
| bbox_map[name] = dict(width=width, height=height, objs=[]) | |||
| continue | |||
| assert name is not None | |||
| assert name in bbox_map | |||
| bbox_map[name]['objs'].append(line) | |||
| print('origin image size', len(bbox_map)) | |||
| data_infos = [] | |||
| for name in bbox_map: | |||
| item = bbox_map[name] | |||
| width = item['width'] | |||
| height = item['height'] | |||
| vals = item['objs'] | |||
| objs = [] | |||
| for line in vals: | |||
| data = self._parse_ann_line(line) | |||
| if data is None: | |||
| continue | |||
| objs.append(data) # data is (bbox, kps, cat) | |||
| if len(objs) == 0 and not self.test_mode: | |||
| continue | |||
| data_infos.append( | |||
| dict(filename=name, width=width, height=height, objs=objs)) | |||
| return data_infos | |||
| def get_ann_info(self, idx): | |||
| """Get COCO annotation by index. | |||
| Args: | |||
| idx (int): Index of data. | |||
| Returns: | |||
| dict: Annotation info of specified index. | |||
| """ | |||
| data_info = self.data_infos[idx] | |||
| bboxes = [] | |||
| keypointss = [] | |||
| labels = [] | |||
| bboxes_ignore = [] | |||
| labels_ignore = [] | |||
| for obj in data_info['objs']: | |||
| label = self.cat2label[obj['cat']] | |||
| bbox = obj['bbox'] | |||
| keypoints = obj['kps'] | |||
| ignore = obj['ignore'] | |||
| if ignore: | |||
| bboxes_ignore.append(bbox) | |||
| labels_ignore.append(label) | |||
| else: | |||
| bboxes.append(bbox) | |||
| labels.append(label) | |||
| keypointss.append(keypoints) | |||
| if not bboxes: | |||
| bboxes = np.zeros((0, 4)) | |||
| labels = np.zeros((0, )) | |||
| keypointss = np.zeros((0, self.NK, 3)) | |||
| else: | |||
| # bboxes = np.array(bboxes, ndmin=2) - 1 | |||
| bboxes = np.array(bboxes, ndmin=2) | |||
| labels = np.array(labels) | |||
| keypointss = np.array(keypointss, ndmin=3) | |||
| if not bboxes_ignore: | |||
| bboxes_ignore = np.zeros((0, 4)) | |||
| labels_ignore = np.zeros((0, )) | |||
| else: | |||
| bboxes_ignore = np.array(bboxes_ignore, ndmin=2) | |||
| labels_ignore = np.array(labels_ignore) | |||
| ann = dict( | |||
| bboxes=bboxes.astype(np.float32), | |||
| labels=labels.astype(np.int64), | |||
| keypointss=keypointss.astype(np.float32), | |||
| bboxes_ignore=bboxes_ignore.astype(np.float32), | |||
| labels_ignore=labels_ignore.astype(np.int64)) | |||
| return ann | |||
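As a reading aid, _parse_ann_line and load_annotations above consume a RetinaFace-style text label file. A sketch of the expected format, with a made-up file name and coordinates:

# '# <filename> <width> <height>' starts a new image; each following line is
# 'x1 y1 x2 y2' optionally followed by 5 keypoints as (x, y, flag) triplets.
# A point whose whole (x, y, flag) triplet is -1 is treated as unannotated
# (its weight is set to 0 when parsing).
sample_label = """\
# images/0001.jpg 1024 768
449.0 330.0 570.0 479.0 488.9 373.6 0.0 542.0 376.9 0.0 515.5 412.3 0.0 492.2 436.3 0.0 535.1 438.9 0.0
"""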
| @@ -0,0 +1,2 @@ | |||
| from .dense_heads import * # noqa: F401,F403 | |||
| from .detectors import * # noqa: F401,F403 | |||
| @@ -0,0 +1,3 @@ | |||
| from .resnet import ResNetV1e | |||
| __all__ = ['ResNetV1e'] | |||
| @@ -0,0 +1,412 @@ | |||
| """ | |||
| based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones/resnet.py | |||
| """ | |||
| import torch.nn as nn | |||
| import torch.utils.checkpoint as cp | |||
| from mmcv.cnn import (build_conv_layer, build_norm_layer, build_plugin_layer, | |||
| constant_init, kaiming_init) | |||
| from mmcv.runner import load_checkpoint | |||
| from mmdet.models.backbones.resnet import BasicBlock, Bottleneck | |||
| from mmdet.models.builder import BACKBONES | |||
| from mmdet.models.utils import ResLayer | |||
| from mmdet.utils import get_root_logger | |||
| from torch.nn.modules.batchnorm import _BatchNorm | |||
| class ResNet(nn.Module): | |||
| """ResNet backbone. | |||
| Args: | |||
| depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. | |||
| stem_channels (int | None): Number of stem channels. If not specified, | |||
| it will be the same as `base_channels`. Default: None. | |||
| base_channels (int): Number of base channels of res layer. Default: 64. | |||
| in_channels (int): Number of input image channels. Default: 3. | |||
| num_stages (int): Resnet stages. Default: 4. | |||
| strides (Sequence[int]): Strides of the first block of each stage. | |||
| dilations (Sequence[int]): Dilation of each stage. | |||
| out_indices (Sequence[int]): Output from which stages. | |||
| style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two | |||
| layer is the 3x3 conv layer, otherwise the stride-two layer is | |||
| the first 1x1 conv layer. | |||
| deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv | |||
| avg_down (bool): Use AvgPool instead of stride conv when | |||
| downsampling in the bottleneck. | |||
| frozen_stages (int): Stages to be frozen (stop grad and set eval mode). | |||
| -1 means not freezing any parameters. | |||
| norm_cfg (dict): Dictionary to construct and config norm layer. | |||
| norm_eval (bool): Whether to set norm layers to eval mode, namely, | |||
| freeze running stats (mean and var). Note: Effect on Batch Norm | |||
| and its variants only. | |||
| plugins (list[dict]): List of plugins for stages, each dict contains: | |||
| - cfg (dict, required): Cfg dict to build plugin. | |||
| - position (str, required): Position inside block to insert | |||
| plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'. | |||
| - stages (tuple[bool], optional): Stages to apply plugin, length | |||
| should be same as 'num_stages'. | |||
| with_cp (bool): Use checkpoint or not. Using checkpoint will save some | |||
| memory while slowing down the training speed. | |||
| zero_init_residual (bool): Whether to use zero init for last norm layer | |||
| in resblocks to let them behave as identity. | |||
| Example: | |||
| >>> from mmdet.models import ResNet | |||
| >>> import torch | |||
| >>> self = ResNet(depth=18) | |||
| >>> self.eval() | |||
| >>> inputs = torch.rand(1, 3, 32, 32) | |||
| >>> level_outputs = self.forward(inputs) | |||
| >>> for level_out in level_outputs: | |||
| ... print(tuple(level_out.shape)) | |||
| (1, 64, 8, 8) | |||
| (1, 128, 4, 4) | |||
| (1, 256, 2, 2) | |||
| (1, 512, 1, 1) | |||
| """ | |||
| arch_settings = { | |||
| 0: (BasicBlock, (2, 2, 2, 2)), | |||
| 18: (BasicBlock, (2, 2, 2, 2)), | |||
| 19: (BasicBlock, (2, 4, 4, 1)), | |||
| 20: (BasicBlock, (2, 3, 2, 2)), | |||
| 22: (BasicBlock, (2, 4, 3, 1)), | |||
| 24: (BasicBlock, (2, 4, 4, 1)), | |||
| 26: (BasicBlock, (2, 4, 4, 2)), | |||
| 28: (BasicBlock, (2, 5, 4, 2)), | |||
| 29: (BasicBlock, (2, 6, 3, 2)), | |||
| 30: (BasicBlock, (2, 5, 5, 2)), | |||
| 32: (BasicBlock, (2, 6, 5, 2)), | |||
| 34: (BasicBlock, (3, 4, 6, 3)), | |||
| 35: (BasicBlock, (3, 6, 4, 3)), | |||
| 38: (BasicBlock, (3, 8, 4, 3)), | |||
| 40: (BasicBlock, (3, 8, 5, 3)), | |||
| 50: (Bottleneck, (3, 4, 6, 3)), | |||
| 56: (Bottleneck, (3, 8, 4, 3)), | |||
| 68: (Bottleneck, (3, 10, 6, 3)), | |||
| 74: (Bottleneck, (3, 12, 6, 3)), | |||
| 101: (Bottleneck, (3, 4, 23, 3)), | |||
| 152: (Bottleneck, (3, 8, 36, 3)) | |||
| } | |||
| def __init__(self, | |||
| depth, | |||
| in_channels=3, | |||
| stem_channels=None, | |||
| base_channels=64, | |||
| num_stages=4, | |||
| block_cfg=None, | |||
| strides=(1, 2, 2, 2), | |||
| dilations=(1, 1, 1, 1), | |||
| out_indices=(0, 1, 2, 3), | |||
| style='pytorch', | |||
| deep_stem=False, | |||
| avg_down=False, | |||
| no_pool33=False, | |||
| frozen_stages=-1, | |||
| conv_cfg=None, | |||
| norm_cfg=dict(type='BN', requires_grad=True), | |||
| norm_eval=True, | |||
| dcn=None, | |||
| stage_with_dcn=(False, False, False, False), | |||
| plugins=None, | |||
| with_cp=False, | |||
| zero_init_residual=True): | |||
| super(ResNet, self).__init__() | |||
| if depth not in self.arch_settings: | |||
| raise KeyError(f'invalid depth {depth} for resnet') | |||
| self.depth = depth | |||
| if stem_channels is None: | |||
| stem_channels = base_channels | |||
| self.stem_channels = stem_channels | |||
| self.base_channels = base_channels | |||
| self.num_stages = num_stages | |||
| assert num_stages >= 1 and num_stages <= 4 | |||
| self.strides = strides | |||
| self.dilations = dilations | |||
| assert len(strides) == len(dilations) == num_stages | |||
| self.out_indices = out_indices | |||
| assert max(out_indices) < num_stages | |||
| self.style = style | |||
| self.deep_stem = deep_stem | |||
| self.avg_down = avg_down | |||
| self.no_pool33 = no_pool33 | |||
| self.frozen_stages = frozen_stages | |||
| self.conv_cfg = conv_cfg | |||
| self.norm_cfg = norm_cfg | |||
| self.with_cp = with_cp | |||
| self.norm_eval = norm_eval | |||
| self.dcn = dcn | |||
| self.stage_with_dcn = stage_with_dcn | |||
| if dcn is not None: | |||
| assert len(stage_with_dcn) == num_stages | |||
| self.plugins = plugins | |||
| self.zero_init_residual = zero_init_residual | |||
| if block_cfg is None: | |||
| self.block, stage_blocks = self.arch_settings[depth] | |||
| else: | |||
| self.block = BasicBlock if block_cfg[ | |||
| 'block'] == 'BasicBlock' else Bottleneck | |||
| stage_blocks = block_cfg['stage_blocks'] | |||
| assert len(stage_blocks) >= num_stages | |||
| self.stage_blocks = stage_blocks[:num_stages] | |||
| self.inplanes = stem_channels | |||
| self._make_stem_layer(in_channels, stem_channels) | |||
| if block_cfg is not None and 'stage_planes' in block_cfg: | |||
| stage_planes = block_cfg['stage_planes'] | |||
| else: | |||
| stage_planes = [base_channels * 2**i for i in range(num_stages)] | |||
| # print('resnet cfg:', stage_blocks, stage_planes) | |||
| self.res_layers = [] | |||
| for i, num_blocks in enumerate(self.stage_blocks): | |||
| stride = strides[i] | |||
| dilation = dilations[i] | |||
| dcn = self.dcn if self.stage_with_dcn[i] else None | |||
| if plugins is not None: | |||
| stage_plugins = self.make_stage_plugins(plugins, i) | |||
| else: | |||
| stage_plugins = None | |||
| planes = stage_planes[i] | |||
| res_layer = self.make_res_layer( | |||
| block=self.block, | |||
| inplanes=self.inplanes, | |||
| planes=planes, | |||
| num_blocks=num_blocks, | |||
| stride=stride, | |||
| dilation=dilation, | |||
| style=self.style, | |||
| avg_down=self.avg_down, | |||
| with_cp=with_cp, | |||
| conv_cfg=conv_cfg, | |||
| norm_cfg=norm_cfg, | |||
| dcn=dcn, | |||
| plugins=stage_plugins) | |||
| self.inplanes = planes * self.block.expansion | |||
| layer_name = f'layer{i + 1}' | |||
| self.add_module(layer_name, res_layer) | |||
| self.res_layers.append(layer_name) | |||
| self._freeze_stages() | |||
| self.feat_dim = self.block.expansion * base_channels * 2**( | |||
| len(self.stage_blocks) - 1) | |||
| def make_stage_plugins(self, plugins, stage_idx): | |||
| """Make plugins for ResNet ``stage_idx`` th stage. | |||
| Currently we support inserting ``context_block``, | |||
| ``empirical_attention_block``, ``nonlocal_block`` into the backbone | |||
| like ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of | |||
| Bottleneck. | |||
| An example of plugins format could be: | |||
| Examples: | |||
| >>> plugins=[ | |||
| ... dict(cfg=dict(type='xxx', arg1='xxx'), | |||
| ... stages=(False, True, True, True), | |||
| ... position='after_conv2'), | |||
| ... dict(cfg=dict(type='yyy'), | |||
| ... stages=(True, True, True, True), | |||
| ... position='after_conv3'), | |||
| ... dict(cfg=dict(type='zzz', postfix='1'), | |||
| ... stages=(True, True, True, True), | |||
| ... position='after_conv3'), | |||
| ... dict(cfg=dict(type='zzz', postfix='2'), | |||
| ... stages=(True, True, True, True), | |||
| ... position='after_conv3') | |||
| ... ] | |||
| >>> self = ResNet(depth=18) | |||
| >>> stage_plugins = self.make_stage_plugins(plugins, 0) | |||
| >>> assert len(stage_plugins) == 3 | |||
| Suppose ``stage_idx=0``, the structure of blocks in the stage would be: | |||
| .. code-block:: none | |||
| conv1-> conv2->conv3->yyy->zzz1->zzz2 | |||
| Suppose 'stage_idx=1', the structure of blocks in the stage would be: | |||
| .. code-block:: none | |||
| conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2 | |||
| If stages is missing, the plugin would be applied to all stages. | |||
| Args: | |||
| plugins (list[dict]): List of plugins cfg to build. The postfix is | |||
| required if multiple same type plugins are inserted. | |||
| stage_idx (int): Index of stage to build | |||
| Returns: | |||
| list[dict]: Plugins for current stage | |||
| """ | |||
| stage_plugins = [] | |||
| for plugin in plugins: | |||
| plugin = plugin.copy() | |||
| stages = plugin.pop('stages', None) | |||
| assert stages is None or len(stages) == self.num_stages | |||
| # whether to insert plugin into current stage | |||
| if stages is None or stages[stage_idx]: | |||
| stage_plugins.append(plugin) | |||
| return stage_plugins | |||
| def make_res_layer(self, **kwargs): | |||
| """Pack all blocks in a stage into a ``ResLayer``.""" | |||
| return ResLayer(**kwargs) | |||
| @property | |||
| def norm1(self): | |||
| """nn.Module: the normalization layer named "norm1" """ | |||
| return getattr(self, self.norm1_name) | |||
| def _make_stem_layer(self, in_channels, stem_channels): | |||
| if self.deep_stem: | |||
| self.stem = nn.Sequential( | |||
| build_conv_layer( | |||
| self.conv_cfg, | |||
| in_channels, | |||
| stem_channels // 2, | |||
| kernel_size=3, | |||
| stride=2, | |||
| padding=1, | |||
| bias=False), | |||
| build_norm_layer(self.norm_cfg, stem_channels // 2)[1], | |||
| nn.ReLU(inplace=True), | |||
| build_conv_layer( | |||
| self.conv_cfg, | |||
| stem_channels // 2, | |||
| stem_channels // 2, | |||
| kernel_size=3, | |||
| stride=1, | |||
| padding=1, | |||
| bias=False), | |||
| build_norm_layer(self.norm_cfg, stem_channels // 2)[1], | |||
| nn.ReLU(inplace=True), | |||
| build_conv_layer( | |||
| self.conv_cfg, | |||
| stem_channels // 2, | |||
| stem_channels, | |||
| kernel_size=3, | |||
| stride=1, | |||
| padding=1, | |||
| bias=False), | |||
| build_norm_layer(self.norm_cfg, stem_channels)[1], | |||
| nn.ReLU(inplace=True)) | |||
| else: | |||
| self.conv1 = build_conv_layer( | |||
| self.conv_cfg, | |||
| in_channels, | |||
| stem_channels, | |||
| kernel_size=7, | |||
| stride=2, | |||
| padding=3, | |||
| bias=False) | |||
| self.norm1_name, norm1 = build_norm_layer( | |||
| self.norm_cfg, stem_channels, postfix=1) | |||
| self.add_module(self.norm1_name, norm1) | |||
| self.relu = nn.ReLU(inplace=True) | |||
| if self.no_pool33: | |||
| assert self.deep_stem | |||
| self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) | |||
| else: | |||
| self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) | |||
| def _freeze_stages(self): | |||
| if self.frozen_stages >= 0: | |||
| if self.deep_stem: | |||
| self.stem.eval() | |||
| for param in self.stem.parameters(): | |||
| param.requires_grad = False | |||
| else: | |||
| self.norm1.eval() | |||
| for m in [self.conv1, self.norm1]: | |||
| for param in m.parameters(): | |||
| param.requires_grad = False | |||
| for i in range(1, self.frozen_stages + 1): | |||
| m = getattr(self, f'layer{i}') | |||
| m.eval() | |||
| for param in m.parameters(): | |||
| param.requires_grad = False | |||
| def init_weights(self, pretrained=None): | |||
| """Initialize the weights in backbone. | |||
| Args: | |||
| pretrained (str, optional): Path to pre-trained weights. | |||
| Defaults to None. | |||
| """ | |||
| if isinstance(pretrained, str): | |||
| logger = get_root_logger() | |||
| load_checkpoint(self, pretrained, strict=False, logger=logger) | |||
| elif pretrained is None: | |||
| for m in self.modules(): | |||
| if isinstance(m, nn.Conv2d): | |||
| kaiming_init(m) | |||
| elif isinstance(m, (_BatchNorm, nn.GroupNorm)): | |||
| constant_init(m, 1) | |||
| if self.dcn is not None: | |||
| for m in self.modules(): | |||
| if isinstance(m, Bottleneck) and hasattr( | |||
| m.conv2, 'conv_offset'): | |||
| constant_init(m.conv2.conv_offset, 0) | |||
| if self.zero_init_residual: | |||
| for m in self.modules(): | |||
| if isinstance(m, Bottleneck): | |||
| constant_init(m.norm3, 0) | |||
| elif isinstance(m, BasicBlock): | |||
| constant_init(m.norm2, 0) | |||
| else: | |||
| raise TypeError('pretrained must be a str or None') | |||
| def forward(self, x): | |||
| """Forward function.""" | |||
| if self.deep_stem: | |||
| x = self.stem(x) | |||
| else: | |||
| x = self.conv1(x) | |||
| x = self.norm1(x) | |||
| x = self.relu(x) | |||
| x = self.maxpool(x) | |||
| outs = [] | |||
| for i, layer_name in enumerate(self.res_layers): | |||
| res_layer = getattr(self, layer_name) | |||
| x = res_layer(x) | |||
| if i in self.out_indices: | |||
| outs.append(x) | |||
| return tuple(outs) | |||
| def train(self, mode=True): | |||
| """Convert the model into training mode while keep normalization layer | |||
| freezed.""" | |||
| super(ResNet, self).train(mode) | |||
| self._freeze_stages() | |||
| if mode and self.norm_eval: | |||
| for m in self.modules(): | |||
| # trick: eval() only has an effect on BatchNorm layers here | |||
| if isinstance(m, _BatchNorm): | |||
| m.eval() | |||
| @BACKBONES.register_module() | |||
| class ResNetV1e(ResNet): | |||
| r"""ResNetV1d variant described in `Bag of Tricks | |||
| <https://arxiv.org/pdf/1812.01187.pdf>`_. | |||
| Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in | |||
| the input stem with three 3x3 convs. And in the downsampling block, a 2x2 | |||
| avg_pool with stride 2 is added before conv, whose stride is changed to 1. | |||
| Compared with ResNetV1d, ResNetV1e changes the stem max pooling from | |||
| 3x3 with padding 1 to 2x2 with padding 0. | |||
| """ | |||
| def __init__(self, **kwargs): | |||
| super(ResNetV1e, self).__init__( | |||
| deep_stem=True, avg_down=True, no_pool33=True, **kwargs) | |||
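A smoke-test sketch for this backbone, assuming mmcv and mmdet are installed so the builder utilities imported at the top of the file resolve:

import torch

backbone = ResNetV1e(depth=50, out_indices=(0, 1, 2, 3),
                     norm_cfg=dict(type='BN', requires_grad=True))
backbone.init_weights()
backbone.eval()
feats = backbone(torch.rand(1, 3, 224, 224))
for f in feats:
    print(tuple(f.shape))  # strides 4/8/16/32 -> 56, 28, 14, 7 spatial sizes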
| @@ -0,0 +1,3 @@ | |||
| from .scrfd_head import SCRFDHead | |||
| __all__ = ['SCRFDHead'] | |||
| @@ -0,0 +1,3 @@ | |||
| from .scrfd import SCRFD | |||
| __all__ = ['SCRFD'] | |||
| @@ -0,0 +1,109 @@ | |||
| """ | |||
| based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors/scrfd.py | |||
| """ | |||
| import torch | |||
| from mmdet.models.builder import DETECTORS | |||
| from mmdet.models.detectors.single_stage import SingleStageDetector | |||
| from ....mmdet_patch.core.bbox import bbox2result | |||
| @DETECTORS.register_module() | |||
| class SCRFD(SingleStageDetector): | |||
| def __init__(self, | |||
| backbone, | |||
| neck, | |||
| bbox_head, | |||
| train_cfg=None, | |||
| test_cfg=None, | |||
| pretrained=None): | |||
| super(SCRFD, self).__init__(backbone, neck, bbox_head, train_cfg, | |||
| test_cfg, pretrained) | |||
| def forward_train(self, | |||
| img, | |||
| img_metas, | |||
| gt_bboxes, | |||
| gt_labels, | |||
| gt_keypointss=None, | |||
| gt_bboxes_ignore=None): | |||
| """ | |||
| Args: | |||
| img (Tensor): Input images of shape (N, C, H, W). | |||
| Typically these should be mean centered and std scaled. | |||
| img_metas (list[dict]): A List of image info dict where each dict | |||
| has: 'img_shape', 'scale_factor', 'flip', and may also contain | |||
| 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
| For details on the values of these keys see | |||
| :class:`mmdet.datasets.pipelines.Collect`. | |||
| gt_bboxes (list[Tensor]): Ground-truth boxes for each image, | |||
| in [tl_x, tl_y, br_x, br_y] format. | |||
| gt_labels (list[Tensor]): Class indices corresponding to each box | |||
| gt_keypointss (None | list[Tensor]): 5-point facial keypoints of | |||
| each box, with a per-point visibility weight. | |||
| gt_bboxes_ignore (None | list[Tensor]): Specify which bounding | |||
| boxes can be ignored when computing the loss. | |||
| Returns: | |||
| dict[str, Tensor]: A dictionary of loss components. | |||
| """ | |||
| super(SingleStageDetector, self).forward_train(img, img_metas) | |||
| x = self.extract_feat(img) | |||
| losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, | |||
| gt_labels, gt_keypointss, | |||
| gt_bboxes_ignore) | |||
| return losses | |||
| def simple_test(self, img, img_metas, rescale=False): | |||
| """Test function without test time augmentation. | |||
| Args: | |||
| img (torch.Tensor): Input image batch of shape (N, C, H, W). | |||
| img_metas (list[dict]): List of image information. | |||
| rescale (bool, optional): Whether to rescale the results. | |||
| Defaults to False. | |||
| Returns: | |||
| list[list[np.ndarray]]: BBox results of each image and classes. | |||
| The outer list corresponds to each image. The inner list | |||
| corresponds to each class. | |||
| """ | |||
| x = self.extract_feat(img) | |||
| outs = self.bbox_head(x) | |||
| if torch.onnx.is_in_onnx_export(): | |||
| print('single_stage.py in-onnx-export') | |||
| print(outs.__class__) | |||
| cls_score, bbox_pred, kps_pred = outs | |||
| for c in cls_score: | |||
| print(c.shape) | |||
| for c in bbox_pred: | |||
| print(c.shape) | |||
| if self.bbox_head.use_kps: | |||
| for c in kps_pred: | |||
| print(c.shape) | |||
| return (cls_score, bbox_pred, kps_pred) | |||
| else: | |||
| return (cls_score, bbox_pred) | |||
| bbox_list = self.bbox_head.get_bboxes( | |||
| *outs, img_metas, rescale=rescale) | |||
| # return kps if use_kps | |||
| if len(bbox_list[0]) == 2: | |||
| bbox_results = [ | |||
| bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) | |||
| for det_bboxes, det_labels in bbox_list | |||
| ] | |||
| elif len(bbox_list[0]) == 3: | |||
| bbox_results = [ | |||
| bbox2result( | |||
| det_bboxes, | |||
| det_labels, | |||
| self.bbox_head.num_classes, | |||
| kps=det_kps) | |||
| for det_bboxes, det_labels, det_kps in bbox_list | |||
| ] | |||
| return bbox_results | |||
| def feature_test(self, img): | |||
| x = self.extract_feat(img) | |||
| outs = self.bbox_head(x) | |||
| return outs | |||
| @@ -0,0 +1,50 @@ | |||
| import cv2 | |||
| import numpy as np | |||
| from skimage import transform as trans | |||
| def align_face(image, size, lmks): | |||
| dst_w = size[1] | |||
| dst_h = size[0] | |||
| # reference landmark positions in the destination image | |||
| base_w = 96 | |||
| base_h = 112 | |||
| assert (dst_w >= base_w) | |||
| assert (dst_h >= base_h) | |||
| base_lmk = [ | |||
| 30.2946, 51.6963, 65.5318, 51.5014, 48.0252, 71.7366, 33.5493, 92.3655, | |||
| 62.7299, 92.2041 | |||
| ] | |||
| dst_lmk = np.array(base_lmk).reshape((5, 2)).astype(np.float32) | |||
| if dst_w != base_w: | |||
| slide = (dst_w - base_w) / 2 | |||
| dst_lmk[:, 0] += slide | |||
| if dst_h != base_h: | |||
| slide = (dst_h - base_h) / 2 | |||
| dst_lmk[:, 1] += slide | |||
| src_lmk = lmks | |||
| # using skimage method | |||
| tform = trans.SimilarityTransform() | |||
| tform.estimate(src_lmk, dst_lmk) | |||
| t = tform.params[0:2, :] | |||
| assert (image.shape[2] == 3) | |||
| dst_image = cv2.warpAffine(image.copy(), t, (dst_w, dst_h)) | |||
| dst_pts = GetAffinePoints(src_lmk, t) | |||
| return dst_image, dst_pts | |||
| def GetAffinePoints(pts_in, trans): | |||
| pts_out = pts_in.copy() | |||
| assert (pts_in.shape[1] == 2) | |||
| for k in range(pts_in.shape[0]): | |||
| pts_out[k, 0] = pts_in[k, 0] * trans[0, 0] + pts_in[k, 1] * trans[ | |||
| 0, 1] + trans[0, 2] | |||
| pts_out[k, 1] = pts_in[k, 0] * trans[1, 0] + pts_in[k, 1] * trans[ | |||
| 1, 1] + trans[1, 2] | |||
| return pts_out | |||
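An illustrative call of align_face (requires opencv-python and scikit-image); the landmarks below are made-up 5-point coordinates in the usual left-eye, right-eye, nose, mouth-corners order:

import numpy as np

image = np.zeros((256, 256, 3), dtype=np.uint8)      # stand-in BGR image
lmks = np.array([[98.0, 112.0], [158.0, 110.0], [128.0, 150.0],
                 [105.0, 185.0], [152.0, 183.0]], dtype=np.float32)
aligned, warped_pts = align_face(image, (112, 112), lmks)
print(aligned.shape)     # (112, 112, 3) aligned crop
print(warped_pts.shape)  # (5, 2) landmarks mapped into the crop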
| @@ -0,0 +1,31 @@ | |||
| from .model_irse import (IR_18, IR_34, IR_50, IR_101, IR_152, IR_200, IR_SE_50, | |||
| IR_SE_101, IR_SE_152, IR_SE_200) | |||
| from .model_resnet import ResNet_50, ResNet_101, ResNet_152 | |||
| _model_dict = { | |||
| 'ResNet_50': ResNet_50, | |||
| 'ResNet_101': ResNet_101, | |||
| 'ResNet_152': ResNet_152, | |||
| 'IR_18': IR_18, | |||
| 'IR_34': IR_34, | |||
| 'IR_50': IR_50, | |||
| 'IR_101': IR_101, | |||
| 'IR_152': IR_152, | |||
| 'IR_200': IR_200, | |||
| 'IR_SE_50': IR_SE_50, | |||
| 'IR_SE_101': IR_SE_101, | |||
| 'IR_SE_152': IR_SE_152, | |||
| 'IR_SE_200': IR_SE_200 | |||
| } | |||
| def get_model(key): | |||
| """ Get different backbone network by key, | |||
| supports ResNet_50, ResNet_101, ResNet_152, | |||
| IR_18, IR_34, IR_50, IR_101, IR_152, IR_200, | |||
| IR_SE_50, IR_SE_101, IR_SE_152, IR_SE_200. | |||
| """ | |||
| if key in _model_dict.keys(): | |||
| return _model_dict[key] | |||
| else: | |||
| raise KeyError('not support model {}'.format(key)) | |||
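A small lookup sketch: keys not in _model_dict raise KeyError, and each entry is a constructor taking the input size.

backbone_cls = get_model('IR_101')   # constructor for the IR-101 backbone
net = backbone_cls([112, 112])       # input_size, see model_irse.py below
# get_model('IR_999') would raise KeyError('not support model IR_999')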
| @@ -0,0 +1,68 @@ | |||
| import torch | |||
| import torch.nn as nn | |||
| from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Linear, Module, ReLU, | |||
| Sigmoid) | |||
| def initialize_weights(modules): | |||
| """ Weight initilize, conv2d and linear is initialized with kaiming_normal | |||
| """ | |||
| for m in modules: | |||
| if isinstance(m, nn.Conv2d): | |||
| nn.init.kaiming_normal_( | |||
| m.weight, mode='fan_out', nonlinearity='relu') | |||
| if m.bias is not None: | |||
| m.bias.data.zero_() | |||
| elif isinstance(m, nn.BatchNorm2d): | |||
| m.weight.data.fill_(1) | |||
| m.bias.data.zero_() | |||
| elif isinstance(m, nn.Linear): | |||
| nn.init.kaiming_normal_( | |||
| m.weight, mode='fan_out', nonlinearity='relu') | |||
| if m.bias is not None: | |||
| m.bias.data.zero_() | |||
| class Flatten(Module): | |||
| """ Flat tensor | |||
| """ | |||
| def forward(self, input): | |||
| return input.view(input.size(0), -1) | |||
| class SEModule(Module): | |||
| """ SE block | |||
| """ | |||
| def __init__(self, channels, reduction): | |||
| super(SEModule, self).__init__() | |||
| self.avg_pool = nn.AdaptiveAvgPool2d(1) | |||
| self.fc1 = Conv2d( | |||
| channels, | |||
| channels // reduction, | |||
| kernel_size=1, | |||
| padding=0, | |||
| bias=False) | |||
| nn.init.xavier_uniform_(self.fc1.weight.data) | |||
| self.relu = ReLU(inplace=True) | |||
| self.fc2 = Conv2d( | |||
| channels // reduction, | |||
| channels, | |||
| kernel_size=1, | |||
| padding=0, | |||
| bias=False) | |||
| self.sigmoid = Sigmoid() | |||
| def forward(self, x): | |||
| module_input = x | |||
| x = self.avg_pool(x) | |||
| x = self.fc1(x) | |||
| x = self.relu(x) | |||
| x = self.fc2(x) | |||
| x = self.sigmoid(x) | |||
| return module_input * x | |||
| @@ -0,0 +1,279 @@ | |||
| # based on: | |||
| # https://github.com/ZhaoJ9014/face.evoLVe.PyTorch/blob/master/backbone/model_irse.py | |||
| from collections import namedtuple | |||
| from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, | |||
| MaxPool2d, Module, PReLU, Sequential) | |||
| from .common import Flatten, SEModule, initialize_weights | |||
| class BasicBlockIR(Module): | |||
| """ BasicBlock for IRNet | |||
| """ | |||
| def __init__(self, in_channel, depth, stride): | |||
| super(BasicBlockIR, self).__init__() | |||
| if in_channel == depth: | |||
| self.shortcut_layer = MaxPool2d(1, stride) | |||
| else: | |||
| self.shortcut_layer = Sequential( | |||
| Conv2d(in_channel, depth, (1, 1), stride, bias=False), | |||
| BatchNorm2d(depth)) | |||
| self.res_layer = Sequential( | |||
| BatchNorm2d(in_channel), | |||
| Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), | |||
| BatchNorm2d(depth), PReLU(depth), | |||
| Conv2d(depth, depth, (3, 3), stride, 1, bias=False), | |||
| BatchNorm2d(depth)) | |||
| def forward(self, x): | |||
| shortcut = self.shortcut_layer(x) | |||
| res = self.res_layer(x) | |||
| return res + shortcut | |||
| class BottleneckIR(Module): | |||
| """ BasicBlock with bottleneck for IRNet | |||
| """ | |||
| def __init__(self, in_channel, depth, stride): | |||
| super(BottleneckIR, self).__init__() | |||
| reduction_channel = depth // 4 | |||
| if in_channel == depth: | |||
| self.shortcut_layer = MaxPool2d(1, stride) | |||
| else: | |||
| self.shortcut_layer = Sequential( | |||
| Conv2d(in_channel, depth, (1, 1), stride, bias=False), | |||
| BatchNorm2d(depth)) | |||
| self.res_layer = Sequential( | |||
| BatchNorm2d(in_channel), | |||
| Conv2d( | |||
| in_channel, reduction_channel, (1, 1), (1, 1), 0, bias=False), | |||
| BatchNorm2d(reduction_channel), PReLU(reduction_channel), | |||
| Conv2d( | |||
| reduction_channel, | |||
| reduction_channel, (3, 3), (1, 1), | |||
| 1, | |||
| bias=False), BatchNorm2d(reduction_channel), | |||
| PReLU(reduction_channel), | |||
| Conv2d(reduction_channel, depth, (1, 1), stride, 0, bias=False), | |||
| BatchNorm2d(depth)) | |||
| def forward(self, x): | |||
| shortcut = self.shortcut_layer(x) | |||
| res = self.res_layer(x) | |||
| return res + shortcut | |||
| class BasicBlockIRSE(BasicBlockIR): | |||
| def __init__(self, in_channel, depth, stride): | |||
| super(BasicBlockIRSE, self).__init__(in_channel, depth, stride) | |||
| self.res_layer.add_module('se_block', SEModule(depth, 16)) | |||
| class BottleneckIRSE(BottleneckIR): | |||
| def __init__(self, in_channel, depth, stride): | |||
| super(BottleneckIRSE, self).__init__(in_channel, depth, stride) | |||
| self.res_layer.add_module('se_block', SEModule(depth, 16)) | |||
| class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): | |||
| '''A named tuple describing a ResNet block.''' | |||
| def get_block(in_channel, depth, num_units, stride=2): | |||
| return [Bottleneck(in_channel, depth, stride)] +\ | |||
| [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] | |||
| def get_blocks(num_layers): | |||
| if num_layers == 18: | |||
| blocks = [ | |||
| get_block(in_channel=64, depth=64, num_units=2), | |||
| get_block(in_channel=64, depth=128, num_units=2), | |||
| get_block(in_channel=128, depth=256, num_units=2), | |||
| get_block(in_channel=256, depth=512, num_units=2) | |||
| ] | |||
| elif num_layers == 34: | |||
| blocks = [ | |||
| get_block(in_channel=64, depth=64, num_units=3), | |||
| get_block(in_channel=64, depth=128, num_units=4), | |||
| get_block(in_channel=128, depth=256, num_units=6), | |||
| get_block(in_channel=256, depth=512, num_units=3) | |||
| ] | |||
| elif num_layers == 50: | |||
| blocks = [ | |||
| get_block(in_channel=64, depth=64, num_units=3), | |||
| get_block(in_channel=64, depth=128, num_units=4), | |||
| get_block(in_channel=128, depth=256, num_units=14), | |||
| get_block(in_channel=256, depth=512, num_units=3) | |||
| ] | |||
| elif num_layers == 100: | |||
| blocks = [ | |||
| get_block(in_channel=64, depth=64, num_units=3), | |||
| get_block(in_channel=64, depth=128, num_units=13), | |||
| get_block(in_channel=128, depth=256, num_units=30), | |||
| get_block(in_channel=256, depth=512, num_units=3) | |||
| ] | |||
| elif num_layers == 152: | |||
| blocks = [ | |||
| get_block(in_channel=64, depth=256, num_units=3), | |||
| get_block(in_channel=256, depth=512, num_units=8), | |||
| get_block(in_channel=512, depth=1024, num_units=36), | |||
| get_block(in_channel=1024, depth=2048, num_units=3) | |||
| ] | |||
| elif num_layers == 200: | |||
| blocks = [ | |||
| get_block(in_channel=64, depth=256, num_units=3), | |||
| get_block(in_channel=256, depth=512, num_units=24), | |||
| get_block(in_channel=512, depth=1024, num_units=36), | |||
| get_block(in_channel=1024, depth=2048, num_units=3) | |||
| ] | |||
| return blocks | |||
| class Backbone(Module): | |||
| def __init__(self, input_size, num_layers, mode='ir'): | |||
| """ Args: | |||
| input_size: input_size of backbone | |||
| num_layers: num_layers of backbone | |||
| mode: 'ir' or 'ir_se' | |||
| """ | |||
| super(Backbone, self).__init__() | |||
| assert input_size[0] in [112, 224], \ | |||
| 'input_size should be [112, 112] or [224, 224]' | |||
| assert num_layers in [18, 34, 50, 100, 152, 200], \ | |||
| 'num_layers should be 18, 34, 50, 100, 152 or 200' | |||
| assert mode in ['ir', 'ir_se'], \ | |||
| 'mode should be ir or ir_se' | |||
| self.input_layer = Sequential( | |||
| Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64), | |||
| PReLU(64)) | |||
| blocks = get_blocks(num_layers) | |||
| if num_layers <= 100: | |||
| if mode == 'ir': | |||
| unit_module = BasicBlockIR | |||
| elif mode == 'ir_se': | |||
| unit_module = BasicBlockIRSE | |||
| output_channel = 512 | |||
| else: | |||
| if mode == 'ir': | |||
| unit_module = BottleneckIR | |||
| elif mode == 'ir_se': | |||
| unit_module = BottleneckIRSE | |||
| output_channel = 2048 | |||
| if input_size[0] == 112: | |||
| self.output_layer = Sequential( | |||
| BatchNorm2d(output_channel), Dropout(0.4), Flatten(), | |||
| Linear(output_channel * 7 * 7, 512), | |||
| BatchNorm1d(512, affine=False)) | |||
| else: | |||
| self.output_layer = Sequential( | |||
| BatchNorm2d(output_channel), Dropout(0.4), Flatten(), | |||
| Linear(output_channel * 14 * 14, 512), | |||
| BatchNorm1d(512, affine=False)) | |||
| modules = [] | |||
| for block in blocks: | |||
| for bottleneck in block: | |||
| modules.append( | |||
| unit_module(bottleneck.in_channel, bottleneck.depth, | |||
| bottleneck.stride)) | |||
| self.body = Sequential(*modules) | |||
| initialize_weights(self.modules()) | |||
| def forward(self, x): | |||
| x = self.input_layer(x) | |||
| x = self.body(x) | |||
| x = self.output_layer(x) | |||
| return x | |||
| def IR_18(input_size): | |||
| """ Constructs a ir-18 model. | |||
| """ | |||
| model = Backbone(input_size, 18, 'ir') | |||
| return model | |||
| def IR_34(input_size): | |||
| """ Constructs a ir-34 model. | |||
| """ | |||
| model = Backbone(input_size, 34, 'ir') | |||
| return model | |||
| def IR_50(input_size): | |||
| """ Constructs a ir-50 model. | |||
| """ | |||
| model = Backbone(input_size, 50, 'ir') | |||
| return model | |||
| def IR_101(input_size): | |||
| """ Constructs a ir-101 model. | |||
| """ | |||
| model = Backbone(input_size, 100, 'ir') | |||
| return model | |||
| def IR_152(input_size): | |||
| """ Constructs a ir-152 model. | |||
| """ | |||
| model = Backbone(input_size, 152, 'ir') | |||
| return model | |||
| def IR_200(input_size): | |||
| """ Constructs a ir-200 model. | |||
| """ | |||
| model = Backbone(input_size, 200, 'ir') | |||
| return model | |||
| def IR_SE_50(input_size): | |||
| """ Constructs a ir_se-50 model. | |||
| """ | |||
| model = Backbone(input_size, 50, 'ir_se') | |||
| return model | |||
| def IR_SE_101(input_size): | |||
| """ Constructs a ir_se-101 model. | |||
| """ | |||
| model = Backbone(input_size, 100, 'ir_se') | |||
| return model | |||
| def IR_SE_152(input_size): | |||
| """ Constructs a ir_se-152 model. | |||
| """ | |||
| model = Backbone(input_size, 152, 'ir_se') | |||
| return model | |||
| def IR_SE_200(input_size): | |||
| """ Constructs a ir_se-200 model. | |||
| """ | |||
| model = Backbone(input_size, 200, 'ir_se') | |||
| return model | |||
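An end-to-end sketch for the IR backbones defined above: building IR_101 (num_layers=100 internally) and producing a 512-d embedding for a 112x112 face crop:

import torch

net = IR_101([112, 112])
net.eval()
with torch.no_grad():
    emb = net(torch.rand(1, 3, 112, 112))
print(emb.shape)  # torch.Size([1, 512]) face embedding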
| @@ -0,0 +1,162 @@ | |||
| # based on: | |||
| # https://github.com/ZhaoJ9014/face.evoLVe.PyTorch/blob/master/backbone/model_resnet.py | |||
| import torch.nn as nn | |||
| from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, | |||
| MaxPool2d, Module, ReLU, Sequential) | |||
| from .common import initialize_weights | |||
| def conv3x3(in_planes, out_planes, stride=1): | |||
| """ 3x3 convolution with padding | |||
| """ | |||
| return Conv2d( | |||
| in_planes, | |||
| out_planes, | |||
| kernel_size=3, | |||
| stride=stride, | |||
| padding=1, | |||
| bias=False) | |||
| def conv1x1(in_planes, out_planes, stride=1): | |||
| """ 1x1 convolution | |||
| """ | |||
| return Conv2d( | |||
| in_planes, out_planes, kernel_size=1, stride=stride, bias=False) | |||
| class Bottleneck(Module): | |||
| expansion = 4 | |||
| def __init__(self, inplanes, planes, stride=1, downsample=None): | |||
| super(Bottleneck, self).__init__() | |||
| self.conv1 = conv1x1(inplanes, planes) | |||
| self.bn1 = BatchNorm2d(planes) | |||
| self.conv2 = conv3x3(planes, planes, stride) | |||
| self.bn2 = BatchNorm2d(planes) | |||
| self.conv3 = conv1x1(planes, planes * self.expansion) | |||
| self.bn3 = BatchNorm2d(planes * self.expansion) | |||
| self.relu = ReLU(inplace=True) | |||
| self.downsample = downsample | |||
| self.stride = stride | |||
| def forward(self, x): | |||
| identity = x | |||
| out = self.conv1(x) | |||
| out = self.bn1(out) | |||
| out = self.relu(out) | |||
| out = self.conv2(out) | |||
| out = self.bn2(out) | |||
| out = self.relu(out) | |||
| out = self.conv3(out) | |||
| out = self.bn3(out) | |||
| if self.downsample is not None: | |||
| identity = self.downsample(x) | |||
| out += identity | |||
| out = self.relu(out) | |||
| return out | |||
| class ResNet(Module): | |||
| """ ResNet backbone | |||
| """ | |||
| def __init__(self, input_size, block, layers, zero_init_residual=True): | |||
| """ Args: | |||
| input_size: input_size of backbone | |||
| block: block function | |||
| layers: layers in each block | |||
| """ | |||
| super(ResNet, self).__init__() | |||
| assert input_size[0] in [112, 224],\ | |||
| 'input_size should be [112, 112] or [224, 224]' | |||
| self.inplanes = 64 | |||
| self.conv1 = Conv2d( | |||
| 3, 64, kernel_size=7, stride=2, padding=3, bias=False) | |||
| self.bn1 = BatchNorm2d(64) | |||
| self.relu = ReLU(inplace=True) | |||
| self.maxpool = MaxPool2d(kernel_size=3, stride=2, padding=1) | |||
| self.layer1 = self._make_layer(block, 64, layers[0]) | |||
| self.layer2 = self._make_layer(block, 128, layers[1], stride=2) | |||
| self.layer3 = self._make_layer(block, 256, layers[2], stride=2) | |||
| self.layer4 = self._make_layer(block, 512, layers[3], stride=2) | |||
| self.bn_o1 = BatchNorm2d(2048) | |||
| self.dropout = Dropout() | |||
| if input_size[0] == 112: | |||
| self.fc = Linear(2048 * 4 * 4, 512) | |||
| else: | |||
| self.fc = Linear(2048 * 7 * 7, 512) | |||
| self.bn_o2 = BatchNorm1d(512) | |||
| initialize_weights(self.modules())  # pass the module iterator, not the bound method | |||
| if zero_init_residual: | |||
| for m in self.modules(): | |||
| if isinstance(m, Bottleneck): | |||
| nn.init.constant_(m.bn3.weight, 0) | |||
| def _make_layer(self, block, planes, blocks, stride=1): | |||
| downsample = None | |||
| if stride != 1 or self.inplanes != planes * block.expansion: | |||
| downsample = Sequential( | |||
| conv1x1(self.inplanes, planes * block.expansion, stride), | |||
| BatchNorm2d(planes * block.expansion), | |||
| ) | |||
| layers = [] | |||
| layers.append(block(self.inplanes, planes, stride, downsample)) | |||
| self.inplanes = planes * block.expansion | |||
| for _ in range(1, blocks): | |||
| layers.append(block(self.inplanes, planes)) | |||
| return Sequential(*layers) | |||
| def forward(self, x): | |||
| x = self.conv1(x) | |||
| x = self.bn1(x) | |||
| x = self.relu(x) | |||
| x = self.maxpool(x) | |||
| x = self.layer1(x) | |||
| x = self.layer2(x) | |||
| x = self.layer3(x) | |||
| x = self.layer4(x) | |||
| x = self.bn_o1(x) | |||
| x = self.dropout(x) | |||
| x = x.view(x.size(0), -1) | |||
| x = self.fc(x) | |||
| x = self.bn_o2(x) | |||
| return x | |||
| def ResNet_50(input_size, **kwargs): | |||
| """ Constructs a ResNet-50 model. | |||
| """ | |||
| model = ResNet(input_size, Bottleneck, [3, 4, 6, 3], **kwargs) | |||
| return model | |||
| def ResNet_101(input_size, **kwargs): | |||
| """ Constructs a ResNet-101 model. | |||
| """ | |||
| model = ResNet(input_size, Bottleneck, [3, 4, 23, 3], **kwargs) | |||
| return model | |||
| def ResNet_152(input_size, **kwargs): | |||
| """ Constructs a ResNet-152 model. | |||
| """ | |||
| model = ResNet(input_size, Bottleneck, [3, 8, 36, 3], **kwargs) | |||
| return model | |||
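Reviewer note: a minimal shape check for the backbone above; the import path is an assumption about where this file lands in the PR, not something shown in the diff.

import torch
# Assumed module path for the ResNet definitions above.
from modelscope.models.cv.face_recognition.torchkit.backbone.model_resnet import ResNet_101

model = ResNet_101(input_size=[112, 112])
model.eval()
with torch.no_grad():
    dummy = torch.randn(1, 3, 112, 112)  # stand-in for an aligned 112x112 face crop
    emb = model(dummy)                   # flatten -> fc -> bn_o2 yields a 512-d embedding
assert emb.shape == (1, 512)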
| @@ -13,6 +13,7 @@ class OutputKeys(object): | |||
| POSES = 'poses' | |||
| CAPTION = 'caption' | |||
| BOXES = 'boxes' | |||
| KEYPOINTS = 'keypoints' | |||
| MASKS = 'masks' | |||
| TEXT = 'text' | |||
| POLYGONS = 'polygons' | |||
| @@ -55,6 +56,31 @@ TASK_OUTPUTS = { | |||
| Tasks.object_detection: | |||
| [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES], | |||
| # face detection result for single sample | |||
| # { | |||
| # "scores": [0.9, 0.1, 0.05, 0.05] | |||
| # "boxes": [ | |||
| # [x1, y1, x2, y2], | |||
| # [x1, y1, x2, y2], | |||
| # [x1, y1, x2, y2], | |||
| # [x1, y1, x2, y2], | |||
| # ], | |||
| # "keypoints": [ | |||
| # [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5], | |||
| # [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5], | |||
| # [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5], | |||
| # [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5], | |||
| # ], | |||
| # } | |||
| Tasks.face_detection: | |||
| [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS], | |||
| # face recognition result for single sample | |||
| # { | |||
| # "img_embedding": np.array with shape [1, D], | |||
| # } | |||
| Tasks.face_recognition: [OutputKeys.IMG_EMBEDDING], | |||
| # instance segmentation result for single sample | |||
| # { | |||
| # "scores": [0.9, 0.1, 0.05, 0.05], | |||
| @@ -255,7 +255,11 @@ class Pipeline(ABC): | |||
| elif isinstance(data, InputFeatures): | |||
| return data | |||
| else: | |||
| raise ValueError(f'Unsupported data type {type(data)}') | |||
| import mmcv | |||
| if isinstance(data, mmcv.parallel.data_container.DataContainer): | |||
| return data | |||
| else: | |||
| raise ValueError(f'Unsupported data type {type(data)}') | |||
| def _process_single(self, input: Input, *args, **kwargs) -> Dict[str, Any]: | |||
| preprocess_params = kwargs.get('preprocess_params') | |||
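Reviewer note: context for this change — the mmdet test-time transforms used by FaceDetectionPipeline wrap `img_metas` in an mmcv `DataContainer`, which the previous type dispatch rejected with ValueError. A minimal illustration of the object now being passed through (assuming mmcv is installed):

from mmcv.parallel import DataContainer

meta = DataContainer({'img_shape': (640, 640, 3)}, cpu_only=True)
# Objects like `meta` are now returned unchanged and unwrapped later in
# FaceDetectionPipeline.forward via input['img_metas'][0].data.
print(type(meta), meta.data)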
| @@ -80,6 +80,10 @@ DEFAULT_MODEL_FOR_PIPELINE = { | |||
| Tasks.text_to_image_synthesis: | |||
| (Pipelines.text_to_image_synthesis, | |||
| 'damo/cv_imagen_text-to-image-synthesis_tiny'), | |||
| Tasks.face_detection: (Pipelines.face_detection, | |||
| 'damo/cv_resnet_facedetection_scrfd10gkps'), | |||
| Tasks.face_recognition: (Pipelines.face_recognition, | |||
| 'damo/cv_ir101_facerecognition_cfglint'), | |||
| Tasks.video_multi_modal_embedding: | |||
| (Pipelines.video_multi_modal_embedding, | |||
| 'damo/multi_modal_clip_vtretrival_msrvtt_53'), | |||
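Reviewer note: with these defaults registered, the task name alone is enough to build the detector; a sketch assuming the hub models are reachable (face recognition still needs a detector instance for alignment, see FaceRecognitionPipeline below):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

face_detection = pipeline(Tasks.face_detection)  # resolves to damo/cv_resnet_facedetection_scrfd10gkps
face_recognition = pipeline(
    Tasks.face_recognition, face_detection=face_detection)  # resolves to damo/cv_ir101_facerecognition_cfglint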
| @@ -5,44 +5,50 @@ from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .action_recognition_pipeline import ActionRecognitionPipeline | |||
| from .animal_recog_pipeline import AnimalRecogPipeline | |||
| from .cmdssl_video_embedding_pipleline import CMDSSLVideoEmbeddingPipeline | |||
| from .live_category_pipeline import LiveCategoryPipeline | |||
| from .image_classification_pipeline import GeneralImageClassificationPipeline | |||
| from .animal_recognition_pipeline import AnimalRecognitionPipeline | |||
| from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline | |||
| from .face_detection_pipeline import FaceDetectionPipeline | |||
| from .face_recognition_pipeline import FaceRecognitionPipeline | |||
| from .face_image_generation_pipeline import FaceImageGenerationPipeline | |||
| from .image_cartoon_pipeline import ImageCartoonPipeline | |||
| from .image_classification_pipeline import GeneralImageClassificationPipeline | |||
| from .image_denoise_pipeline import ImageDenoisePipeline | |||
| from .image_color_enhance_pipeline import ImageColorEnhancePipeline | |||
| from .image_colorization_pipeline import ImageColorizationPipeline | |||
| from .image_instance_segmentation_pipeline import ImageInstanceSegmentationPipeline | |||
| from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline | |||
| from .video_category_pipeline import VideoCategoryPipeline | |||
| from .image_matting_pipeline import ImageMattingPipeline | |||
| from .image_super_resolution_pipeline import ImageSuperResolutionPipeline | |||
| from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline | |||
| from .style_transfer_pipeline import StyleTransferPipeline | |||
| from .live_category_pipeline import LiveCategoryPipeline | |||
| from .ocr_detection_pipeline import OCRDetectionPipeline | |||
| from .video_category_pipeline import VideoCategoryPipeline | |||
| from .virtual_tryon_pipeline import VirtualTryonPipeline | |||
| else: | |||
| _import_structure = { | |||
| 'action_recognition_pipeline': ['ActionRecognitionPipeline'], | |||
| 'animal_recog_pipeline': ['AnimalRecogPipeline'], | |||
| 'cmdssl_video_embedding_pipleline': ['CMDSSLVideoEmbeddingPipeline'], | |||
| 'animal_recognition_pipeline': ['AnimalRecognitionPipeline'], | |||
| 'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'], | |||
| 'face_detection_pipeline': ['FaceDetectionPipeline'], | |||
| 'face_image_generation_pipeline': ['FaceImageGenerationPipeline'], | |||
| 'face_recognition_pipeline': ['FaceRecognitionPipeline'], | |||
| 'image_classification_pipeline': | |||
| ['GeneralImageClassificationPipeline'], | |||
| 'image_cartoon_pipeline': ['ImageCartoonPipeline'], | |||
| 'image_denoise_pipeline': ['ImageDenoisePipeline'], | |||
| 'image_color_enhance_pipeline': ['ImageColorEnhancePipeline'], | |||
| 'virtual_tryon_pipeline': ['VirtualTryonPipeline'], | |||
| 'image_colorization_pipeline': ['ImageColorizationPipeline'], | |||
| 'image_super_resolution_pipeline': ['ImageSuperResolutionPipeline'], | |||
| 'image_denoise_pipeline': ['ImageDenoisePipeline'], | |||
| 'face_image_generation_pipeline': ['FaceImageGenerationPipeline'], | |||
| 'image_cartoon_pipeline': ['ImageCartoonPipeline'], | |||
| 'image_matting_pipeline': ['ImageMattingPipeline'], | |||
| 'style_transfer_pipeline': ['StyleTransferPipeline'], | |||
| 'ocr_detection_pipeline': ['OCRDetectionPipeline'], | |||
| 'image_instance_segmentation_pipeline': | |||
| ['ImageInstanceSegmentationPipeline'], | |||
| 'video_category_pipeline': ['VideoCategoryPipeline'], | |||
| 'image_matting_pipeline': ['ImageMattingPipeline'], | |||
| 'image_super_resolution_pipeline': ['ImageSuperResolutionPipeline'], | |||
| 'image_to_image_translation_pipeline': | |||
| ['Image2ImageTranslationPipeline'], | |||
| 'live_category_pipeline': ['LiveCategoryPipeline'], | |||
| 'ocr_detection_pipeline': ['OCRDetectionPipeline'], | |||
| 'style_transfer_pipeline': ['StyleTransferPipeline'], | |||
| 'video_category_pipeline': ['VideoCategoryPipeline'], | |||
| 'virtual_tryon_pipeline': ['VirtualTryonPipeline'], | |||
| } | |||
| import sys | |||
| @@ -23,7 +23,7 @@ class ActionRecognitionPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a kws pipeline for prediction | |||
| use `model` to create an action recognition pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -22,11 +22,11 @@ logger = get_logger() | |||
| @PIPELINES.register_module( | |||
| Tasks.image_classification, module_name=Pipelines.animal_recognation) | |||
| class AnimalRecogPipeline(Pipeline): | |||
| class AnimalRecognitionPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a kws pipeline for prediction | |||
| use `model` to create an animal recognition pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -24,7 +24,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a kws pipeline for prediction | |||
| use `model` to create a CMDSSL Video Embedding pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -0,0 +1,105 @@ | |||
| import os.path as osp | |||
| from typing import Any, Dict | |||
| import cv2 | |||
| import numpy as np | |||
| import PIL | |||
| import torch | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines.base import Input, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import LoadImage | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @PIPELINES.register_module( | |||
| Tasks.face_detection, module_name=Pipelines.face_detection) | |||
| class FaceDetectionPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` to create a face detection pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| super().__init__(model=model, **kwargs) | |||
| from mmcv import Config | |||
| from mmcv.parallel import MMDataParallel | |||
| from mmcv.runner import load_checkpoint | |||
| from mmdet.models import build_detector | |||
| from modelscope.models.cv.face_detection.mmdet_patch.datasets import RetinaFaceDataset | |||
| from modelscope.models.cv.face_detection.mmdet_patch.datasets.pipelines import RandomSquareCrop | |||
| from modelscope.models.cv.face_detection.mmdet_patch.models.backbones import ResNetV1e | |||
| from modelscope.models.cv.face_detection.mmdet_patch.models.dense_heads import SCRFDHead | |||
| from modelscope.models.cv.face_detection.mmdet_patch.models.detectors import SCRFD | |||
| cfg = Config.fromfile(osp.join(model, 'mmcv_scrfd_10g_bnkps.py')) | |||
| detector = build_detector( | |||
| cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) | |||
| ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_BIN_FILE) | |||
| logger.info(f'loading model from {ckpt_path}') | |||
| device = torch.device( | |||
| f'cuda:{0}' if torch.cuda.is_available() else 'cpu') | |||
| load_checkpoint(detector, ckpt_path, map_location=device) | |||
| detector = MMDataParallel(detector, device_ids=[0]) | |||
| detector.eval() | |||
| self.detector = detector | |||
| logger.info('load model done') | |||
| def preprocess(self, input: Input) -> Dict[str, Any]: | |||
| img = LoadImage.convert_to_ndarray(input) | |||
| img = img.astype(np.float32) | |||
| pre_pipeline = [ | |||
| dict( | |||
| type='MultiScaleFlipAug', | |||
| img_scale=(640, 640), | |||
| flip=False, | |||
| transforms=[ | |||
| dict(type='Resize', keep_ratio=True), | |||
| dict(type='RandomFlip', flip_ratio=0.0), | |||
| dict( | |||
| type='Normalize', | |||
| mean=[127.5, 127.5, 127.5], | |||
| std=[128.0, 128.0, 128.0], | |||
| to_rgb=False), | |||
| dict(type='Pad', size=(640, 640), pad_val=0), | |||
| dict(type='ImageToTensor', keys=['img']), | |||
| dict(type='Collect', keys=['img']) | |||
| ]) | |||
| ] | |||
| from mmdet.datasets.pipelines import Compose | |||
| pipeline = Compose(pre_pipeline) | |||
| result = {} | |||
| result['filename'] = '' | |||
| result['ori_filename'] = '' | |||
| result['img'] = img | |||
| result['img_shape'] = img.shape | |||
| result['ori_shape'] = img.shape | |||
| result['img_fields'] = ['img'] | |||
| result = pipeline(result) | |||
| return result | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| result = self.detector( | |||
| return_loss=False, | |||
| rescale=True, | |||
| img=[input['img'][0].unsqueeze(0)], | |||
| img_metas=[[dict(input['img_metas'][0].data)]]) | |||
| assert result is not None | |||
| result = result[0][0] | |||
| bboxes = result[:, :4].tolist() | |||
| kpss = result[:, 5:].tolist() | |||
| scores = result[:, 4].tolist() | |||
| return { | |||
| OutputKeys.SCORES: scores, | |||
| OutputKeys.BOXES: bboxes, | |||
| OutputKeys.KEYPOINTS: kpss | |||
| } | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
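Reviewer note: `forward` above assumes each detection row from the SCRFD detector is laid out as 4 box coordinates, 1 score, then 10 keypoint values, which is what the slicing relies on. A small sketch of that layout with made-up numbers:

import numpy as np

row = np.array([10., 20., 110., 140.,  # x1, y1, x2, y2
                0.92,                  # confidence score
                30., 50., 80., 50., 55., 80., 35., 110., 78., 110.])  # 5 landmarks
bbox, score, kps = row[:4], row[4], row[5:]
print(bbox.tolist(), float(score), kps.reshape(5, 2).tolist())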
| @@ -24,7 +24,7 @@ class FaceImageGenerationPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` to create a kws pipeline for prediction | |||
| use `model` to create a face image generation pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -0,0 +1,130 @@ | |||
| import os.path as osp | |||
| from typing import Any, Dict | |||
| import cv2 | |||
| import numpy as np | |||
| import PIL | |||
| import torch | |||
| from modelscope.metainfo import Pipelines | |||
| from modelscope.models.cv.face_recognition.align_face import align_face | |||
| from modelscope.models.cv.face_recognition.torchkit.backbone import get_model | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.pipelines.base import Input, Pipeline | |||
| from modelscope.pipelines.builder import PIPELINES | |||
| from modelscope.preprocessors import LoadImage | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @PIPELINES.register_module( | |||
| Tasks.face_recognition, module_name=Pipelines.face_recognition) | |||
| class FaceRecognitionPipeline(Pipeline): | |||
| def __init__(self, model: str, face_detection: Pipeline, **kwargs): | |||
| """ | |||
| use `model` to create a face recognition pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| face_detection: pipeline for face detection and face alignment before recognition | |||
| """ | |||
| # face recognition model | |||
| super().__init__(model=model, **kwargs) | |||
| device = torch.device( | |||
| f'cuda:{0}' if torch.cuda.is_available() else 'cpu') | |||
| self.device = device | |||
| face_model = get_model('IR_101')([112, 112]) | |||
| face_model.load_state_dict( | |||
| torch.load( | |||
| osp.join(model, ModelFile.TORCH_MODEL_BIN_FILE), | |||
| map_location=device)) | |||
| face_model = face_model.to(device) | |||
| face_model.eval() | |||
| self.face_model = face_model | |||
| logger.info('face recognition model loaded!') | |||
| # face detect pipeline | |||
| self.face_detection = face_detection | |||
| def _choose_face(self, | |||
| det_result, | |||
| min_face=10, | |||
| top_face=1, | |||
| center_face=False, | |||
| img_shape=None): | |||
| ''' | |||
| Choose the face with the maximum area. | |||
| Args: | |||
| det_result: output of the face detection pipeline | |||
| min_face: minimum valid face width/height in pixels | |||
| top_face: number of largest-area faces to keep | |||
| center_face: choose the face closest to the image center, only valid if top_face > 1 | |||
| img_shape: shape of the original image, used when center_face is True | |||
| ''' | |||
| bboxes = np.array(det_result[OutputKeys.BOXES]) | |||
| landmarks = np.array(det_result[OutputKeys.KEYPOINTS]) | |||
| # scores = np.array(det_result[OutputKeys.SCORES]) | |||
| if bboxes.shape[0] == 0: | |||
| logger.info('No face detected!') | |||
| return None | |||
| # face idx with enough size | |||
| face_idx = [] | |||
| for i in range(bboxes.shape[0]): | |||
| box = bboxes[i] | |||
| if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face: | |||
| face_idx += [i] | |||
| if len(face_idx) == 0: | |||
| logger.info( | |||
| f'Face size not enough, less than {min_face}x{min_face}!') | |||
| return None | |||
| bboxes = bboxes[face_idx] | |||
| landmarks = landmarks[face_idx] | |||
| # find max faces | |||
| boxes = np.array(bboxes) | |||
| area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) | |||
| sort_idx = np.argsort(area)[-top_face:] | |||
| # find center face | |||
| if top_face > 1 and center_face and bboxes.shape[0] > 1: | |||
| img_center = [img_shape[1] // 2, img_shape[0] // 2] | |||
| min_dist = float('inf') | |||
| sel_idx = -1 | |||
| for _idx in sort_idx: | |||
| box = boxes[_idx] | |||
| dist = np.square( | |||
| np.abs((box[0] + box[2]) / 2 - img_center[0])) + np.square( | |||
| np.abs((box[1] + box[3]) / 2 - img_center[1])) | |||
| if dist < min_dist: | |||
| min_dist = dist | |||
| sel_idx = _idx | |||
| sort_idx = [sel_idx] | |||
| main_idx = sort_idx[-1] | |||
| return bboxes[main_idx], landmarks[main_idx] | |||
| def preprocess(self, input: Input) -> Dict[str, Any]: | |||
| img = LoadImage.convert_to_ndarray(input) | |||
| img = img[:, :, ::-1] | |||
| det_result = self.face_detection(img.copy()) | |||
| rtn = self._choose_face(det_result, img_shape=img.shape) | |||
| face_img = None | |||
| if rtn is not None: | |||
| _, face_lmks = rtn | |||
| face_lmks = face_lmks.reshape(5, 2) | |||
| align_img, _ = align_face(img, (112, 112), face_lmks) | |||
| face_img = align_img[:, :, ::-1] # to rgb | |||
| face_img = np.transpose(face_img, axes=(2, 0, 1)) | |||
| face_img = (face_img / 255. - 0.5) / 0.5 | |||
| face_img = face_img.astype(np.float32) | |||
| result = {} | |||
| result['img'] = face_img | |||
| return result | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| assert input['img'] is not None | |||
| img = input['img'].unsqueeze(0) | |||
| emb = self.face_model(img).detach().cpu().numpy() | |||
| emb /= np.sqrt(np.sum(emb**2, -1, keepdims=True)) # l2 norm | |||
| return {OutputKeys.IMG_EMBEDDING: emb} | |||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
| return inputs | |||
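Reviewer note: since `forward` L2-normalizes the embedding, comparing two faces reduces to a dot product; a minimal sketch (the decision threshold below is an assumption, not part of this PR):

import numpy as np

def cos_sim(emb1: np.ndarray, emb2: np.ndarray) -> float:
    # emb1, emb2: [1, D] L2-normalized embeddings from FaceRecognitionPipeline
    return float(np.dot(emb1[0], emb2[0]))

# same_person = cos_sim(emb_a, emb_b) > 0.3  # hypothetical threshold, tune on held-out pairs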
| @@ -30,7 +30,7 @@ class ImageCartoonPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a kws pipeline for prediction | |||
| use `model` to create an image cartoon pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -27,7 +27,7 @@ class ImageColorEnhancePipeline(Pipeline): | |||
| ImageColorEnhanceFinetunePreprocessor] = None, | |||
| **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a kws pipeline for prediction | |||
| use `model` and `preprocessor` to create an image color enhance pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -25,7 +25,7 @@ class ImageColorizationPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` to create a kws pipeline for prediction | |||
| use `model` to create an image colorization pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -21,7 +21,7 @@ class ImageMattingPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a kws pipeline for prediction | |||
| use `model` to create an image matting pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -23,7 +23,7 @@ class ImageSuperResolutionPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` to create a kws pipeline for prediction | |||
| use `model` to create an image super-resolution pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -41,7 +41,7 @@ class OCRDetectionPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a kws pipeline for prediction | |||
| use `model` to create an OCR detection pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -21,7 +21,7 @@ class StyleTransferPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` and `preprocessor` to create a kws pipeline for prediction | |||
| use `model` to create a style transfer pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -25,7 +25,7 @@ class VirtualTryonPipeline(Pipeline): | |||
| def __init__(self, model: str, **kwargs): | |||
| """ | |||
| use `model` to create a kws pipeline for prediction | |||
| use `model` to create a virtual tryon pipeline for prediction | |||
| Args: | |||
| model: model id on modelscope hub. | |||
| """ | |||
| @@ -28,6 +28,8 @@ class CVTasks(object): | |||
| ocr_detection = 'ocr-detection' | |||
| action_recognition = 'action-recognition' | |||
| video_embedding = 'video-embedding' | |||
| face_detection = 'face-detection' | |||
| face_recognition = 'face-recognition' | |||
| image_color_enhance = 'image-color-enhance' | |||
| virtual_tryon = 'virtual-tryon' | |||
| image_colorization = 'image-colorization' | |||
| @@ -0,0 +1,84 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os.path as osp | |||
| import tempfile | |||
| import unittest | |||
| import cv2 | |||
| import numpy as np | |||
| from modelscope.fileio import File | |||
| from modelscope.msdatasets import MsDataset | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.test_utils import test_level | |||
| class FaceDetectionTest(unittest.TestCase): | |||
| def setUp(self) -> None: | |||
| self.model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' | |||
| def show_result(self, img_path, bboxes, kpss, scores): | |||
| bboxes = np.array(bboxes) | |||
| kpss = np.array(kpss) | |||
| scores = np.array(scores) | |||
| img = cv2.imread(img_path) | |||
| assert img is not None, f"Can't read img: {img_path}" | |||
| for i in range(len(scores)): | |||
| bbox = bboxes[i].astype(np.int32) | |||
| kps = kpss[i].reshape(-1, 2).astype(np.int32) | |||
| score = scores[i] | |||
| x1, y1, x2, y2 = bbox | |||
| cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2) | |||
| for kp in kps: | |||
| cv2.circle(img, tuple(kp), 1, (0, 0, 255), 1) | |||
| cv2.putText( | |||
| img, | |||
| f'{score:.2f}', (x1, y2), | |||
| 1, | |||
| 1.0, (0, 255, 0), | |||
| thickness=1, | |||
| lineType=8) | |||
| cv2.imwrite('result.png', img) | |||
| print( | |||
| f'Found {len(scores)} faces, output written to {osp.abspath("result.png")}' | |||
| ) | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_run_with_dataset(self): | |||
| input_location = ['data/test/images/face_detection.png'] | |||
| # alternatively: | |||
| # input_location = '/dir/to/images' | |||
| dataset = MsDataset.load(input_location, target='image') | |||
| face_detection = pipeline(Tasks.face_detection, model=self.model_id) | |||
| # note that when the input is a dataset, the pipeline output is a generator that must be iterated. | |||
| result = face_detection(dataset) | |||
| result = next(result) | |||
| self.show_result(input_location[0], result[OutputKeys.BOXES], | |||
| result[OutputKeys.KEYPOINTS], | |||
| result[OutputKeys.SCORES]) | |||
| @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') | |||
| def test_run_modelhub(self): | |||
| face_detection = pipeline(Tasks.face_detection, model=self.model_id) | |||
| img_path = 'data/test/images/face_detection.png' | |||
| result = face_detection(img_path) | |||
| self.show_result(img_path, result[OutputKeys.BOXES], | |||
| result[OutputKeys.KEYPOINTS], | |||
| result[OutputKeys.SCORES]) | |||
| @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') | |||
| def test_run_modelhub_default_model(self): | |||
| face_detection = pipeline(Tasks.face_detection) | |||
| img_path = 'data/test/images/face_detection.png' | |||
| result = face_detection(img_path) | |||
| self.show_result(img_path, result[OutputKeys.BOXES], | |||
| result[OutputKeys.KEYPOINTS], | |||
| result[OutputKeys.SCORES]) | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -0,0 +1,42 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os.path as osp | |||
| import tempfile | |||
| import unittest | |||
| import cv2 | |||
| import numpy as np | |||
| from modelscope.fileio import File | |||
| from modelscope.msdatasets import MsDataset | |||
| from modelscope.outputs import OutputKeys | |||
| from modelscope.pipelines import pipeline | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.test_utils import test_level | |||
| class FaceRecognitionTest(unittest.TestCase): | |||
| def setUp(self) -> None: | |||
| self.recog_model_id = 'damo/cv_ir101_facerecognition_cfglint' | |||
| self.det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' | |||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||
| def test_face_compare(self): | |||
| img1 = 'data/test/images/face_recognition_1.png' | |||
| img2 = 'data/test/images/face_recognition_2.png' | |||
| face_detection = pipeline( | |||
| Tasks.face_detection, model=self.det_model_id) | |||
| face_recognition = pipeline( | |||
| Tasks.face_recognition, | |||
| face_detection=face_detection, | |||
| model=self.recog_model_id) | |||
| emb1 = face_recognition(img1)[OutputKeys.IMG_EMBEDDING] | |||
| emb2 = face_recognition(img2)[OutputKeys.IMG_EMBEDDING] | |||
| sim = np.dot(emb1[0], emb2[0]) | |||
| print(f'Cos similarity={sim:.3f}, img1:{img1} img2:{img2}') | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||