# Conflicts:
#	modelscope/preprocessors/multi_modal.py
#	modelscope/trainers/trainer.py
#	tests/pipelines/test_ofa_tasks.py
@@ -36,6 +36,7 @@ do | |||
-e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \ | |||
-e TEST_LEVEL=$TEST_LEVEL \ | |||
-e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \ | |||
-e MODEL_TAG_URL=$MODEL_TAG_URL \ | |||
--workdir=$CODE_DIR_IN_CONTAINER \ | |||
--net host \ | |||
${IMAGE_NAME}:${IMAGE_VERSION} \ | |||
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9 | |||
size 87228 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9 | |||
size 87228 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:1f24abbba43782d733dedbb0b4f416635af50263862e5632963ac9263e430555 | |||
size 88542 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9 | |||
size 87228 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:b158f6029d9763d7f84042f7c5835f398c688fdbb6b3f4fe6431101d4118c66c | |||
size 2766 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:0dcf46b93077e2229ab69cd6ddb80e2689546c575ee538bb2033fee1124ef3e3 | |||
size 2761 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:9c9870df5a86acaaec67063183dace795479cd0f05296f13058995f475149c56 | |||
size 2957783 |
@@ -75,7 +75,9 @@ RUN pip install --no-cache-dir --upgrade pip && \ | |||
ENV SHELL=/bin/bash | |||
# install special package | |||
RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 numpy==1.18.5 datasets==2.1.0 | |||
RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 ipykernel && \ | |||
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \ | |||
pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn | |||
RUN if [ "$USE_GPU" = "True" ] ; then \ | |||
pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \ | |||
@@ -0,0 +1,4 @@ | |||
from .base import Exporter | |||
from .builder import build_exporter | |||
from .nlp import SbertForSequenceClassificationExporter | |||
from .torch_model_exporter import TorchModelExporter |
@@ -0,0 +1,53 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
from abc import ABC, abstractmethod | |||
from modelscope.models import Model | |||
from modelscope.utils.config import Config, ConfigDict | |||
from modelscope.utils.constant import ModelFile | |||
from .builder import build_exporter | |||
class Exporter(ABC): | |||
"""Exporter base class to output model to onnx, torch_script, graphdef, etc. | |||
""" | |||
def __init__(self): | |||
self.model = None | |||
@classmethod | |||
def from_model(cls, model: Model, **kwargs): | |||
"""Build the Exporter instance. | |||
@param model: A model instance. It will be used to output the generated files,
and the configuration.json in its model_dir field will be used to create the exporter instance. | |||
@param kwargs: Extra kwargs used to create the Exporter instance. | |||
@return: The Exporter instance | |||
""" | |||
cfg = Config.from_file( | |||
os.path.join(model.model_dir, ModelFile.CONFIGURATION)) | |||
task_name = cfg.task | |||
model_cfg = cfg.model | |||
if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): | |||
model_cfg.type = model_cfg.model_type | |||
export_cfg = ConfigDict({'type': model_cfg.type}) | |||
if hasattr(cfg, 'export'): | |||
export_cfg.update(cfg.export) | |||
exporter = build_exporter(export_cfg, task_name, kwargs) | |||
exporter.model = model | |||
return exporter | |||
@abstractmethod | |||
def export_onnx(self, outputs: str, opset=11, **kwargs): | |||
"""Export the model as onnx format files. | |||
In some cases several files may be generated,
so implementations should return a dict mapping each generated file name to its file path.
@param opset: The version of the ONNX operator set to use. | |||
@param outputs: The output dir. | |||
@param kwargs: In this default implementation, | |||
kwargs will be carried to generate_dummy_inputs as extra arguments (like input shape). | |||
@return: A dict mapping the model name to the model file path.
""" | |||
pass |
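As a rough usage sketch of the workflow this base class defines (the model id and output directory below are placeholder assumptions; the concrete exporter is resolved from the model's configuration.json):

```python
import os

from modelscope.models import Model
from modelscope.exporters import Exporter

# placeholder model id; any model whose configuration registers an exporter works
model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')
exporter = Exporter.from_model(model)

output_dir = '/tmp/onnx_export'
os.makedirs(output_dir, exist_ok=True)
# export_onnx returns a dict mapping generated file names to file paths
files = exporter.export_onnx(outputs=output_dir, opset=11)
print(files)
```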
@@ -0,0 +1,21 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from modelscope.utils.config import ConfigDict | |||
from modelscope.utils.registry import Registry, build_from_cfg | |||
EXPORTERS = Registry('exporters') | |||
def build_exporter(cfg: ConfigDict, | |||
task_name: str = None, | |||
default_args: dict = None): | |||
""" build exporter by the given model config dict | |||
Args: | |||
cfg (:obj:`ConfigDict`): config dict for exporter object. | |||
task_name (str, optional): task name, refer to | |||
:obj:`Tasks` for more details | |||
default_args (dict, optional): Default initialization arguments. | |||
""" | |||
return build_from_cfg( | |||
cfg, EXPORTERS, group_key=task_name, default_args=default_args) |
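A hypothetical registration sketch showing how the EXPORTERS registry and build_exporter fit together; the task and module names are illustrative, not part of this change:

```python
from modelscope.exporters.builder import EXPORTERS, build_exporter
from modelscope.exporters.torch_model_exporter import TorchModelExporter
from modelscope.utils.config import ConfigDict

# illustrative group key and module name only
@EXPORTERS.register_module('my-task', module_name='my-model')
class MyModelExporter(TorchModelExporter):
    pass

# mirrors what Exporter.from_model does with the model's configuration
cfg = ConfigDict({'type': 'my-model'})
exporter = build_exporter(cfg, task_name='my-task')
```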
@@ -0,0 +1,2 @@ | |||
from .sbert_for_sequence_classification_exporter import \ | |||
SbertForSequenceClassificationExporter |
@@ -0,0 +1,81 @@ | |||
import os | |||
from collections import OrderedDict | |||
from typing import Any, Dict, Mapping, Tuple | |||
from torch.utils.data.dataloader import default_collate | |||
from modelscope.exporters.builder import EXPORTERS | |||
from modelscope.exporters.torch_model_exporter import TorchModelExporter | |||
from modelscope.metainfo import Models | |||
from modelscope.preprocessors import Preprocessor, build_preprocessor | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import ModeKeys, Tasks | |||
@EXPORTERS.register_module( | |||
Tasks.sentence_similarity, module_name=Models.structbert) | |||
@EXPORTERS.register_module( | |||
Tasks.sentiment_classification, module_name=Models.structbert) | |||
@EXPORTERS.register_module(Tasks.nli, module_name=Models.structbert) | |||
@EXPORTERS.register_module( | |||
Tasks.zero_shot_classification, module_name=Models.structbert) | |||
class SbertForSequenceClassificationExporter(TorchModelExporter): | |||
def generate_dummy_inputs(self, | |||
shape: Tuple = None, | |||
**kwargs) -> Dict[str, Any]: | |||
"""Generate dummy inputs for model exportation to onnx or other formats by tracing. | |||
@param shape: A tuple of input shape which should have at most two dimensions. | |||
shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor. | |||
shape = (8, 128) batch_size=1, sequence_length=128, which will cover the config of the preprocessor. | |||
@return: Dummy inputs. | |||
""" | |||
cfg = Config.from_file( | |||
os.path.join(self.model.model_dir, 'configuration.json')) | |||
field_name = Tasks.find_field_by_task(cfg.task) | |||
if 'type' not in cfg.preprocessor and 'val' in cfg.preprocessor: | |||
cfg = cfg.preprocessor.val | |||
else: | |||
cfg = cfg.preprocessor | |||
batch_size = 1 | |||
sequence_length = {} | |||
if shape is not None: | |||
if len(shape) == 1: | |||
batch_size = shape[0] | |||
elif len(shape) == 2: | |||
batch_size, max_length = shape | |||
sequence_length = {'sequence_length': max_length} | |||
cfg.update({ | |||
'model_dir': self.model.model_dir, | |||
'mode': ModeKeys.TRAIN, | |||
**sequence_length | |||
}) | |||
preprocessor: Preprocessor = build_preprocessor(cfg, field_name) | |||
if preprocessor.pair: | |||
first_sequence = preprocessor.tokenizer.unk_token | |||
second_sequence = preprocessor.tokenizer.unk_token | |||
else: | |||
first_sequence = preprocessor.tokenizer.unk_token | |||
second_sequence = None | |||
batched = [] | |||
for _ in range(batch_size): | |||
batched.append(preprocessor((first_sequence, second_sequence))) | |||
return default_collate(batched) | |||
@property | |||
def inputs(self) -> Mapping[str, Mapping[int, str]]: | |||
dynamic_axis = {0: 'batch', 1: 'sequence'} | |||
return OrderedDict([ | |||
('input_ids', dynamic_axis), | |||
('attention_mask', dynamic_axis), | |||
('token_type_ids', dynamic_axis), | |||
]) | |||
@property | |||
def outputs(self) -> Mapping[str, Mapping[int, str]]: | |||
return OrderedDict({'logits': {0: 'batch'}}) |
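A hedged sketch of how the shape argument flows through generate_dummy_inputs (the model id is a placeholder; the returned tensors are what would be fed to tracing):

```python
from modelscope.models import Model
from modelscope.exporters import Exporter

# placeholder model id for a structbert sequence classification model
model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')
exporter = Exporter.from_model(model)

# batch_size=8, sequence_length=128 overrides the preprocessor config
dummy = exporter.generate_dummy_inputs(shape=(8, 128))
print({name: tensor.shape for name, tensor in dummy.items()})
# expected keys match the `inputs` property: input_ids, attention_mask, token_type_ids
```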
@@ -0,0 +1,247 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
from contextlib import contextmanager | |||
from itertools import chain | |||
from typing import Any, Dict, Mapping | |||
import torch | |||
from torch import nn | |||
from torch.onnx import export as onnx_export | |||
from torch.onnx.utils import _decide_input_format | |||
from modelscope.models import TorchModel | |||
from modelscope.pipelines.base import collate_fn | |||
from modelscope.utils.constant import ModelFile | |||
from modelscope.utils.logger import get_logger | |||
from modelscope.utils.regress_test_utils import compare_arguments_nested | |||
from modelscope.utils.tensor_utils import torch_nested_numpify | |||
from .base import Exporter | |||
logger = get_logger(__name__) | |||
class TorchModelExporter(Exporter): | |||
"""The torch base class of exporter. | |||
This class provides the default implementations for exporting onnx and torch script. | |||
Each specific model may implement its own exporter by overriding the export_onnx/export_torch_script, | |||
and to provide implementations for generate_dummy_inputs/inputs/outputs methods. | |||
""" | |||
def export_onnx(self, outputs: str, opset=11, **kwargs): | |||
"""Export the model as onnx format files. | |||
In some cases several files may be generated,
so implementations should return a dict mapping each generated file name to its file path.
@param opset: The version of the ONNX operator set to use. | |||
@param outputs: The output dir. | |||
@param kwargs: In this default implementation, | |||
you can pass the arguments needed by _torch_export_onnx, other unrecognized args | |||
will be carried to generate_dummy_inputs as extra arguments (such as input shape). | |||
@return: A dict containing the model key - model file path pairs. | |||
""" | |||
model = self.model | |||
if not isinstance(model, nn.Module) and hasattr(model, 'model'): | |||
model = model.model | |||
onnx_file = os.path.join(outputs, ModelFile.ONNX_MODEL_FILE) | |||
self._torch_export_onnx(model, onnx_file, opset=opset, **kwargs) | |||
return {'model': onnx_file} | |||
def export_torch_script(self, outputs: str, **kwargs): | |||
"""Export the model as torch script files. | |||
In some cases several files may be generated,
so implementations should return a dict mapping each generated file name to its file path.
@param outputs: The output dir. | |||
@param kwargs: In this default implementation, | |||
you can pass the arguments needed by _torch_export_torch_script, other unrecognized args | |||
will be carried to generate_dummy_inputs as extra arguments (like input shape). | |||
@return: A dict mapping the model name to the model file path.
""" | |||
model = self.model | |||
if not isinstance(model, nn.Module) and hasattr(model, 'model'): | |||
model = model.model | |||
ts_file = os.path.join(outputs, ModelFile.TS_MODEL_FILE) | |||
# generate ts by tracing | |||
self._torch_export_torch_script(model, ts_file, **kwargs) | |||
return {'model': ts_file} | |||
def generate_dummy_inputs(self, **kwargs) -> Dict[str, Any]: | |||
"""Generate dummy inputs for model exportation to onnx or other formats by tracing. | |||
@return: Dummy inputs. | |||
""" | |||
return None | |||
@property | |||
def inputs(self) -> Mapping[str, Mapping[int, str]]: | |||
"""Return an ordered dict contains the model's input arguments name with their dynamic axis. | |||
About the information of dynamic axis please check the dynamic_axes argument of torch.onnx.export function | |||
""" | |||
return None | |||
@property | |||
def outputs(self) -> Mapping[str, Mapping[int, str]]: | |||
"""Return an ordered dict contains the model's output arguments name with their dynamic axis. | |||
About the information of dynamic axis please check the dynamic_axes argument of torch.onnx.export function | |||
""" | |||
return None | |||
def _torch_export_onnx(self, | |||
model: nn.Module, | |||
output: str, | |||
opset: int = 11, | |||
device: str = 'cpu', | |||
validation: bool = True, | |||
rtol: float = None, | |||
atol: float = None, | |||
**kwargs): | |||
"""Export the model to an onnx format file. | |||
@param model: A torch.nn.Module instance to export. | |||
@param output: The output file. | |||
@param opset: The version of the ONNX operator set to use. | |||
@param device: The device used to forward. | |||
@param validation: Whether to validate the exported file.
@param rtol: The relative tolerance used when comparing the outputs.
@param atol: The absolute tolerance used when comparing the outputs.
""" | |||
dummy_inputs = self.generate_dummy_inputs(**kwargs) | |||
inputs = self.inputs | |||
outputs = self.outputs | |||
if dummy_inputs is None or inputs is None or outputs is None: | |||
raise NotImplementedError( | |||
'Model properties dummy_inputs, inputs and outputs must be set.')
with torch.no_grad(): | |||
model.eval() | |||
device = torch.device(device) | |||
model.to(device) | |||
dummy_inputs = collate_fn(dummy_inputs, device) | |||
if isinstance(dummy_inputs, Mapping): | |||
dummy_inputs = dict(dummy_inputs) | |||
onnx_outputs = list(self.outputs.keys()) | |||
with replace_call(): | |||
onnx_export( | |||
model, | |||
(dummy_inputs, ), | |||
f=output, | |||
input_names=list(inputs.keys()), | |||
output_names=onnx_outputs, | |||
dynamic_axes={ | |||
name: axes | |||
for name, axes in chain(inputs.items(), | |||
outputs.items()) | |||
}, | |||
do_constant_folding=True, | |||
opset_version=opset, | |||
) | |||
if validation: | |||
try: | |||
import onnx | |||
import onnxruntime as ort | |||
except ImportError: | |||
logger.warning(
'Cannot validate the exported onnx file, because '
'onnx or onnxruntime is not installed')
return | |||
onnx_model = onnx.load(output) | |||
onnx.checker.check_model(onnx_model) | |||
ort_session = ort.InferenceSession(output) | |||
with torch.no_grad(): | |||
model.eval() | |||
outputs_origin = model.forward( | |||
*_decide_input_format(model, dummy_inputs)) | |||
if isinstance(outputs_origin, Mapping): | |||
outputs_origin = torch_nested_numpify( | |||
list(outputs_origin.values())) | |||
outputs = ort_session.run( | |||
onnx_outputs, | |||
torch_nested_numpify(dummy_inputs), | |||
) | |||
tols = {} | |||
if rtol is not None: | |||
tols['rtol'] = rtol | |||
if atol is not None: | |||
tols['atol'] = atol | |||
if not compare_arguments_nested('Onnx model output match failed', | |||
outputs, outputs_origin, **tols): | |||
raise RuntimeError( | |||
'export onnx failed because of validation error.') | |||
def _torch_export_torch_script(self, | |||
model: nn.Module, | |||
output: str, | |||
device: str = 'cpu', | |||
validation: bool = True, | |||
rtol: float = None, | |||
atol: float = None, | |||
**kwargs): | |||
"""Export the model to a torch script file. | |||
@param model: A torch.nn.Module instance to export. | |||
@param output: The output file. | |||
@param device: The device used to forward. | |||
@param validation: Whether to validate the exported file.
@param rtol: The relative tolerance used when comparing the outputs.
@param atol: The absolute tolerance used when comparing the outputs.
""" | |||
model.eval() | |||
dummy_inputs = self.generate_dummy_inputs(**kwargs) | |||
if dummy_inputs is None: | |||
raise NotImplementedError( | |||
'Model property dummy_inputs must be set.') | |||
dummy_inputs = collate_fn(dummy_inputs, device) | |||
if isinstance(dummy_inputs, Mapping): | |||
dummy_inputs = tuple(dummy_inputs.values()) | |||
with torch.no_grad(): | |||
model.eval() | |||
with replace_call(): | |||
traced_model = torch.jit.trace( | |||
model, dummy_inputs, strict=False) | |||
torch.jit.save(traced_model, output) | |||
if validation: | |||
ts_model = torch.jit.load(output) | |||
with torch.no_grad(): | |||
model.eval() | |||
ts_model.eval() | |||
outputs = ts_model.forward(*dummy_inputs) | |||
outputs = torch_nested_numpify(outputs) | |||
outputs_origin = model.forward(*dummy_inputs) | |||
outputs_origin = torch_nested_numpify(outputs_origin) | |||
tols = {} | |||
if rtol is not None: | |||
tols['rtol'] = rtol | |||
if atol is not None: | |||
tols['atol'] = atol | |||
if not compare_arguments_nested( | |||
'Torch script model output match failed', outputs, | |||
outputs_origin, **tols): | |||
raise RuntimeError( | |||
'export torch script failed because of validation error.') | |||
@contextmanager | |||
def replace_call(): | |||
"""This function is used to recover the original call method. | |||
The Model class of modelscope overrides the call method. When exporting to onnx or torchscript, torch will | |||
prepare the parameters as the prototype of forward method, and trace the call method, this causes | |||
problems. Here we recover the call method to the default implementation of torch.nn.Module, and change it | |||
back after the tracing was done. | |||
""" | |||
TorchModel.call_origin, TorchModel.__call__ = TorchModel.__call__, TorchModel._call_impl | |||
yield | |||
TorchModel.__call__ = TorchModel.call_origin | |||
del TorchModel.call_origin |
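A minimal usage sketch for the TorchScript path (model id and output directory are assumptions; with validation=True the traced module is reloaded and its outputs compared against the eager model within rtol/atol):

```python
import os

from modelscope.models import Model
from modelscope.exporters import Exporter

model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')
exporter = Exporter.from_model(model)

output_dir = '/tmp/ts_export'
os.makedirs(output_dir, exist_ok=True)
# shape is forwarded to generate_dummy_inputs; rtol/atol bound the validation diff
files = exporter.export_torch_script(
    outputs=output_dir, validation=True, rtol=1e-04, atol=1e-06, shape=(1, 16))
print(files)  # {'model': <path to the torch script file>}
```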
@@ -1,2 +1,4 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .file import File, LocalStorage | |||
from .io import dump, dumps, load |
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .base import FormatHandler | |||
from .json import JsonHandler | |||
from .yaml import YamlHandler |
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import pickle | |||
import shutil | |||
@@ -389,7 +391,7 @@ class HubApi: | |||
cookies = requests.utils.dict_from_cookiejar(cookies) | |||
r = requests.get(url=datahub_url, cookies=cookies) | |||
resp = r.json() | |||
datahub_raise_on_error(datahub_url, resp) | |||
raise_on_error(resp) | |||
return resp['Data'] | |||
def on_dataset_download(self, dataset_name: str, namespace: str) -> None: | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from pathlib import Path | |||
MODELSCOPE_URL_SCHEME = 'http://' | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from http import HTTPStatus | |||
from requests.exceptions import HTTPError | |||
@@ -60,7 +62,7 @@ def raise_on_error(rsp): | |||
Args: | |||
rsp (_type_): The server response | |||
""" | |||
if rsp['Code'] == HTTPStatus.OK and rsp['Success']: | |||
if rsp['Code'] == HTTPStatus.OK: | |||
return True | |||
else: | |||
raise RequestError(rsp['Message']) | |||
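For reference, a rough sketch of how this check behaves; the import path and the payload shapes are assumptions inferred from the fields used above:

```python
from http import HTTPStatus

# assumed import path for the helpers defined in this module
from modelscope.hub.errors import RequestError, raise_on_error

ok_rsp = {'Code': HTTPStatus.OK, 'Message': 'success', 'Data': {}}
bad_rsp = {'Code': 400, 'Message': 'invalid token'}

assert raise_on_error(ok_rsp) is True
try:
    raise_on_error(bad_rsp)
except RequestError as err:
    print(err)  # invalid token
```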
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import copy | |||
import os | |||
import sys | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import subprocess | |||
from typing import List | |||
@@ -39,17 +41,28 @@ class GitCommandWrapper(metaclass=Singleton): | |||
subprocess.CompletedProcess: the command response | |||
""" | |||
logger.debug(' '.join(args)) | |||
git_env = os.environ.copy() | |||
git_env['GIT_TERMINAL_PROMPT'] = '0' | |||
response = subprocess.run( | |||
[self.git_path, *args], | |||
stdout=subprocess.PIPE, | |||
stderr=subprocess.PIPE) # compatible for python3.6 | |||
stderr=subprocess.PIPE, | |||
env=git_env, | |||
) # compatible for python3.6 | |||
try: | |||
response.check_returncode() | |||
return response | |||
except subprocess.CalledProcessError as error: | |||
raise GitError( | |||
'stdout: %s, stderr: %s' % | |||
(response.stdout.decode('utf8'), error.stderr.decode('utf8'))) | |||
if response.returncode == 1: | |||
logger.info('Nothing to commit.') | |||
return response | |||
else: | |||
logger.error( | |||
'Running the git command failed, you may need to log in first.'
) | |||
raise GitError('stdout: %s, stderr: %s' % | |||
(response.stdout.decode('utf8'), | |||
error.stderr.decode('utf8'))) | |||
def config_auth_token(self, repo_dir, auth_token): | |||
url = self.get_repo_remote_url(repo_dir) | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
from typing import Optional | |||
@@ -40,6 +42,11 @@ class Repository: | |||
self.model_dir = model_dir | |||
self.model_base_dir = os.path.dirname(model_dir) | |||
self.model_repo_name = os.path.basename(model_dir) | |||
if not revision: | |||
err_msg = 'a non-default value of revision cannot be empty.' | |||
raise InvalidParameter(err_msg) | |||
if auth_token: | |||
self.auth_token = auth_token | |||
else: | |||
@@ -145,10 +152,21 @@ class DatasetRepository: | |||
The git command line path, if None, we use 'git' | |||
""" | |||
self.dataset_id = dataset_id | |||
self.repo_work_dir = repo_work_dir | |||
self.repo_base_dir = os.path.dirname(repo_work_dir) | |||
self.repo_name = os.path.basename(repo_work_dir) | |||
if not repo_work_dir or not isinstance(repo_work_dir, str): | |||
err_msg = 'dataset_work_dir must be provided!' | |||
raise InvalidParameter(err_msg) | |||
self.repo_work_dir = repo_work_dir.rstrip('/') | |||
if not self.repo_work_dir: | |||
err_msg = 'dataset_work_dir can not be root dir!' | |||
raise InvalidParameter(err_msg) | |||
self.repo_base_dir = os.path.dirname(self.repo_work_dir) | |||
self.repo_name = os.path.basename(self.repo_work_dir) | |||
if not revision: | |||
err_msg = 'a non-default value of revision cannot be empty.' | |||
raise InvalidParameter(err_msg) | |||
self.revision = revision | |||
if auth_token: | |||
self.auth_token = auth_token | |||
else: | |||
@@ -199,7 +217,9 @@ class DatasetRepository: | |||
self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token) | |||
self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name) | |||
remote_url = self.git_wrapper.get_repo_remote_url(self.repo_work_dir) | |||
remote_url = self._get_remote_url() | |||
remote_url = self.git_wrapper.remove_token_from_url(remote_url) | |||
self.git_wrapper.pull(self.repo_work_dir) | |||
self.git_wrapper.add(self.repo_work_dir, all_files=True) | |||
self.git_wrapper.commit(self.repo_work_dir, commit_message) | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import tempfile | |||
from pathlib import Path | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import hashlib | |||
import os | |||
import pickle | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import hashlib | |||
import os | |||
from typing import Optional | |||
@@ -35,6 +35,10 @@ class Models(object): | |||
fer = 'fer' | |||
retinaface = 'retinaface' | |||
shop_segmentation = 'shop-segmentation' | |||
mogface = 'mogface' | |||
mtcnn = 'mtcnn' | |||
ulfd = 'ulfd' | |||
video_inpainting = 'video-inpainting' | |||
# EasyCV models | |||
yolox = 'YOLOX' | |||
@@ -51,11 +55,16 @@ class Models(object): | |||
space_intent = 'space-intent' | |||
space_modeling = 'space-modeling' | |||
star = 'star' | |||
star3 = 'star3' | |||
tcrf = 'transformer-crf' | |||
transformer_softmax = 'transformer-softmax' | |||
lcrf = 'lstm-crf' | |||
gcnncrf = 'gcnn-crf' | |||
bart = 'bart' | |||
gpt3 = 'gpt3' | |||
plug = 'plug' | |||
bert_for_ds = 'bert-for-document-segmentation' | |||
ponet = 'ponet' | |||
# audio models | |||
sambert_hifigan = 'sambert-hifigan' | |||
@@ -70,6 +79,7 @@ class Models(object): | |||
gemm = 'gemm-generative-multi-modal' | |||
mplug = 'mplug' | |||
diffusion = 'diffusion-text-to-image-synthesis' | |||
multi_stage_diffusion = 'multi-stage-diffusion-text-to-image-synthesis' | |||
team = 'team-multi-modal-similarity' | |||
video_clip = 'video-clip-multi-modal-embedding' | |||
@@ -77,6 +87,7 @@ class Models(object): | |||
class TaskModels(object): | |||
# nlp task | |||
text_classification = 'text-classification' | |||
token_classification = 'token-classification' | |||
information_extraction = 'information-extraction' | |||
@@ -87,6 +98,8 @@ class Heads(object): | |||
bert_mlm = 'bert-mlm' | |||
# roberta mlm | |||
roberta_mlm = 'roberta-mlm' | |||
# token cls | |||
token_classification = 'token-classification' | |||
information_extraction = 'information-extraction' | |||
@@ -121,8 +134,11 @@ class Pipelines(object): | |||
salient_detection = 'u2net-salient-detection' | |||
image_classification = 'image-classification' | |||
face_detection = 'resnet-face-detection-scrfd10gkps' | |||
ulfd_face_detection = 'manual-face-detection-ulfd' | |||
facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' | |||
retina_face_detection = 'resnet50-face-detection-retinaface' | |||
mog_face_detection = 'resnet101-face-detection-cvpr22papermogface' | |||
mtcnn_face_detection = 'manual-face-detection-mtcnn' | |||
live_category = 'live-category' | |||
general_image_classification = 'vit-base_image-classification_ImageNet-labels' | |||
daily_image_classification = 'vit-base_image-classification_Dailylife-labels' | |||
@@ -155,16 +171,19 @@ class Pipelines(object): | |||
text_driven_segmentation = 'text-driven-segmentation' | |||
movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' | |||
shop_segmentation = 'shop-segmentation' | |||
video_inpainting = 'video-inpainting' | |||
# nlp tasks | |||
sentence_similarity = 'sentence-similarity' | |||
word_segmentation = 'word-segmentation' | |||
part_of_speech = 'part-of-speech' | |||
named_entity_recognition = 'named-entity-recognition' | |||
text_generation = 'text-generation' | |||
sentiment_analysis = 'sentiment-analysis' | |||
sentiment_classification = 'sentiment-classification' | |||
text_classification = 'text-classification' | |||
fill_mask = 'fill-mask' | |||
fill_mask_ponet = 'fill-mask-ponet' | |||
csanmt_translation = 'csanmt-translation' | |||
nli = 'nli' | |||
dialog_intent_prediction = 'dialog-intent-prediction' | |||
@@ -172,8 +191,12 @@ class Pipelines(object): | |||
dialog_state_tracking = 'dialog-state-tracking' | |||
zero_shot_classification = 'zero-shot-classification' | |||
text_error_correction = 'text-error-correction' | |||
plug_generation = 'plug-generation' | |||
faq_question_answering = 'faq-question-answering' | |||
conversational_text_to_sql = 'conversational-text-to-sql' | |||
table_question_answering_pipeline = 'table-question-answering-pipeline' | |||
sentence_embedding = 'sentence-embedding' | |||
passage_ranking = 'passage-ranking' | |||
relation_extraction = 'relation-extraction' | |||
document_segmentation = 'document-segmentation' | |||
@@ -223,8 +246,11 @@ class Trainers(object): | |||
# nlp trainers | |||
bert_sentiment_analysis = 'bert-sentiment-analysis' | |||
dialog_modeling_trainer = 'dialog-modeling-trainer' | |||
dialog_intent_trainer = 'dialog-intent-trainer' | |||
nlp_base_trainer = 'nlp-base-trainer' | |||
nlp_veco_trainer = 'nlp-veco-trainer' | |||
nlp_passage_ranking_trainer = 'nlp-passage-ranking-trainer' | |||
# audio trainers | |||
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' | |||
@@ -252,6 +278,7 @@ class Preprocessors(object): | |||
# nlp preprocessor | |||
sen_sim_tokenizer = 'sen-sim-tokenizer' | |||
cross_encoder_tokenizer = 'cross-encoder-tokenizer' | |||
bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' | |||
text_gen_tokenizer = 'text-gen-tokenizer' | |||
token_cls_tokenizer = 'token-cls-tokenizer' | |||
@@ -264,10 +291,15 @@ class Preprocessors(object): | |||
sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer' | |||
zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer' | |||
text_error_correction = 'text-error-correction' | |||
sentence_embedding = 'sentence-embedding' | |||
passage_ranking = 'passage-ranking' | |||
sequence_labeling_tokenizer = 'sequence-labeling-tokenizer' | |||
word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor' | |||
fill_mask = 'fill-mask' | |||
fill_mask_ponet = 'fill-mask-ponet' | |||
faq_question_answering_preprocessor = 'faq-question-answering-preprocessor' | |||
conversational_text_to_sql = 'conversational-text-to-sql' | |||
table_question_answering_preprocessor = 'table-question-answering-preprocessor' | |||
re_tokenizer = 're-tokenizer' | |||
document_segmentation = 'document-segmentation' | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import Dict | |||
from modelscope.metainfo import Metrics | |||
@@ -14,9 +14,9 @@ from .builder import METRICS, MetricKeys | |||
@METRICS.register_module( | |||
group_key=default_group, module_name=Metrics.seq_cls_metric) | |||
class SequenceClassificationMetric(Metric): | |||
"""The metric computation class for sequence classification classes. | |||
"""The metric computation class for sequence classification tasks. | |||
This metric class calculates accuracy for the whole input batches. | |||
This metric class calculates accuracy of the whole input batches. | |||
""" | |||
def __init__(self, *args, **kwargs): | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import torch.nn as nn | |||
from .layer_base import LayerBase | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import numpy as np | |||
import torch as th | |||
import torch.nn as nn | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import numpy as np | |||
import torch as th | |||
import torch.nn as nn | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import abc | |||
import re | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import numpy as np | |||
import torch as th | |||
import torch.nn as nn | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import torch | |||
import torch.nn.functional as F | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import math | |||
import torch | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
@@ -1,9 +1,10 @@ | |||
""" | |||
The implementation of class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d | |||
here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ) | |||
and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# | |||
# The implementation of class ComplexConv2d, ComplexConvTranspose2d and | |||
# ComplexBatchNorm2d here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr | |||
# / Seoul National Univ., ESTsoft ) and publicly available at | |||
# https://github.com/sweetcocoa/DeepComplexUNetPyTorch | |||
""" | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
@@ -1,8 +1,10 @@ | |||
""" | |||
The implementation here is modified based on | |||
Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ) | |||
and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch | |||
""" | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# | |||
# The implementation here is modified based on | |||
# Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ) | |||
# and publicly available at | |||
# https://github.com/sweetcocoa/DeepComplexUNetPyTorch | |||
import torch | |||
import torch.nn as nn | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
from typing import Dict | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import math | |||
import struct | |||
from enum import Enum | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .base_head import * # noqa F403 | |||
from .base_model import * # noqa F403 | |||
from .base_torch_head import * # noqa F403 | |||
@@ -1,6 +1,6 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from abc import ABC, abstractmethod | |||
from typing import Dict, Union | |||
from typing import Any, Dict, Union | |||
from modelscope.models.base.base_model import Model | |||
from modelscope.utils.config import ConfigDict | |||
@@ -22,25 +22,20 @@ class Head(ABC): | |||
self.config = ConfigDict(kwargs) | |||
@abstractmethod | |||
def forward(self, input: Input) -> Dict[str, Tensor]: | |||
def forward(self, *args, **kwargs) -> Dict[str, Any]: | |||
""" | |||
This method will use the output from backbone model to do any | |||
downstream tasks | |||
Args: | |||
input: The tensor output or a model from backbone model | |||
(text generation need a model as input) | |||
Returns: The output from downstream taks | |||
downstream tasks. It receives the output from the backbone model.
Returns (Dict[str, Any]): The output from downstream task. | |||
""" | |||
pass | |||
@abstractmethod | |||
def compute_loss(self, outputs: Dict[str, Tensor], | |||
labels) -> Dict[str, Tensor]: | |||
def compute_loss(self, *args, **kwargs) -> Dict[str, Any]: | |||
""" | |||
compute loss for head during the finetuning | |||
compute loss for head during the finetuning. | |||
Args: | |||
outputs (Dict[str, Tensor]): the output from the model forward | |||
Returns: the loss(Dict[str, Tensor]): | |||
Returns (Dict[str, Any]): The loss dict | |||
""" | |||
pass |
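For illustration, a minimal custom head under the new *args/**kwargs contract might look like the sketch below (the names, sizes and the import path for TorchHead are assumptions; registration with the HEADS registry is omitted):

```python
from typing import Any, Dict

import torch

from modelscope.models.base import TorchHead  # assumed public import path


class MySequenceClassificationHead(TorchHead):

    def __init__(self, hidden_size=768, num_labels=2, **kwargs):
        super().__init__(hidden_size=hidden_size, num_labels=num_labels, **kwargs)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, pooled_output, **kwargs) -> Dict[str, Any]:
        # consume the backbone output and produce task-specific logits
        return {'logits': self.classifier(pooled_output)}

    def compute_loss(self, outputs: Dict[str, Any], labels) -> Dict[str, Any]:
        loss = torch.nn.functional.cross_entropy(outputs['logits'], labels)
        return {'loss': loss}
```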
@@ -2,7 +2,7 @@ | |||
import os | |||
import os.path as osp | |||
from abc import ABC, abstractmethod | |||
from typing import Callable, Dict, List, Optional, Union | |||
from typing import Any, Callable, Dict, List, Optional, Union | |||
from modelscope.hub.snapshot_download import snapshot_download | |||
from modelscope.models.builder import build_model | |||
@@ -10,8 +10,6 @@ from modelscope.utils.checkpoint import save_pretrained | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile | |||
from modelscope.utils.device import device_placement, verify_device | |||
from modelscope.utils.file_utils import func_receive_dict_inputs | |||
from modelscope.utils.hub import parse_label_mapping | |||
from modelscope.utils.logger import get_logger | |||
logger = get_logger() | |||
@@ -27,35 +25,31 @@ class Model(ABC): | |||
verify_device(device_name) | |||
self._device_name = device_name | |||
def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: | |||
return self.postprocess(self.forward(input)) | |||
def __call__(self, *args, **kwargs) -> Dict[str, Any]: | |||
return self.postprocess(self.forward(*args, **kwargs)) | |||
@abstractmethod | |||
def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: | |||
def forward(self, *args, **kwargs) -> Dict[str, Any]: | |||
""" | |||
Run the forward pass for a model. | |||
Args: | |||
input (Dict[str, Tensor]): the dict of the model inputs for the forward method | |||
Returns: | |||
Dict[str, Tensor]: output from the model forward pass | |||
Dict[str, Any]: output from the model forward pass | |||
""" | |||
pass | |||
def postprocess(self, input: Dict[str, Tensor], | |||
**kwargs) -> Dict[str, Tensor]: | |||
def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: | |||
""" Model specific postprocess and convert model output to | |||
standard model outputs. | |||
Args: | |||
input: input data | |||
inputs: input data | |||
Return: | |||
dict of results: a dict containing outputs of model, each | |||
output should have the standard output name. | |||
""" | |||
return input | |||
return inputs | |||
@classmethod | |||
def _instantiate(cls, **kwargs): | |||
@@ -97,7 +91,6 @@ class Model(ABC): | |||
osp.join(local_model_dir, ModelFile.CONFIGURATION)) | |||
task_name = cfg.task | |||
model_cfg = cfg.model | |||
framework = cfg.framework | |||
if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): | |||
model_cfg.type = model_cfg.model_type | |||
@@ -107,9 +100,8 @@ class Model(ABC): | |||
model_cfg[k] = v | |||
if device is not None: | |||
model_cfg.device = device | |||
with device_placement(framework, device): | |||
model = build_model( | |||
model_cfg, task_name=task_name, default_args=kwargs) | |||
model = build_model( | |||
model_cfg, task_name=task_name, default_args=kwargs) | |||
else: | |||
model = build_model( | |||
model_cfg, task_name=task_name, default_args=kwargs) | |||
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import Dict | |||
from typing import Any, Dict | |||
import torch | |||
@@ -18,10 +18,8 @@ class TorchHead(Head, torch.nn.Module): | |||
super().__init__(**kwargs) | |||
torch.nn.Module.__init__(self) | |||
def forward(self, inputs: Dict[str, | |||
torch.Tensor]) -> Dict[str, torch.Tensor]: | |||
def forward(self, *args, **kwargs) -> Dict[str, Any]: | |||
raise NotImplementedError | |||
def compute_loss(self, outputs: Dict[str, torch.Tensor], | |||
labels) -> Dict[str, torch.Tensor]: | |||
def compute_loss(self, *args, **kwargs) -> Dict[str, Any]: | |||
raise NotImplementedError |
@@ -1,6 +1,6 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import Any, Dict, Optional, Union | |||
from typing import Any, Dict | |||
import torch | |||
from torch import nn | |||
@@ -21,15 +21,14 @@ class TorchModel(Model, torch.nn.Module): | |||
super().__init__(model_dir, *args, **kwargs) | |||
torch.nn.Module.__init__(self) | |||
def __call__(self, input: Dict[str, | |||
torch.Tensor]) -> Dict[str, torch.Tensor]: | |||
def __call__(self, *args, **kwargs) -> Dict[str, Any]: | |||
# Adapt models whose forward takes a single dict argument that must be named input or inputs
if func_receive_dict_inputs(self.forward): | |||
return self.postprocess(self.forward(input)) | |||
return self.postprocess(self.forward(args[0], **kwargs)) | |||
else: | |||
return self.postprocess(self.forward(**input)) | |||
return self.postprocess(self.forward(*args, **kwargs)) | |||
def forward(self, inputs: Dict[str, | |||
torch.Tensor]) -> Dict[str, torch.Tensor]: | |||
def forward(self, *args, **kwargs) -> Dict[str, Any]: | |||
raise NotImplementedError | |||
def post_init(self): | |||
@@ -1,3 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import os.path as osp | |||
import shutil | |||
@@ -4,11 +4,16 @@ from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .mogface import MogFaceDetector | |||
from .mtcnn import MtcnnFaceDetector | |||
from .retinaface import RetinaFaceDetection | |||
from .ulfd_slim import UlfdFaceDetector | |||
else: | |||
_import_structure = { | |||
'ulfd_slim': ['UlfdFaceDetector'], | |||
'retinaface': ['RetinaFaceDetection'], | |||
'mtcnn': ['MtcnnFaceDetector'], | |||
'mogface': ['MogFaceDetector'] | |||
} | |||
import sys | |||
@@ -1,5 +1,4 @@ | |||
""" | |||
mmdet_patch is based on | |||
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet, | |||
all duplicate functions from official mmdetection are removed. | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet | |||
""" |
@@ -1,3 +1,7 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox | |||
""" | |||
from .transforms import bbox2result, distance2kps, kps2distance | |||
__all__ = ['bbox2result', 'distance2kps', 'kps2distance'] |
@@ -1,5 +1,6 @@ | |||
""" | |||
based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox/transforms.py | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/bbox/transforms.py | |||
""" | |||
import numpy as np | |||
import torch | |||
@@ -1,3 +1,7 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py | |||
""" | |||
from .bbox_nms import multiclass_nms | |||
__all__ = ['multiclass_nms'] |
@@ -1,5 +1,6 @@ | |||
""" | |||
based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/core/post_processing/bbox_nms.py | |||
""" | |||
import torch | |||
@@ -1,3 +1,7 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets | |||
""" | |||
from .retinaface import RetinaFaceDataset | |||
__all__ = ['RetinaFaceDataset'] |
@@ -1,3 +1,7 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines | |||
""" | |||
from .transforms import RandomSquareCrop | |||
__all__ = ['RandomSquareCrop'] |
@@ -1,5 +1,6 @@ | |||
""" | |||
based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py | |||
""" | |||
import numpy as np | |||
from mmdet.datasets.builder import PIPELINES | |||
@@ -1,5 +1,6 @@ | |||
""" | |||
based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/retinaface.py | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/retinaface.py | |||
""" | |||
import numpy as np | |||
from mmdet.datasets.builder import DATASETS | |||
@@ -1,2 +1,6 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models | |||
""" | |||
from .dense_heads import * # noqa: F401,F403 | |||
from .detectors import * # noqa: F401,F403 |
@@ -1,3 +1,7 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones | |||
""" | |||
from .resnet import ResNetV1e | |||
__all__ = ['ResNetV1e'] |
@@ -1,5 +1,6 @@ | |||
""" | |||
based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones/resnet.py | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones/resnet.py | |||
""" | |||
import torch.nn as nn | |||
import torch.utils.checkpoint as cp | |||
@@ -1,3 +1,7 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/dense_heads | |||
""" | |||
from .scrfd_head import SCRFDHead | |||
__all__ = ['SCRFDHead'] |
@@ -1,5 +1,6 @@ | |||
""" | |||
based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py | |||
""" | |||
import numpy as np | |||
import torch | |||
@@ -1,3 +1,7 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors | |||
""" | |||
from .scrfd import SCRFD | |||
__all__ = ['SCRFD'] |
@@ -1,5 +1,6 @@ | |||
""" | |||
based on https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors/scrfd.py | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors/scrfd.py | |||
""" | |||
import torch | |||
from mmdet.models.builder import DETECTORS | |||
@@ -0,0 +1 @@ | |||
from .models.detectors import MogFaceDetector |
@@ -0,0 +1,96 @@ | |||
import os | |||
import cv2 | |||
import numpy as np | |||
import torch | |||
import torch.backends.cudnn as cudnn | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.constant import Tasks | |||
from .mogface import MogFace | |||
from .utils import MogPriorBox, mogdecode, py_cpu_nms | |||
@MODELS.register_module(Tasks.face_detection, module_name=Models.mogface) | |||
class MogFaceDetector(TorchModel): | |||
def __init__(self, model_path, device='cuda'): | |||
super().__init__(model_path) | |||
torch.set_grad_enabled(False) | |||
cudnn.benchmark = True | |||
self.model_path = model_path | |||
self.device = device | |||
self.net = MogFace() | |||
self.load_model() | |||
self.net = self.net.to(device) | |||
self.mean = np.array([[104, 117, 123]]) | |||
def load_model(self, load_to_cpu=False): | |||
pretrained_dict = torch.load( | |||
self.model_path, map_location=torch.device('cpu')) | |||
self.net.load_state_dict(pretrained_dict, strict=False) | |||
self.net.eval() | |||
def forward(self, input): | |||
img_raw = input['img'] | |||
img = np.array(img_raw.cpu().detach()) | |||
img = img[:, :, ::-1] | |||
im_height, im_width = img.shape[:2] | |||
ss = 1.0 | |||
# trick: downscale very large images so that the longer side is about 1000 px
if max(im_height, im_width) > 1500: | |||
ss = 1000.0 / max(im_height, im_width) | |||
img = cv2.resize(img, (0, 0), fx=ss, fy=ss) | |||
im_height, im_width = img.shape[:2] | |||
scale = torch.Tensor( | |||
[img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) | |||
img -= np.array([[103.53, 116.28, 123.675]]) | |||
img /= np.array([[57.375, 57.120003, 58.395]]) | |||
img /= 255 | |||
img = img[:, :, ::-1].copy() | |||
img = img.transpose(2, 0, 1) | |||
img = torch.from_numpy(img).unsqueeze(0) | |||
img = img.to(self.device) | |||
scale = scale.to(self.device) | |||
conf, loc = self.net(img) # forward pass | |||
confidence_threshold = 0.82 | |||
nms_threshold = 0.4 | |||
top_k = 5000 | |||
keep_top_k = 750 | |||
priorbox = MogPriorBox(scale_list=[0.68]) | |||
priors = priorbox(im_height, im_width) | |||
priors = torch.tensor(priors).to(self.device) | |||
prior_data = priors.data | |||
boxes = mogdecode(loc.data.squeeze(0), prior_data) | |||
boxes = boxes.cpu().numpy() | |||
scores = conf.squeeze(0).data.cpu().numpy()[:, 0] | |||
# ignore low scores | |||
inds = np.where(scores > confidence_threshold)[0] | |||
boxes = boxes[inds] | |||
scores = scores[inds] | |||
# keep top-K before NMS | |||
order = scores.argsort()[::-1][:top_k] | |||
boxes = boxes[order] | |||
scores = scores[order] | |||
# do NMS | |||
dets = np.hstack((boxes, scores[:, np.newaxis])).astype( | |||
np.float32, copy=False) | |||
keep = py_cpu_nms(dets, nms_threshold) | |||
dets = dets[keep, :] | |||
# keep top-K faster NMS | |||
dets = dets[:keep_top_k, :] | |||
return dets / ss |
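The detector above is normally driven by the face-detection pipeline rather than called directly; a hedged usage sketch (the model id and the image path are assumptions):

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# assumed model id for the MogFace checkpoint on the hub
mog_face_detection = pipeline(
    Tasks.face_detection,
    model='damo/cv_resnet101_face-detection_cvpr22papermogface')
result = mog_face_detection('data/test/images/mog_face_detection.jpg')
print(result)  # detection boxes with confidence scores
```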
@@ -0,0 +1,135 @@ | |||
# -------------------------------------------------------- | |||
# The implementation is also open-sourced by the author, Yang Liu, and is publicly available at
# https://github.com/damo-cv/MogFace | |||
# -------------------------------------------------------- | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from .mogprednet import MogPredNet | |||
from .resnet import ResNet | |||
class MogFace(nn.Module): | |||
def __init__(self): | |||
super(MogFace, self).__init__() | |||
self.backbone = ResNet(depth=101) | |||
self.fpn = LFPN() | |||
self.pred_net = MogPredNet() | |||
def forward(self, x): | |||
feature_list = self.backbone(x) | |||
fpn_list = self.fpn(feature_list) | |||
pyramid_feature_list = fpn_list[0] | |||
conf, loc = self.pred_net(pyramid_feature_list) | |||
return conf, loc | |||
class FeatureFusion(nn.Module): | |||
def __init__(self, lat_ch=256, **channels): | |||
super(FeatureFusion, self).__init__() | |||
self.main_conv = nn.Conv2d(channels['main'], lat_ch, kernel_size=1) | |||
def forward(self, up, main): | |||
main = self.main_conv(main) | |||
_, _, H, W = main.size() | |||
res = F.upsample(up, scale_factor=2, mode='bilinear') | |||
if res.size(2) != main.size(2) or res.size(3) != main.size(3): | |||
res = res[:, :, 0:H, 0:W] | |||
res = res + main | |||
return res | |||
class LFPN(nn.Module): | |||
def __init__(self, | |||
c2_out_ch=256, | |||
c3_out_ch=512, | |||
c4_out_ch=1024, | |||
c5_out_ch=2048, | |||
c6_mid_ch=512, | |||
c6_out_ch=512, | |||
c7_mid_ch=128, | |||
c7_out_ch=256, | |||
out_dsfd_ft=True): | |||
super(LFPN, self).__init__() | |||
self.out_dsfd_ft = out_dsfd_ft | |||
if self.out_dsfd_ft: | |||
dsfd_module = [] | |||
dsfd_module.append(nn.Conv2d(256, 256, kernel_size=3, padding=1)) | |||
dsfd_module.append(nn.Conv2d(512, 256, kernel_size=3, padding=1)) | |||
dsfd_module.append(nn.Conv2d(1024, 256, kernel_size=3, padding=1)) | |||
dsfd_module.append(nn.Conv2d(2048, 256, kernel_size=3, padding=1)) | |||
dsfd_module.append(nn.Conv2d(256, 256, kernel_size=3, padding=1)) | |||
dsfd_module.append(nn.Conv2d(256, 256, kernel_size=3, padding=1)) | |||
self.dsfd_modules = nn.ModuleList(dsfd_module) | |||
c6_input_ch = c5_out_ch | |||
self.c6 = nn.Sequential(*[ | |||
nn.Conv2d( | |||
c6_input_ch, | |||
c6_mid_ch, | |||
kernel_size=1, | |||
), | |||
nn.BatchNorm2d(c6_mid_ch), | |||
nn.ReLU(inplace=True), | |||
nn.Conv2d( | |||
c6_mid_ch, c6_out_ch, kernel_size=3, padding=1, stride=2), | |||
nn.BatchNorm2d(c6_out_ch), | |||
nn.ReLU(inplace=True) | |||
]) | |||
self.c7 = nn.Sequential(*[ | |||
nn.Conv2d( | |||
c6_out_ch, | |||
c7_mid_ch, | |||
kernel_size=1, | |||
), | |||
nn.BatchNorm2d(c7_mid_ch), | |||
nn.ReLU(inplace=True), | |||
nn.Conv2d( | |||
c7_mid_ch, c7_out_ch, kernel_size=3, padding=1, stride=2), | |||
nn.BatchNorm2d(c7_out_ch), | |||
nn.ReLU(inplace=True) | |||
]) | |||
self.p2_lat = nn.Conv2d(256, 256, kernel_size=3, padding=1) | |||
self.p3_lat = nn.Conv2d(256, 256, kernel_size=3, padding=1) | |||
self.p4_lat = nn.Conv2d(256, 256, kernel_size=3, padding=1) | |||
self.c5_lat = nn.Conv2d(c6_input_ch, 256, kernel_size=3, padding=1) | |||
self.c6_lat = nn.Conv2d(c6_out_ch, 256, kernel_size=3, padding=1) | |||
self.c7_lat = nn.Conv2d(c7_out_ch, 256, kernel_size=3, padding=1) | |||
self.ff_c5_c4 = FeatureFusion(main=c4_out_ch) | |||
self.ff_c4_c3 = FeatureFusion(main=c3_out_ch) | |||
self.ff_c3_c2 = FeatureFusion(main=c2_out_ch) | |||
def forward(self, feature_list): | |||
c2, c3, c4, c5 = feature_list | |||
c6 = self.c6(c5) | |||
c7 = self.c7(c6) | |||
c5 = self.c5_lat(c5) | |||
c6 = self.c6_lat(c6) | |||
c7 = self.c7_lat(c7) | |||
if self.out_dsfd_ft: | |||
dsfd_fts = [] | |||
dsfd_fts.append(self.dsfd_modules[0](c2)) | |||
dsfd_fts.append(self.dsfd_modules[1](c3)) | |||
dsfd_fts.append(self.dsfd_modules[2](c4)) | |||
dsfd_fts.append(self.dsfd_modules[3](feature_list[-1])) | |||
dsfd_fts.append(self.dsfd_modules[4](c6)) | |||
dsfd_fts.append(self.dsfd_modules[5](c7)) | |||
p4 = self.ff_c5_c4(c5, c4) | |||
p3 = self.ff_c4_c3(p4, c3) | |||
p2 = self.ff_c3_c2(p3, c2) | |||
p2 = self.p2_lat(p2) | |||
p3 = self.p3_lat(p3) | |||
p4 = self.p4_lat(p4) | |||
if self.out_dsfd_ft: | |||
return ([p2, p3, p4, c5, c6, c7], dsfd_fts) |
@@ -0,0 +1,164 @@ | |||
# -------------------------------------------------------- | |||
# The implementation is also open-sourced by the author, Yang Liu, and is publicly available at
# https://github.com/damo-cv/MogFace | |||
# -------------------------------------------------------- | |||
import math | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
class conv_bn(nn.Module): | |||
"""docstring for conv""" | |||
def __init__(self, in_plane, out_plane, kernel_size, stride, padding): | |||
super(conv_bn, self).__init__() | |||
self.conv1 = nn.Conv2d( | |||
in_plane, | |||
out_plane, | |||
kernel_size=kernel_size, | |||
stride=stride, | |||
padding=padding) | |||
self.bn1 = nn.BatchNorm2d(out_plane) | |||
def forward(self, x): | |||
x = self.conv1(x) | |||
return self.bn1(x) | |||
class SSHContext(nn.Module): | |||
def __init__(self, channels, Xchannels=256): | |||
super(SSHContext, self).__init__() | |||
self.conv1 = nn.Conv2d( | |||
channels, Xchannels, kernel_size=3, stride=1, padding=1) | |||
self.conv2 = nn.Conv2d( | |||
channels, | |||
Xchannels // 2, | |||
kernel_size=3, | |||
dilation=2, | |||
stride=1, | |||
padding=2) | |||
self.conv2_1 = nn.Conv2d( | |||
Xchannels // 2, Xchannels // 2, kernel_size=3, stride=1, padding=1) | |||
self.conv2_2 = nn.Conv2d( | |||
Xchannels // 2, | |||
Xchannels // 2, | |||
kernel_size=3, | |||
dilation=2, | |||
stride=1, | |||
padding=2) | |||
self.conv2_2_1 = nn.Conv2d( | |||
Xchannels // 2, Xchannels // 2, kernel_size=3, stride=1, padding=1) | |||
def forward(self, x): | |||
x1 = F.relu(self.conv1(x), inplace=True) | |||
x2 = F.relu(self.conv2(x), inplace=True) | |||
x2_1 = F.relu(self.conv2_1(x2), inplace=True) | |||
x2_2 = F.relu(self.conv2_2(x2), inplace=True) | |||
x2_2 = F.relu(self.conv2_2_1(x2_2), inplace=True) | |||
return torch.cat([x1, x2_1, x2_2], 1) | |||
class DeepHead(nn.Module): | |||
def __init__(self, | |||
in_channel=256, | |||
out_channel=256, | |||
use_gn=False, | |||
num_conv=4): | |||
super(DeepHead, self).__init__() | |||
self.use_gn = use_gn | |||
self.num_conv = num_conv | |||
self.conv1 = nn.Conv2d(in_channel, out_channel, 3, 1, 1) | |||
self.conv2 = nn.Conv2d(out_channel, out_channel, 3, 1, 1) | |||
self.conv3 = nn.Conv2d(out_channel, out_channel, 3, 1, 1) | |||
self.conv4 = nn.Conv2d(out_channel, out_channel, 3, 1, 1) | |||
if self.use_gn: | |||
self.gn1 = nn.GroupNorm(16, out_channel) | |||
self.gn2 = nn.GroupNorm(16, out_channel) | |||
self.gn3 = nn.GroupNorm(16, out_channel) | |||
self.gn4 = nn.GroupNorm(16, out_channel) | |||
def forward(self, x): | |||
if self.use_gn: | |||
x1 = F.relu(self.gn1(self.conv1(x)), inplace=True) | |||
x2 = F.relu(self.gn2(self.conv1(x1)), inplace=True) | |||
x3 = F.relu(self.gn3(self.conv1(x2)), inplace=True) | |||
x4 = F.relu(self.gn4(self.conv1(x3)), inplace=True) | |||
else: | |||
x1 = F.relu(self.conv1(x), inplace=True) | |||
x2 = F.relu(self.conv1(x1), inplace=True) | |||
if self.num_conv == 2: | |||
return x2 | |||
x3 = F.relu(self.conv1(x2), inplace=True) | |||
x4 = F.relu(self.conv1(x3), inplace=True) | |||
return x4 | |||
class MogPredNet(nn.Module): | |||
def __init__(self, | |||
num_anchor_per_pixel=1, | |||
num_classes=1, | |||
input_ch_list=[256, 256, 256, 256, 256, 256], | |||
use_deep_head=True, | |||
deep_head_with_gn=True, | |||
use_ssh=True, | |||
deep_head_ch=512): | |||
super(MogPredNet, self).__init__() | |||
self.num_classes = num_classes | |||
self.use_deep_head = use_deep_head | |||
self.deep_head_with_gn = deep_head_with_gn | |||
self.use_ssh = use_ssh | |||
self.deep_head_ch = deep_head_ch | |||
if self.use_ssh: | |||
self.conv_SSH = SSHContext(input_ch_list[0], | |||
self.deep_head_ch // 2) | |||
if self.use_deep_head: | |||
if self.deep_head_with_gn: | |||
self.deep_loc_head = DeepHead( | |||
self.deep_head_ch, self.deep_head_ch, use_gn=True) | |||
self.deep_cls_head = DeepHead( | |||
self.deep_head_ch, self.deep_head_ch, use_gn=True) | |||
self.pred_cls = nn.Conv2d(self.deep_head_ch, | |||
1 * num_anchor_per_pixel, 3, 1, 1) | |||
self.pred_loc = nn.Conv2d(self.deep_head_ch, | |||
4 * num_anchor_per_pixel, 3, 1, 1) | |||
self.sigmoid = nn.Sigmoid() | |||
def forward(self, pyramid_feature_list, dsfd_ft_list=None): | |||
loc = [] | |||
conf = [] | |||
if self.use_deep_head: | |||
for x in pyramid_feature_list: | |||
if self.use_ssh: | |||
x = self.conv_SSH(x) | |||
x_cls = self.deep_cls_head(x) | |||
x_loc = self.deep_loc_head(x) | |||
conf.append( | |||
self.pred_cls(x_cls).permute(0, 2, 3, 1).contiguous()) | |||
loc.append( | |||
self.pred_loc(x_loc).permute(0, 2, 3, 1).contiguous()) | |||
loc = torch.cat([o.view(o.size(0), -1, 4) for o in loc], 1) | |||
conf = torch.cat( | |||
[o.view(o.size(0), -1, self.num_classes) for o in conf], 1) | |||
output = ( | |||
self.sigmoid(conf.view(conf.size(0), -1, self.num_classes)), | |||
loc.view(loc.size(0), -1, 4), | |||
) | |||
return output |
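A hedged shape sketch for the prediction head above, using illustrative values only (batch size 1, a 256x256 input, the default six strides and 256-channel pyramid features): every 3x3 conv in the head preserves the spatial size, so each level contributes H/stride * W/stride rows to the flattened outputs.

```python
import torch

# Hypothetical per-level spatial sizes for a 256x256 input with strides [4, 8, 16, 32, 64, 128].
sizes = [256 // s for s in [4, 8, 16, 32, 64, 128]]    # [64, 32, 16, 8, 4, 2]

head = MogPredNet()                                    # defaults: SSH context + GN deep heads
feats = [torch.randn(1, 256, s, s) for s in sizes]     # 256-channel pyramid features
conf, loc = head(feats)
print(conf.shape)   # torch.Size([1, 5460, 1]); 5460 = sum of s*s over the six levels
print(loc.shape)    # torch.Size([1, 5460, 4])
```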
@@ -0,0 +1,193 @@ | |||
# The implementation is modified from the original ResNet implementation, which is | |||
# also open-sourced by the author, Yang Liu, | |||
# and is publicly available at https://github.com/damo-cv/MogFace | |||
import torch.nn as nn | |||
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): | |||
"""3x3 convolution with padding""" | |||
return nn.Conv2d( | |||
in_planes, | |||
out_planes, | |||
kernel_size=3, | |||
stride=stride, | |||
padding=dilation, | |||
groups=groups, | |||
bias=False, | |||
dilation=dilation) | |||
def conv1x1(in_planes, out_planes, stride=1): | |||
"""1x1 convolution""" | |||
return nn.Conv2d( | |||
in_planes, out_planes, kernel_size=1, stride=stride, bias=False) | |||
class Bottleneck(nn.Module): | |||
expansion = 4 | |||
def __init__(self, | |||
inplanes, | |||
planes, | |||
stride=1, | |||
downsample=None, | |||
groups=1, | |||
base_width=64, | |||
dilation=1, | |||
norm_layer=None): | |||
super(Bottleneck, self).__init__() | |||
if norm_layer is None: | |||
norm_layer = nn.BatchNorm2d | |||
width = int(planes * (base_width / 64.)) * groups | |||
# Both self.conv2 and self.downsample layers downsample the input when stride != 1 | |||
self.conv1 = conv1x1(inplanes, width) | |||
self.bn1 = norm_layer(width) | |||
self.conv2 = conv3x3(width, width, stride, groups, dilation) | |||
self.bn2 = norm_layer(width) | |||
self.conv3 = conv1x1(width, planes * self.expansion) | |||
self.bn3 = norm_layer(planes * self.expansion) | |||
self.relu = nn.ReLU(inplace=True) | |||
self.downsample = downsample | |||
self.stride = stride | |||
def forward(self, x): | |||
identity = x | |||
out = self.conv1(x) | |||
out = self.bn1(out) | |||
out = self.relu(out) | |||
out = self.conv2(out) | |||
out = self.bn2(out) | |||
out = self.relu(out) | |||
out = self.conv3(out) | |||
out = self.bn3(out) | |||
if self.downsample is not None: | |||
identity = self.downsample(x) | |||
out += identity | |||
out = self.relu(out) | |||
return out | |||
class ResNet(nn.Module): | |||
def __init__(self, | |||
depth=50, | |||
groups=1, | |||
width_per_group=64, | |||
replace_stride_with_dilation=None, | |||
norm_layer=None, | |||
inplanes=64, | |||
shrink_ch_ratio=1): | |||
super(ResNet, self).__init__() | |||
if norm_layer is None: | |||
norm_layer = nn.BatchNorm2d | |||
self._norm_layer = norm_layer | |||
if depth == 50: | |||
block = Bottleneck | |||
layers = [3, 4, 6, 3] | |||
elif depth == 101: | |||
block = Bottleneck | |||
layers = [3, 4, 23, 3] | |||
elif depth == 152: | |||
block = Bottleneck | |||
layers = [3, 4, 36, 3] | |||
elif depth == 18: | |||
block = BasicBlock | |||
layers = [2, 2, 2, 2] | |||
else: | |||
raise ValueError('only support depth in [18, 50, 101, 152]') | |||
shrink_input_ch = int(inplanes * shrink_ch_ratio) | |||
self.inplanes = int(inplanes * shrink_ch_ratio) | |||
if shrink_ch_ratio == 0.125: | |||
layers = [2, 3, 3, 3] | |||
self.dilation = 1 | |||
if replace_stride_with_dilation is None: | |||
# each element in the tuple indicates if we should replace | |||
# the 2x2 stride with a dilated convolution instead | |||
replace_stride_with_dilation = [False, False, False] | |||
if len(replace_stride_with_dilation) != 3: | |||
raise ValueError('replace_stride_with_dilation should be None ' | |||
'or a 3-element tuple, got {}'.format( | |||
replace_stride_with_dilation)) | |||
self.groups = groups | |||
self.base_width = width_per_group | |||
self.conv1 = nn.Conv2d( | |||
3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) | |||
self.bn1 = norm_layer(self.inplanes) | |||
self.relu = nn.ReLU(inplace=True) | |||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) | |||
self.layer1 = self._make_layer(block, shrink_input_ch, layers[0]) | |||
self.layer2 = self._make_layer( | |||
block, | |||
shrink_input_ch * 2, | |||
layers[1], | |||
stride=2, | |||
dilate=replace_stride_with_dilation[0]) | |||
self.layer3 = self._make_layer( | |||
block, | |||
shrink_input_ch * 4, | |||
layers[2], | |||
stride=2, | |||
dilate=replace_stride_with_dilation[1]) | |||
self.layer4 = self._make_layer( | |||
block, | |||
shrink_input_ch * 8, | |||
layers[3], | |||
stride=2, | |||
dilate=replace_stride_with_dilation[2]) | |||
def _make_layer(self, block, planes, blocks, stride=1, dilate=False): | |||
norm_layer = self._norm_layer | |||
downsample = None | |||
previous_dilation = self.dilation | |||
if dilate: | |||
self.dilation *= stride | |||
stride = 1 | |||
if stride != 1 or self.inplanes != planes * block.expansion: | |||
downsample = nn.Sequential( | |||
conv1x1(self.inplanes, planes * block.expansion, stride), | |||
norm_layer(planes * block.expansion), | |||
) | |||
layers = [] | |||
layers.append( | |||
block(self.inplanes, planes, stride, downsample, self.groups, | |||
self.base_width, previous_dilation, norm_layer)) | |||
self.inplanes = planes * block.expansion | |||
for _ in range(1, blocks): | |||
layers.append( | |||
block( | |||
self.inplanes, | |||
planes, | |||
groups=self.groups, | |||
base_width=self.base_width, | |||
dilation=self.dilation, | |||
norm_layer=norm_layer)) | |||
return nn.Sequential(*layers) | |||
def forward(self, x): | |||
x = self.conv1(x) | |||
x = self.bn1(x) | |||
x = self.relu(x) | |||
x = self.maxpool(x) | |||
four_conv_layer = [] | |||
x = self.layer1(x) | |||
four_conv_layer.append(x) | |||
x = self.layer2(x) | |||
four_conv_layer.append(x) | |||
x = self.layer3(x) | |||
four_conv_layer.append(x) | |||
x = self.layer4(x) | |||
four_conv_layer.append(x) | |||
return four_conv_layer |
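A small usage sketch for the backbone above (assuming ResNet is in scope): unlike the stock torchvision model it returns the four stage outputs instead of logits, which is what the detection neck consumes.

```python
import torch

backbone = ResNet(depth=50)
feats = backbone(torch.randn(1, 3, 224, 224))
print([tuple(f.shape) for f in feats])
# [(1, 256, 56, 56), (1, 512, 28, 28), (1, 1024, 14, 14), (1, 2048, 7, 7)]
```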
@@ -0,0 +1,212 @@ | |||
# Modified from https://github.com/biubug6/Pytorch_Retinaface | |||
import math | |||
from itertools import product as product | |||
from math import ceil | |||
import numpy as np | |||
import torch | |||
def transform_anchor(anchors): | |||
""" | |||
    from corner form [x0, y0, x1, y1] to center form [c_x, c_y, w, h] | |||
x1 = x0 + w - 1 | |||
c_x = (x0 + x1) / 2 = (2x0 + w - 1) / 2 = x0 + (w - 1) / 2 | |||
""" | |||
return np.concatenate(((anchors[:, :2] + anchors[:, 2:]) / 2, | |||
anchors[:, 2:] - anchors[:, :2] + 1), | |||
axis=1) | |||
def normalize_anchor(anchors): | |||
""" | |||
    from center form [c_x, c_y, w, h] to corner form [x0, y0, x1, y1] | |||
""" | |||
item_1 = anchors[:, :2] - (anchors[:, 2:] - 1) / 2 | |||
item_2 = anchors[:, :2] + (anchors[:, 2:] - 1) / 2 | |||
return np.concatenate((item_1, item_2), axis=1) | |||
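A quick numeric check of the two conversions above (a standalone sketch; the arithmetic simply mirrors transform_anchor and normalize_anchor): a 16-pixel box spanning x0=0 to x1=15 has center 7.5, and converting back recovers the original corners.

```python
import numpy as np

# One corner-form anchor [x0, y0, x1, y1]: a 16x16 box at the origin.
corners = np.array([[0., 0., 15., 15.]])

# Corner form -> center form, as in transform_anchor.
center = np.concatenate(((corners[:, :2] + corners[:, 2:]) / 2,   # (0 + 15) / 2 = 7.5
                         corners[:, 2:] - corners[:, :2] + 1),    # 15 - 0 + 1 = 16
                        axis=1)
print(center)   # [[ 7.5  7.5 16.  16. ]]

# Center form -> corner form, as in normalize_anchor.
back = np.concatenate((center[:, :2] - (center[:, 2:] - 1) / 2,
                       center[:, :2] + (center[:, 2:] - 1) / 2),
                      axis=1)
print(back)     # [[ 0.  0. 15. 15.]]
```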
class MogPriorBox(object): | |||
""" | |||
    works both for an FPN (one entry per stride) and for a single layer; the | |||
    single-layer path still needs testing | |||
    returns (np.ndarray) of shape [num_anchors, 4] in center form [c_x, c_y, w, h] | |||
""" | |||
def __init__(self, | |||
scale_list=[1.], | |||
aspect_ratio_list=[1.0], | |||
stride_list=[4, 8, 16, 32, 64, 128], | |||
anchor_size_list=[16, 32, 64, 128, 256, 512]): | |||
self.scale_list = scale_list | |||
self.aspect_ratio_list = aspect_ratio_list | |||
self.stride_list = stride_list | |||
self.anchor_size_list = anchor_size_list | |||
def __call__(self, img_height, img_width): | |||
final_anchor_list = [] | |||
for idx, stride in enumerate(self.stride_list): | |||
anchor_list = [] | |||
cur_img_height = img_height | |||
cur_img_width = img_width | |||
tmp_stride = stride | |||
while tmp_stride != 1: | |||
tmp_stride = tmp_stride // 2 | |||
cur_img_height = (cur_img_height + 1) // 2 | |||
cur_img_width = (cur_img_width + 1) // 2 | |||
for i in range(cur_img_height): | |||
for j in range(cur_img_width): | |||
for scale in self.scale_list: | |||
cx = (j + 0.5) * stride | |||
cy = (i + 0.5) * stride | |||
side_x = self.anchor_size_list[idx] * scale | |||
side_y = self.anchor_size_list[idx] * scale | |||
for ratio in self.aspect_ratio_list: | |||
anchor_list.append([ | |||
cx, cy, side_x / math.sqrt(ratio), | |||
side_y * math.sqrt(ratio) | |||
]) | |||
final_anchor_list.append(anchor_list) | |||
final_anchor_arr = np.concatenate(final_anchor_list, axis=0) | |||
normalized_anchor_arr = normalize_anchor(final_anchor_arr).astype( | |||
'float32') | |||
transformed_anchor = transform_anchor(normalized_anchor_arr) | |||
return transformed_anchor | |||
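To make the loop above concrete, a small sketch (illustrative values, not from the repo) of how many anchors the default configuration yields for a 640x640 input; each stride contributes one anchor per output cell because the default scale and aspect-ratio lists both have a single entry.

```python
# Per-level anchor counts for a 640x640 input with the default strides.
img_h = img_w = 640
strides = [4, 8, 16, 32, 64, 128]

per_level = []
for stride in strides:
    h, w, s = img_h, img_w, stride
    while s != 1:                 # same rounded halving as MogPriorBox.__call__
        s //= 2
        h = (h + 1) // 2
        w = (w + 1) // 2
    per_level.append(h * w)

print(per_level)                  # [25600, 6400, 1600, 400, 100, 25]
print(sum(per_level))             # 34125 -> MogPriorBox()(640, 640).shape == (34125, 4)
```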
class PriorBox(object): | |||
def __init__(self, cfg, image_size=None, phase='train'): | |||
super(PriorBox, self).__init__() | |||
self.min_sizes = cfg['min_sizes'] | |||
self.steps = cfg['steps'] | |||
self.clip = cfg['clip'] | |||
self.image_size = image_size | |||
self.feature_maps = [[ | |||
ceil(self.image_size[0] / step), | |||
ceil(self.image_size[1] / step) | |||
] for step in self.steps] | |||
self.name = 's' | |||
def forward(self): | |||
anchors = [] | |||
for k, f in enumerate(self.feature_maps): | |||
min_sizes = self.min_sizes[k] | |||
for i, j in product(range(f[0]), range(f[1])): | |||
for min_size in min_sizes: | |||
s_kx = min_size / self.image_size[1] | |||
s_ky = min_size / self.image_size[0] | |||
dense_cx = [ | |||
x * self.steps[k] / self.image_size[1] | |||
for x in [j + 0.5] | |||
] | |||
dense_cy = [ | |||
y * self.steps[k] / self.image_size[0] | |||
for y in [i + 0.5] | |||
] | |||
for cy, cx in product(dense_cy, dense_cx): | |||
anchors += [cx, cy, s_kx, s_ky] | |||
# back to torch land | |||
output = torch.Tensor(anchors).view(-1, 4) | |||
if self.clip: | |||
output.clamp_(max=1, min=0) | |||
return output | |||
def py_cpu_nms(dets, thresh): | |||
"""Pure Python NMS baseline.""" | |||
x1 = dets[:, 0] | |||
y1 = dets[:, 1] | |||
x2 = dets[:, 2] | |||
y2 = dets[:, 3] | |||
scores = dets[:, 4] | |||
areas = (x2 - x1 + 1) * (y2 - y1 + 1) | |||
order = scores.argsort()[::-1] | |||
keep = [] | |||
while order.size > 0: | |||
i = order[0] | |||
keep.append(i) | |||
xx1 = np.maximum(x1[i], x1[order[1:]]) | |||
yy1 = np.maximum(y1[i], y1[order[1:]]) | |||
xx2 = np.minimum(x2[i], x2[order[1:]]) | |||
yy2 = np.minimum(y2[i], y2[order[1:]]) | |||
w = np.maximum(0.0, xx2 - xx1 + 1) | |||
h = np.maximum(0.0, yy2 - yy1 + 1) | |||
inter = w * h | |||
ovr = inter / (areas[i] + areas[order[1:]] - inter) | |||
inds = np.where(ovr <= thresh)[0] | |||
order = order[inds + 1] | |||
return keep | |||
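A minimal usage sketch of the NMS baseline above, assuming py_cpu_nms is in scope: two heavily overlapping boxes collapse to the higher-scored one.

```python
import numpy as np

dets = np.array([
    [10., 10., 50., 50., 0.9],      # kept (highest score)
    [12., 12., 52., 52., 0.8],      # IoU with the first box is ~0.83 -> suppressed
    [100., 100., 140., 140., 0.7],  # disjoint -> kept
])
print(py_cpu_nms(dets, thresh=0.5))   # keeps indices 0 and 2
```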
def mogdecode(loc, anchors): | |||
""" | |||
loc: torch.Tensor | |||
anchors: 2-d, torch.Tensor (cx, cy, w, h) | |||
boxes: 2-d, torch.Tensor (x0, y0, x1, y1) | |||
""" | |||
boxes = torch.cat((anchors[:, :2] + loc[:, :2] * anchors[:, 2:], | |||
anchors[:, 2:] * torch.exp(loc[:, 2:])), 1) | |||
boxes[:, 0] -= (boxes[:, 2] - 1) / 2 | |||
boxes[:, 1] -= (boxes[:, 3] - 1) / 2 | |||
boxes[:, 2] += boxes[:, 0] - 1 | |||
boxes[:, 3] += boxes[:, 1] - 1 | |||
return boxes | |||
# Adapted from https://github.com/Hakuyume/chainer-ssd | |||
def decode(loc, priors, variances): | |||
"""Decode locations from predictions using priors to undo | |||
the encoding we did for offset regression at train time. | |||
Args: | |||
loc (tensor): location predictions for loc layers, | |||
Shape: [num_priors,4] | |||
priors (tensor): Prior boxes in center-offset form. | |||
Shape: [num_priors,4]. | |||
variances: (list[float]) Variances of priorboxes | |||
Return: | |||
decoded bounding box predictions | |||
""" | |||
boxes = torch.cat( | |||
(priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], | |||
priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) | |||
boxes[:, :2] -= boxes[:, 2:] / 2 | |||
boxes[:, 2:] += boxes[:, :2] | |||
return boxes | |||
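A small worked check of decode, assuming the function above is in scope and using SSD-style variances of [0.1, 0.2]: a zero offset reproduces the prior in corner form, and a unit x-offset shifts the center by center_variance * prior_width.

```python
import torch

priors = torch.tensor([[0.5, 0.5, 0.2, 0.2]])   # (cx, cy, w, h), normalized coordinates
variances = [0.1, 0.2]

print(decode(torch.zeros(1, 4), priors, variances))
# ~ [[0.40, 0.40, 0.60, 0.60]]

loc = torch.tensor([[1.0, 0.0, 0.0, 0.0]])      # shifts cx by 1.0 * 0.1 * 0.2 = 0.02
print(decode(loc, priors, variances))
# ~ [[0.42, 0.40, 0.62, 0.60]]
```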
def decode_landm(pre, priors, variances): | |||
"""Decode landm from predictions using priors to undo | |||
the encoding we did for offset regression at train time. | |||
Args: | |||
pre (tensor): landm predictions for loc layers, | |||
Shape: [num_priors,10] | |||
priors (tensor): Prior boxes in center-offset form. | |||
Shape: [num_priors,4]. | |||
variances: (list[float]) Variances of priorboxes | |||
Return: | |||
decoded landm predictions | |||
""" | |||
a = priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:] | |||
b = priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:] | |||
c = priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:] | |||
d = priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:] | |||
e = priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:] | |||
landms = torch.cat((a, b, c, d, e), dim=1) | |||
return landms |
@@ -0,0 +1 @@ | |||
from .models.detector import MtcnnFaceDetector |
@@ -0,0 +1,240 @@ | |||
# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch | |||
import numpy as np | |||
from PIL import Image | |||
def nms(boxes, overlap_threshold=0.5, mode='union'): | |||
"""Non-maximum suppression. | |||
Arguments: | |||
boxes: a float numpy array of shape [n, 5], | |||
where each row is (xmin, ymin, xmax, ymax, score). | |||
overlap_threshold: a float number. | |||
mode: 'union' or 'min'. | |||
Returns: | |||
list with indices of the selected boxes | |||
""" | |||
# if there are no boxes, return the empty list | |||
if len(boxes) == 0: | |||
return [] | |||
# list of picked indices | |||
pick = [] | |||
# grab the coordinates of the bounding boxes | |||
x1, y1, x2, y2, score = [boxes[:, i] for i in range(5)] | |||
area = (x2 - x1 + 1.0) * (y2 - y1 + 1.0) | |||
ids = np.argsort(score) # in increasing order | |||
while len(ids) > 0: | |||
# grab index of the largest value | |||
last = len(ids) - 1 | |||
i = ids[last] | |||
pick.append(i) | |||
# compute intersections | |||
# of the box with the largest score | |||
# with the rest of boxes | |||
# left top corner of intersection boxes | |||
ix1 = np.maximum(x1[i], x1[ids[:last]]) | |||
iy1 = np.maximum(y1[i], y1[ids[:last]]) | |||
# right bottom corner of intersection boxes | |||
ix2 = np.minimum(x2[i], x2[ids[:last]]) | |||
iy2 = np.minimum(y2[i], y2[ids[:last]]) | |||
# width and height of intersection boxes | |||
w = np.maximum(0.0, ix2 - ix1 + 1.0) | |||
h = np.maximum(0.0, iy2 - iy1 + 1.0) | |||
# intersections' areas | |||
inter = w * h | |||
if mode == 'min': | |||
overlap = inter / np.minimum(area[i], area[ids[:last]]) | |||
elif mode == 'union': | |||
# intersection over union (IoU) | |||
overlap = inter / (area[i] + area[ids[:last]] - inter) | |||
# delete all boxes where overlap is too big | |||
ids = np.delete( | |||
ids, | |||
np.concatenate([[last], | |||
np.where(overlap > overlap_threshold)[0]])) | |||
return pick | |||
def convert_to_square(bboxes): | |||
"""Convert bounding boxes to a square form. | |||
Arguments: | |||
bboxes: a float numpy array of shape [n, 5]. | |||
Returns: | |||
a float numpy array of shape [n, 5], | |||
squared bounding boxes. | |||
""" | |||
square_bboxes = np.zeros_like(bboxes) | |||
x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] | |||
h = y2 - y1 + 1.0 | |||
w = x2 - x1 + 1.0 | |||
max_side = np.maximum(h, w) | |||
square_bboxes[:, 0] = x1 + w * 0.5 - max_side * 0.5 | |||
square_bboxes[:, 1] = y1 + h * 0.5 - max_side * 0.5 | |||
square_bboxes[:, 2] = square_bboxes[:, 0] + max_side - 1.0 | |||
square_bboxes[:, 3] = square_bboxes[:, 1] + max_side - 1.0 | |||
return square_bboxes | |||
def calibrate_box(bboxes, offsets): | |||
"""Transform bounding boxes to be more like true bounding boxes. | |||
'offsets' is one of the outputs of the nets. | |||
Arguments: | |||
bboxes: a float numpy array of shape [n, 5]. | |||
offsets: a float numpy array of shape [n, 4]. | |||
Returns: | |||
a float numpy array of shape [n, 5]. | |||
""" | |||
x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] | |||
w = x2 - x1 + 1.0 | |||
h = y2 - y1 + 1.0 | |||
w = np.expand_dims(w, 1) | |||
h = np.expand_dims(h, 1) | |||
    # this is what is happening here: | |||
# tx1, ty1, tx2, ty2 = [offsets[:, i] for i in range(4)] | |||
# x1_true = x1 + tx1*w | |||
# y1_true = y1 + ty1*h | |||
# x2_true = x2 + tx2*w | |||
# y2_true = y2 + ty2*h | |||
# below is just more compact form of this | |||
# are offsets always such that | |||
# x1 < x2 and y1 < y2 ? | |||
translation = np.hstack([w, h, w, h]) * offsets | |||
bboxes[:, 0:4] = bboxes[:, 0:4] + translation | |||
return bboxes | |||
def get_image_boxes(bounding_boxes, img, size=24): | |||
"""Cut out boxes from the image. | |||
Arguments: | |||
bounding_boxes: a float numpy array of shape [n, 5]. | |||
img: an instance of PIL.Image. | |||
size: an integer, size of cutouts. | |||
Returns: | |||
a float numpy array of shape [n, 3, size, size]. | |||
""" | |||
num_boxes = len(bounding_boxes) | |||
width, height = img.size | |||
[dy, edy, dx, edx, y, ey, x, ex, w, | |||
h] = correct_bboxes(bounding_boxes, width, height) | |||
img_boxes = np.zeros((num_boxes, 3, size, size), 'float32') | |||
for i in range(num_boxes): | |||
img_box = np.zeros((h[i], w[i], 3), 'uint8') | |||
img_array = np.asarray(img, 'uint8') | |||
img_box[dy[i]:(edy[i] + 1), dx[i]:(edx[i] + 1), :] =\ | |||
img_array[y[i]:(ey[i] + 1), x[i]:(ex[i] + 1), :] | |||
# resize | |||
img_box = Image.fromarray(img_box) | |||
img_box = img_box.resize((size, size), Image.BILINEAR) | |||
img_box = np.asarray(img_box, 'float32') | |||
img_boxes[i, :, :, :] = _preprocess(img_box) | |||
return img_boxes | |||
def correct_bboxes(bboxes, width, height): | |||
"""Crop boxes that are too big and get coordinates | |||
with respect to cutouts. | |||
Arguments: | |||
bboxes: a float numpy array of shape [n, 5], | |||
where each row is (xmin, ymin, xmax, ymax, score). | |||
width: a float number. | |||
height: a float number. | |||
Returns: | |||
dy, dx, edy, edx: a int numpy arrays of shape [n], | |||
coordinates of the boxes with respect to the cutouts. | |||
        y, x, ey, ex: int numpy arrays of shape [n], | |||
corrected ymin, xmin, ymax, xmax. | |||
        h, w: int numpy arrays of shape [n], | |||
just heights and widths of boxes. | |||
in the following order: | |||
[dy, edy, dx, edx, y, ey, x, ex, w, h]. | |||
""" | |||
x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)] | |||
w, h = x2 - x1 + 1.0, y2 - y1 + 1.0 | |||
num_boxes = bboxes.shape[0] | |||
# 'e' stands for end | |||
# (x, y) -> (ex, ey) | |||
x, y, ex, ey = x1, y1, x2, y2 | |||
# we need to cut out a box from the image. | |||
# (x, y, ex, ey) are corrected coordinates of the box | |||
# in the image. | |||
# (dx, dy, edx, edy) are coordinates of the box in the cutout | |||
# from the image. | |||
dx, dy = np.zeros((num_boxes, )), np.zeros((num_boxes, )) | |||
edx, edy = w.copy() - 1.0, h.copy() - 1.0 | |||
# if box's bottom right corner is too far right | |||
ind = np.where(ex > width - 1.0)[0] | |||
edx[ind] = w[ind] + width - 2.0 - ex[ind] | |||
ex[ind] = width - 1.0 | |||
# if box's bottom right corner is too low | |||
ind = np.where(ey > height - 1.0)[0] | |||
edy[ind] = h[ind] + height - 2.0 - ey[ind] | |||
ey[ind] = height - 1.0 | |||
# if box's top left corner is too far left | |||
ind = np.where(x < 0.0)[0] | |||
dx[ind] = 0.0 - x[ind] | |||
x[ind] = 0.0 | |||
# if box's top left corner is too high | |||
ind = np.where(y < 0.0)[0] | |||
dy[ind] = 0.0 - y[ind] | |||
y[ind] = 0.0 | |||
return_list = [dy, edy, dx, edx, y, ey, x, ex, w, h] | |||
return_list = [i.astype('int32') for i in return_list] | |||
return return_list | |||
def _preprocess(img): | |||
"""Preprocessing step before feeding the network. | |||
Arguments: | |||
img: a float numpy array of shape [h, w, c]. | |||
Returns: | |||
a float numpy array of shape [1, c, h, w]. | |||
""" | |||
img = img.transpose((2, 0, 1)) | |||
img = np.expand_dims(img, 0) | |||
img = (img - 127.5) * 0.0078125 | |||
return img |
@@ -0,0 +1,149 @@ | |||
# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch | |||
import os | |||
import numpy as np | |||
import torch | |||
import torch.backends.cudnn as cudnn | |||
from PIL import Image | |||
from torch.autograd import Variable | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.constant import Tasks | |||
from .box_utils import calibrate_box, convert_to_square, get_image_boxes, nms | |||
from .first_stage import run_first_stage | |||
from .get_nets import ONet, PNet, RNet | |||
@MODELS.register_module(Tasks.face_detection, module_name=Models.mtcnn) | |||
class MtcnnFaceDetector(TorchModel): | |||
def __init__(self, model_path, device='cuda'): | |||
super().__init__(model_path) | |||
torch.set_grad_enabled(False) | |||
cudnn.benchmark = True | |||
self.model_path = model_path | |||
self.device = device | |||
self.pnet = PNet(model_path=os.path.join(self.model_path, 'pnet.npy')) | |||
self.rnet = RNet(model_path=os.path.join(self.model_path, 'rnet.npy')) | |||
self.onet = ONet(model_path=os.path.join(self.model_path, 'onet.npy')) | |||
self.pnet = self.pnet.to(device) | |||
self.rnet = self.rnet.to(device) | |||
self.onet = self.onet.to(device) | |||
def forward(self, input): | |||
image = Image.fromarray(np.uint8(input['img'].cpu().numpy())) | |||
pnet = self.pnet | |||
rnet = self.rnet | |||
onet = self.onet | |||
onet.eval() | |||
min_face_size = 20.0 | |||
thresholds = [0.7, 0.8, 0.9] | |||
nms_thresholds = [0.7, 0.7, 0.7] | |||
# BUILD AN IMAGE PYRAMID | |||
width, height = image.size | |||
min_length = min(height, width) | |||
min_detection_size = 12 | |||
factor = 0.707 # sqrt(0.5) | |||
# scales for scaling the image | |||
scales = [] | |||
m = min_detection_size / min_face_size | |||
min_length *= m | |||
factor_count = 0 | |||
while min_length > min_detection_size: | |||
scales.append(m * factor**factor_count) | |||
min_length *= factor | |||
factor_count += 1 | |||
# STAGE 1 | |||
# it will be returned | |||
bounding_boxes = [] | |||
# run P-Net on different scales | |||
for s in scales: | |||
boxes = run_first_stage( | |||
image, | |||
pnet, | |||
scale=s, | |||
threshold=thresholds[0], | |||
device=self.device) | |||
bounding_boxes.append(boxes) | |||
# collect boxes (and offsets, and scores) from different scales | |||
bounding_boxes = [i for i in bounding_boxes if i is not None] | |||
bounding_boxes = np.vstack(bounding_boxes) | |||
keep = nms(bounding_boxes[:, 0:5], nms_thresholds[0]) | |||
bounding_boxes = bounding_boxes[keep] | |||
# use offsets predicted by pnet to transform bounding boxes | |||
bounding_boxes = calibrate_box(bounding_boxes[:, 0:5], | |||
bounding_boxes[:, 5:]) | |||
# shape [n_boxes, 5] | |||
bounding_boxes = convert_to_square(bounding_boxes) | |||
bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4]) | |||
# STAGE 2 | |||
img_boxes = get_image_boxes(bounding_boxes, image, size=24) | |||
img_boxes = Variable(torch.FloatTensor(img_boxes), volatile=True) | |||
output = rnet(img_boxes.to(self.device)) | |||
offsets = output[0].cpu().data.numpy() # shape [n_boxes, 4] | |||
probs = output[1].cpu().data.numpy() # shape [n_boxes, 2] | |||
keep = np.where(probs[:, 1] > thresholds[1])[0] | |||
bounding_boxes = bounding_boxes[keep] | |||
bounding_boxes[:, 4] = probs[keep, 1].reshape((-1, )) | |||
offsets = offsets[keep] | |||
keep = nms(bounding_boxes, nms_thresholds[1]) | |||
bounding_boxes = bounding_boxes[keep] | |||
bounding_boxes = calibrate_box(bounding_boxes, offsets[keep]) | |||
bounding_boxes = convert_to_square(bounding_boxes) | |||
bounding_boxes[:, 0:4] = np.round(bounding_boxes[:, 0:4]) | |||
# STAGE 3 | |||
img_boxes = get_image_boxes(bounding_boxes, image, size=48) | |||
if len(img_boxes) == 0: | |||
return [], [] | |||
img_boxes = Variable(torch.FloatTensor(img_boxes), volatile=True) | |||
output = onet(img_boxes.to(self.device)) | |||
landmarks = output[0].cpu().data.numpy() # shape [n_boxes, 10] | |||
offsets = output[1].cpu().data.numpy() # shape [n_boxes, 4] | |||
probs = output[2].cpu().data.numpy() # shape [n_boxes, 2] | |||
keep = np.where(probs[:, 1] > thresholds[2])[0] | |||
bounding_boxes = bounding_boxes[keep] | |||
bounding_boxes[:, 4] = probs[keep, 1].reshape((-1, )) | |||
offsets = offsets[keep] | |||
landmarks = landmarks[keep] | |||
# compute landmark points | |||
width = bounding_boxes[:, 2] - bounding_boxes[:, 0] + 1.0 | |||
height = bounding_boxes[:, 3] - bounding_boxes[:, 1] + 1.0 | |||
xmin, ymin = bounding_boxes[:, 0], bounding_boxes[:, 1] | |||
landmarks[:, 0:5] = np.expand_dims( | |||
xmin, 1) + np.expand_dims(width, 1) * landmarks[:, 0:5] | |||
landmarks[:, 5:10] = np.expand_dims( | |||
ymin, 1) + np.expand_dims(height, 1) * landmarks[:, 5:10] | |||
bounding_boxes = calibrate_box(bounding_boxes, offsets) | |||
keep = nms(bounding_boxes, nms_thresholds[2], mode='min') | |||
bounding_boxes = bounding_boxes[keep] | |||
landmarks = landmarks[keep] | |||
landmarks = landmarks.reshape(-1, 2, 5).transpose( | |||
(0, 2, 1)).reshape(-1, 10) | |||
return bounding_boxes, landmarks |
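A hypothetical end-to-end sketch of driving the three-stage cascade above directly; the weight directory and image path are placeholders, and calling forward by hand bypasses whatever preprocessing a surrounding pipeline would normally perform.

```python
import numpy as np
import torch
from PIL import Image

# 'pnet.npy', 'rnet.npy' and 'onet.npy' are expected inside the model directory.
detector = MtcnnFaceDetector('/path/to/mtcnn_weights', device='cpu')

img = torch.from_numpy(np.array(Image.open('face.jpg').convert('RGB')))
boxes, landmarks = detector.forward({'img': img})
# boxes:     float array [n, 5], rows are (x1, y1, x2, y2, score)
# landmarks: float array [n, 10], five (x, y) points per face
```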
@@ -0,0 +1,100 @@ | |||
# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch | |||
import math | |||
import numpy as np | |||
import torch | |||
from PIL import Image | |||
from torch.autograd import Variable | |||
from .box_utils import _preprocess, nms | |||
def run_first_stage(image, net, scale, threshold, device='cuda'): | |||
"""Run P-Net, generate bounding boxes, and do NMS. | |||
Arguments: | |||
image: an instance of PIL.Image. | |||
net: an instance of pytorch's nn.Module, P-Net. | |||
scale: a float number, | |||
scale width and height of the image by this number. | |||
threshold: a float number, | |||
threshold on the probability of a face when generating | |||
bounding boxes from predictions of the net. | |||
Returns: | |||
a float numpy array of shape [n_boxes, 9], | |||
bounding boxes with scores and offsets (4 + 1 + 4). | |||
""" | |||
# scale the image and convert it to a float array | |||
width, height = image.size | |||
sw, sh = math.ceil(width * scale), math.ceil(height * scale) | |||
img = image.resize((sw, sh), Image.BILINEAR) | |||
img = np.asarray(img, 'float32') | |||
img = Variable( | |||
torch.FloatTensor(_preprocess(img)), volatile=True).to(device) | |||
output = net(img) | |||
probs = output[1].cpu().data.numpy()[0, 1, :, :] | |||
offsets = output[0].cpu().data.numpy() | |||
# probs: probability of a face at each sliding window | |||
# offsets: transformations to true bounding boxes | |||
boxes = _generate_bboxes(probs, offsets, scale, threshold) | |||
if len(boxes) == 0: | |||
return None | |||
keep = nms(boxes[:, 0:5], overlap_threshold=0.5) | |||
return boxes[keep] | |||
def _generate_bboxes(probs, offsets, scale, threshold): | |||
"""Generate bounding boxes at places | |||
where there is probably a face. | |||
Arguments: | |||
probs: a float numpy array of shape [n, m]. | |||
offsets: a float numpy array of shape [1, 4, n, m]. | |||
scale: a float number, | |||
width and height of the image were scaled by this number. | |||
threshold: a float number. | |||
Returns: | |||
a float numpy array of shape [n_boxes, 9] | |||
""" | |||
# applying P-Net is equivalent, in some sense, to | |||
    # moving a 12x12 window with stride 2 | |||
stride = 2 | |||
cell_size = 12 | |||
# indices of boxes where there is probably a face | |||
inds = np.where(probs > threshold) | |||
if inds[0].size == 0: | |||
return np.array([]) | |||
# transformations of bounding boxes | |||
tx1, ty1, tx2, ty2 = [offsets[0, i, inds[0], inds[1]] for i in range(4)] | |||
# they are defined as: | |||
# w = x2 - x1 + 1 | |||
# h = y2 - y1 + 1 | |||
# x1_true = x1 + tx1*w | |||
# x2_true = x2 + tx2*w | |||
# y1_true = y1 + ty1*h | |||
# y2_true = y2 + ty2*h | |||
offsets = np.array([tx1, ty1, tx2, ty2]) | |||
score = probs[inds[0], inds[1]] | |||
# P-Net is applied to scaled images | |||
# so we need to rescale bounding boxes back | |||
bounding_boxes = np.vstack([ | |||
np.round((stride * inds[1] + 1.0) / scale), | |||
np.round((stride * inds[0] + 1.0) / scale), | |||
np.round((stride * inds[1] + 1.0 + cell_size) / scale), | |||
np.round((stride * inds[0] + 1.0 + cell_size) / scale), score, offsets | |||
]) | |||
    # why is one added? | |||
return bounding_boxes.T |
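To make the coordinate mapping above concrete, a worked example with illustrative values: a hit at output row i=3, column j=10 on an image that was scaled by 0.5 maps back to a roughly 24x24 window in the original image.

```python
import numpy as np

stride, cell_size, scale = 2, 12, 0.5
i, j = 3, 10                                             # (row, col) of a detection

x1 = np.round((stride * j + 1.0) / scale)                # 42.0
y1 = np.round((stride * i + 1.0) / scale)                # 14.0
x2 = np.round((stride * j + 1.0 + cell_size) / scale)    # 66.0
y2 = np.round((stride * i + 1.0 + cell_size) / scale)    # 38.0
# The 12x12 P-Net window covers 12 / 0.5 = 24 pixels of the original image.
```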
@@ -0,0 +1,160 @@ | |||
# The implementation is based on mtcnn, available at https://github.com/TropComplique/mtcnn-pytorch | |||
from collections import OrderedDict | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
class Flatten(nn.Module): | |||
def __init__(self): | |||
super(Flatten, self).__init__() | |||
def forward(self, x): | |||
""" | |||
Arguments: | |||
x: a float tensor with shape [batch_size, c, h, w]. | |||
Returns: | |||
a float tensor with shape [batch_size, c*h*w]. | |||
""" | |||
        # without this the pretrained model doesn't work | |||
x = x.transpose(3, 2).contiguous() | |||
return x.view(x.size(0), -1) | |||
class PNet(nn.Module): | |||
def __init__(self, model_path=None): | |||
super(PNet, self).__init__() | |||
# suppose we have input with size HxW, then | |||
# after first layer: H - 2, | |||
# after pool: ceil((H - 2)/2), | |||
# after second conv: ceil((H - 2)/2) - 2, | |||
# after last conv: ceil((H - 2)/2) - 4, | |||
# and the same for W | |||
self.features = nn.Sequential( | |||
OrderedDict([('conv1', nn.Conv2d(3, 10, 3, 1)), | |||
('prelu1', nn.PReLU(10)), | |||
('pool1', nn.MaxPool2d(2, 2, ceil_mode=True)), | |||
('conv2', nn.Conv2d(10, 16, 3, 1)), | |||
('prelu2', nn.PReLU(16)), | |||
('conv3', nn.Conv2d(16, 32, 3, 1)), | |||
('prelu3', nn.PReLU(32))])) | |||
self.conv4_1 = nn.Conv2d(32, 2, 1, 1) | |||
self.conv4_2 = nn.Conv2d(32, 4, 1, 1) | |||
weights = np.load(model_path, allow_pickle=True)[()] | |||
for n, p in self.named_parameters(): | |||
p.data = torch.FloatTensor(weights[n]) | |||
def forward(self, x): | |||
""" | |||
Arguments: | |||
x: a float tensor with shape [batch_size, 3, h, w]. | |||
Returns: | |||
b: a float tensor with shape [batch_size, 4, h', w']. | |||
a: a float tensor with shape [batch_size, 2, h', w']. | |||
""" | |||
x = self.features(x) | |||
a = self.conv4_1(x) | |||
b = self.conv4_2(x) | |||
a = F.softmax(a) | |||
return b, a | |||
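Following the size arithmetic in the comment at the top of PNet, a tiny check for a 12x12 input, which is why P-Net behaves like a 12x12 sliding-window detector with stride 2 (the pooling stride):

```python
import math

h = 12
h -= 2                 # conv1, 3x3, stride 1  -> 10
h = math.ceil(h / 2)   # pool1, 2x2, ceil mode -> 5
h -= 2                 # conv2, 3x3            -> 3
h -= 2                 # conv3, 3x3            -> 1
print(h)               # 1: exactly one output position per 12x12 window
```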
class RNet(nn.Module): | |||
def __init__(self, model_path=None): | |||
super(RNet, self).__init__() | |||
self.features = nn.Sequential( | |||
OrderedDict([('conv1', nn.Conv2d(3, 28, 3, 1)), | |||
('prelu1', nn.PReLU(28)), | |||
('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)), | |||
('conv2', nn.Conv2d(28, 48, 3, 1)), | |||
('prelu2', nn.PReLU(48)), | |||
('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)), | |||
('conv3', nn.Conv2d(48, 64, 2, 1)), | |||
('prelu3', nn.PReLU(64)), ('flatten', Flatten()), | |||
('conv4', nn.Linear(576, 128)), | |||
('prelu4', nn.PReLU(128))])) | |||
self.conv5_1 = nn.Linear(128, 2) | |||
self.conv5_2 = nn.Linear(128, 4) | |||
weights = np.load(model_path, allow_pickle=True)[()] | |||
for n, p in self.named_parameters(): | |||
p.data = torch.FloatTensor(weights[n]) | |||
def forward(self, x): | |||
""" | |||
Arguments: | |||
x: a float tensor with shape [batch_size, 3, h, w]. | |||
Returns: | |||
b: a float tensor with shape [batch_size, 4]. | |||
a: a float tensor with shape [batch_size, 2]. | |||
""" | |||
x = self.features(x) | |||
a = self.conv5_1(x) | |||
b = self.conv5_2(x) | |||
a = F.softmax(a) | |||
return b, a | |||
class ONet(nn.Module): | |||
def __init__(self, model_path=None): | |||
super(ONet, self).__init__() | |||
self.features = nn.Sequential( | |||
OrderedDict([ | |||
('conv1', nn.Conv2d(3, 32, 3, 1)), | |||
('prelu1', nn.PReLU(32)), | |||
('pool1', nn.MaxPool2d(3, 2, ceil_mode=True)), | |||
('conv2', nn.Conv2d(32, 64, 3, 1)), | |||
('prelu2', nn.PReLU(64)), | |||
('pool2', nn.MaxPool2d(3, 2, ceil_mode=True)), | |||
('conv3', nn.Conv2d(64, 64, 3, 1)), | |||
('prelu3', nn.PReLU(64)), | |||
('pool3', nn.MaxPool2d(2, 2, ceil_mode=True)), | |||
('conv4', nn.Conv2d(64, 128, 2, 1)), | |||
('prelu4', nn.PReLU(128)), | |||
('flatten', Flatten()), | |||
('conv5', nn.Linear(1152, 256)), | |||
('drop5', nn.Dropout(0.25)), | |||
('prelu5', nn.PReLU(256)), | |||
])) | |||
self.conv6_1 = nn.Linear(256, 2) | |||
self.conv6_2 = nn.Linear(256, 4) | |||
self.conv6_3 = nn.Linear(256, 10) | |||
weights = np.load(model_path, allow_pickle=True)[()] | |||
for n, p in self.named_parameters(): | |||
p.data = torch.FloatTensor(weights[n]) | |||
def forward(self, x): | |||
""" | |||
Arguments: | |||
x: a float tensor with shape [batch_size, 3, h, w]. | |||
Returns: | |||
c: a float tensor with shape [batch_size, 10]. | |||
b: a float tensor with shape [batch_size, 4]. | |||
a: a float tensor with shape [batch_size, 2]. | |||
""" | |||
x = self.features(x) | |||
a = self.conv6_1(x) | |||
b = self.conv6_2(x) | |||
c = self.conv6_3(x) | |||
a = F.softmax(a) | |||
return c, b, a |
@@ -0,0 +1 @@ | |||
from .detection import UlfdFaceDetector |
@@ -0,0 +1,44 @@ | |||
# The implementation is based on ULFD, available at | |||
# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB | |||
import os | |||
import cv2 | |||
import numpy as np | |||
import torch | |||
import torch.backends.cudnn as cudnn | |||
import torch.nn.functional as F | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base import Tensor, TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from .vision.ssd.fd_config import define_img_size | |||
from .vision.ssd.mb_tiny_fd import (create_mb_tiny_fd, | |||
create_mb_tiny_fd_predictor) | |||
define_img_size(640) | |||
@MODELS.register_module(Tasks.face_detection, module_name=Models.ulfd) | |||
class UlfdFaceDetector(TorchModel): | |||
def __init__(self, model_path, device='cuda'): | |||
super().__init__(model_path) | |||
torch.set_grad_enabled(False) | |||
cudnn.benchmark = True | |||
self.model_path = model_path | |||
self.device = device | |||
self.net = create_mb_tiny_fd(2, is_test=True, device=device) | |||
self.predictor = create_mb_tiny_fd_predictor( | |||
self.net, candidate_size=1500, device=device) | |||
self.net.load(model_path) | |||
self.net = self.net.to(device) | |||
def forward(self, input): | |||
img_raw = input['img'] | |||
img = np.array(img_raw.cpu().detach()) | |||
img = img[:, :, ::-1] | |||
prob_th = 0.85 | |||
keep_top_k = 750 | |||
boxes, labels, probs = self.predictor.predict(img, keep_top_k, prob_th) | |||
return boxes, probs |
@@ -0,0 +1,124 @@ | |||
# The implementation is based on ULFD, available at | |||
# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB | |||
import math | |||
import torch | |||
def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): | |||
""" | |||
Args: | |||
box_scores (N, 5): boxes in corner-form and probabilities. | |||
iou_threshold: intersection over union threshold. | |||
top_k: keep top_k results. If k <= 0, keep all the results. | |||
candidate_size: only consider the candidates with the highest scores. | |||
Returns: | |||
        the kept rows of box_scores, a float tensor of shape [k, 5] | |||
""" | |||
scores = box_scores[:, -1] | |||
boxes = box_scores[:, :-1] | |||
picked = [] | |||
_, indexes = scores.sort(descending=True) | |||
indexes = indexes[:candidate_size] | |||
while len(indexes) > 0: | |||
current = indexes[0] | |||
picked.append(current.item()) | |||
if 0 < top_k == len(picked) or len(indexes) == 1: | |||
break | |||
current_box = boxes[current, :] | |||
indexes = indexes[1:] | |||
rest_boxes = boxes[indexes, :] | |||
iou = iou_of( | |||
rest_boxes, | |||
current_box.unsqueeze(0), | |||
) | |||
indexes = indexes[iou <= iou_threshold] | |||
return box_scores[picked, :] | |||
def nms(box_scores, | |||
nms_method=None, | |||
score_threshold=None, | |||
iou_threshold=None, | |||
sigma=0.5, | |||
top_k=-1, | |||
candidate_size=200): | |||
return hard_nms( | |||
box_scores, iou_threshold, top_k, candidate_size=candidate_size) | |||
def generate_priors(feature_map_list, | |||
shrinkage_list, | |||
image_size, | |||
min_boxes, | |||
clamp=True) -> torch.Tensor: | |||
priors = [] | |||
for index in range(0, len(feature_map_list[0])): | |||
scale_w = image_size[0] / shrinkage_list[0][index] | |||
scale_h = image_size[1] / shrinkage_list[1][index] | |||
for j in range(0, feature_map_list[1][index]): | |||
for i in range(0, feature_map_list[0][index]): | |||
x_center = (i + 0.5) / scale_w | |||
y_center = (j + 0.5) / scale_h | |||
for min_box in min_boxes[index]: | |||
w = min_box / image_size[0] | |||
h = min_box / image_size[1] | |||
priors.append([x_center, y_center, w, h]) | |||
priors = torch.tensor(priors) | |||
if clamp: | |||
torch.clamp(priors, 0.0, 1.0, out=priors) | |||
return priors | |||
def convert_locations_to_boxes(locations, priors, center_variance, | |||
size_variance): | |||
# priors can have one dimension less. | |||
if priors.dim() + 1 == locations.dim(): | |||
priors = priors.unsqueeze(0) | |||
a = locations[..., :2] * center_variance * priors[..., | |||
2:] + priors[..., :2] | |||
b = torch.exp(locations[..., 2:] * size_variance) * priors[..., 2:] | |||
return torch.cat([a, b], dim=locations.dim() - 1) | |||
def center_form_to_corner_form(locations): | |||
a = locations[..., :2] - locations[..., 2:] / 2 | |||
b = locations[..., :2] + locations[..., 2:] / 2 | |||
return torch.cat([a, b], locations.dim() - 1) | |||
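A quick check of the decoding pair above, assuming both helpers are in scope and using the center/size variances defined in fd_config (0.1 and 0.2): a zero regression output returns the prior itself, first in center form and then in corner form.

```python
import torch

priors = torch.tensor([[0.5, 0.5, 0.2, 0.2]])     # (cx, cy, w, h), normalized
locations = torch.zeros(1, 4)

centers = convert_locations_to_boxes(locations, priors, 0.1, 0.2)
print(centers)                                    # ~ [[0.5, 0.5, 0.2, 0.2]]
print(center_form_to_corner_form(centers))        # ~ [[0.4, 0.4, 0.6, 0.6]]
```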
def iou_of(boxes0, boxes1, eps=1e-5): | |||
"""Return intersection-over-union (Jaccard index) of boxes. | |||
Args: | |||
boxes0 (N, 4): ground truth boxes. | |||
boxes1 (N or 1, 4): predicted boxes. | |||
eps: a small number to avoid 0 as denominator. | |||
Returns: | |||
iou (N): IoU values. | |||
""" | |||
overlap_left_top = torch.max(boxes0[..., :2], boxes1[..., :2]) | |||
overlap_right_bottom = torch.min(boxes0[..., 2:], boxes1[..., 2:]) | |||
overlap_area = area_of(overlap_left_top, overlap_right_bottom) | |||
area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) | |||
area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) | |||
return overlap_area / (area0 + area1 - overlap_area + eps) | |||
def area_of(left_top, right_bottom) -> torch.Tensor: | |||
"""Compute the areas of rectangles given two corners. | |||
Args: | |||
left_top (N, 2): left top corner. | |||
right_bottom (N, 2): right bottom corner. | |||
Returns: | |||
area (N): return the area. | |||
""" | |||
hw = torch.clamp(right_bottom - left_top, min=0.0) | |||
return hw[..., 0] * hw[..., 1] |
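A one-box example for iou_of, assuming the two helpers above are in scope: unit-offset 2x2 boxes overlap in a 1x1 square, giving an IoU of 1/7.

```python
import torch

boxes0 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
boxes1 = torch.tensor([[1.0, 1.0, 3.0, 3.0]])
print(iou_of(boxes0, boxes1))    # ~0.1429 = 1 / (4 + 4 - 1)
```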
@@ -0,0 +1,49 @@ | |||
# The implementation is based on ULFD, available at | |||
# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
class Mb_Tiny(nn.Module): | |||
def __init__(self, num_classes=2): | |||
super(Mb_Tiny, self).__init__() | |||
self.base_channel = 8 * 2 | |||
def conv_bn(inp, oup, stride): | |||
return nn.Sequential( | |||
nn.Conv2d(inp, oup, 3, stride, 1, bias=False), | |||
nn.BatchNorm2d(oup), nn.ReLU(inplace=True)) | |||
def conv_dw(inp, oup, stride): | |||
return nn.Sequential( | |||
nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), | |||
nn.BatchNorm2d(inp), | |||
nn.ReLU(inplace=True), | |||
nn.Conv2d(inp, oup, 1, 1, 0, bias=False), | |||
nn.BatchNorm2d(oup), | |||
nn.ReLU(inplace=True), | |||
) | |||
self.model = nn.Sequential( | |||
conv_bn(3, self.base_channel, 2), # 160*120 | |||
conv_dw(self.base_channel, self.base_channel * 2, 1), | |||
conv_dw(self.base_channel * 2, self.base_channel * 2, 2), # 80*60 | |||
conv_dw(self.base_channel * 2, self.base_channel * 2, 1), | |||
conv_dw(self.base_channel * 2, self.base_channel * 4, 2), # 40*30 | |||
conv_dw(self.base_channel * 4, self.base_channel * 4, 1), | |||
conv_dw(self.base_channel * 4, self.base_channel * 4, 1), | |||
conv_dw(self.base_channel * 4, self.base_channel * 4, 1), | |||
conv_dw(self.base_channel * 4, self.base_channel * 8, 2), # 20*15 | |||
conv_dw(self.base_channel * 8, self.base_channel * 8, 1), | |||
conv_dw(self.base_channel * 8, self.base_channel * 8, 1), | |||
conv_dw(self.base_channel * 8, self.base_channel * 16, 2), # 10*8 | |||
conv_dw(self.base_channel * 16, self.base_channel * 16, 1)) | |||
self.fc = nn.Linear(1024, num_classes) | |||
def forward(self, x): | |||
x = self.model(x) | |||
x = F.avg_pool2d(x, 7) | |||
x = x.view(-1, 1024) | |||
x = self.fc(x) | |||
return x |
@@ -0,0 +1,18 @@ | |||
# The implementation is based on ULFD, available at | |||
# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB | |||
from ..transforms import Compose, Resize, SubtractMeans, ToTensor | |||
class PredictionTransform: | |||
def __init__(self, size, mean=0.0, std=1.0): | |||
self.transform = Compose([ | |||
Resize(size), | |||
SubtractMeans(mean), lambda img, boxes=None, labels=None: | |||
(img / std, boxes, labels), | |||
ToTensor() | |||
]) | |||
def __call__(self, image): | |||
image, _, _ = self.transform(image) | |||
return image |
@@ -0,0 +1,49 @@ | |||
# The implementation is based on ULFD, available at | |||
# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB | |||
import numpy as np | |||
from ..box_utils import generate_priors | |||
image_mean_test = image_mean = np.array([127, 127, 127]) | |||
image_std = 128.0 | |||
iou_threshold = 0.3 | |||
center_variance = 0.1 | |||
size_variance = 0.2 | |||
min_boxes = [[10, 16, 24], [32, 48], [64, 96], [128, 192, 256]] | |||
shrinkage_list = [] | |||
image_size = [320, 240] # default input size 320*240 | |||
feature_map_w_h_list = [[40, 20, 10, 5], [30, 15, 8, | |||
4]] # default feature map size | |||
priors = [] | |||
def define_img_size(size): | |||
global image_size, feature_map_w_h_list, priors | |||
img_size_dict = { | |||
128: [128, 96], | |||
160: [160, 120], | |||
320: [320, 240], | |||
480: [480, 360], | |||
640: [640, 480], | |||
1280: [1280, 960] | |||
} | |||
image_size = img_size_dict[size] | |||
feature_map_w_h_list_dict = { | |||
128: [[16, 8, 4, 2], [12, 6, 3, 2]], | |||
160: [[20, 10, 5, 3], [15, 8, 4, 2]], | |||
320: [[40, 20, 10, 5], [30, 15, 8, 4]], | |||
480: [[60, 30, 15, 8], [45, 23, 12, 6]], | |||
640: [[80, 40, 20, 10], [60, 30, 15, 8]], | |||
1280: [[160, 80, 40, 20], [120, 60, 30, 15]] | |||
} | |||
feature_map_w_h_list = feature_map_w_h_list_dict[size] | |||
for i in range(0, len(image_size)): | |||
item_list = [] | |||
for k in range(0, len(feature_map_w_h_list[i])): | |||
item_list.append(image_size[i] / feature_map_w_h_list[i][k]) | |||
shrinkage_list.append(item_list) | |||
priors = generate_priors(feature_map_w_h_list, shrinkage_list, image_size, | |||
min_boxes) |
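A sketch of the effect of define_img_size when run once on a fresh import (note that each call appends to the module-level shrinkage_list, so repeated calls would accumulate entries):

```python
define_img_size(320)
print(image_size)              # [320, 240]
print(feature_map_w_h_list)    # [[40, 20, 10, 5], [30, 15, 8, 4]]
print(priors.shape)            # torch.Size([4420, 4])
# 4420 = 40*30*3 + 20*15*2 + 10*8*2 + 5*4*3 anchors across the four feature maps
```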
@@ -0,0 +1,124 @@ | |||
# The implementation is based on ULFD, available at | |||
# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB | |||
from torch.nn import Conv2d, ModuleList, ReLU, Sequential | |||
from ..mb_tiny import Mb_Tiny | |||
from . import fd_config as config | |||
from .predictor import Predictor | |||
from .ssd import SSD | |||
def SeperableConv2d(in_channels, | |||
out_channels, | |||
kernel_size=1, | |||
stride=1, | |||
padding=0): | |||
"""Replace Conv2d with a depthwise Conv2d and Pointwise Conv2d. | |||
""" | |||
return Sequential( | |||
Conv2d( | |||
in_channels=in_channels, | |||
out_channels=in_channels, | |||
kernel_size=kernel_size, | |||
groups=in_channels, | |||
stride=stride, | |||
padding=padding), | |||
ReLU(), | |||
Conv2d( | |||
in_channels=in_channels, out_channels=out_channels, kernel_size=1), | |||
) | |||
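A rough parameter-count comparison motivating the separable block above, assuming SeperableConv2d is in scope; for a 64-to-128-channel 3x3 layer the separable version needs roughly an eighth of the parameters of a dense convolution.

```python
from torch.nn import Conv2d

dense = Conv2d(64, 128, kernel_size=3, padding=1)
print(sum(p.numel() for p in dense.parameters()))   # 73856 = 64*128*9 + 128

sep = SeperableConv2d(64, 128, kernel_size=3, padding=1)
print(sum(p.numel() for p in sep.parameters()))     # 8960 = (64*9 + 64) + (64*128 + 128)
```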
def create_mb_tiny_fd(num_classes, is_test=False, device='cuda'): | |||
base_net = Mb_Tiny(2) | |||
base_net_model = base_net.model # disable dropout layer | |||
source_layer_indexes = [8, 11, 13] | |||
extras = ModuleList([ | |||
Sequential( | |||
Conv2d( | |||
in_channels=base_net.base_channel * 16, | |||
out_channels=base_net.base_channel * 4, | |||
kernel_size=1), ReLU(), | |||
SeperableConv2d( | |||
in_channels=base_net.base_channel * 4, | |||
out_channels=base_net.base_channel * 16, | |||
kernel_size=3, | |||
stride=2, | |||
padding=1), ReLU()) | |||
]) | |||
regression_headers = ModuleList([ | |||
SeperableConv2d( | |||
in_channels=base_net.base_channel * 4, | |||
out_channels=3 * 4, | |||
kernel_size=3, | |||
padding=1), | |||
SeperableConv2d( | |||
in_channels=base_net.base_channel * 8, | |||
out_channels=2 * 4, | |||
kernel_size=3, | |||
padding=1), | |||
SeperableConv2d( | |||
in_channels=base_net.base_channel * 16, | |||
out_channels=2 * 4, | |||
kernel_size=3, | |||
padding=1), | |||
Conv2d( | |||
in_channels=base_net.base_channel * 16, | |||
out_channels=3 * 4, | |||
kernel_size=3, | |||
padding=1) | |||
]) | |||
classification_headers = ModuleList([ | |||
SeperableConv2d( | |||
in_channels=base_net.base_channel * 4, | |||
out_channels=3 * num_classes, | |||
kernel_size=3, | |||
padding=1), | |||
SeperableConv2d( | |||
in_channels=base_net.base_channel * 8, | |||
out_channels=2 * num_classes, | |||
kernel_size=3, | |||
padding=1), | |||
SeperableConv2d( | |||
in_channels=base_net.base_channel * 16, | |||
out_channels=2 * num_classes, | |||
kernel_size=3, | |||
padding=1), | |||
Conv2d( | |||
in_channels=base_net.base_channel * 16, | |||
out_channels=3 * num_classes, | |||
kernel_size=3, | |||
padding=1) | |||
]) | |||
return SSD( | |||
num_classes, | |||
base_net_model, | |||
source_layer_indexes, | |||
extras, | |||
classification_headers, | |||
regression_headers, | |||
is_test=is_test, | |||
config=config, | |||
device=device) | |||
def create_mb_tiny_fd_predictor(net, | |||
candidate_size=200, | |||
nms_method=None, | |||
sigma=0.5, | |||
device=None): | |||
predictor = Predictor( | |||
net, | |||
config.image_size, | |||
config.image_mean_test, | |||
config.image_std, | |||
nms_method=nms_method, | |||
iou_threshold=config.iou_threshold, | |||
candidate_size=candidate_size, | |||
sigma=sigma, | |||
device=device) | |||
return predictor |
@@ -0,0 +1,80 @@ | |||
# The implementation is based on ULFD, available at | |||
# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB | |||
import torch | |||
from .. import box_utils | |||
from .data_preprocessing import PredictionTransform | |||
class Predictor: | |||
def __init__(self, | |||
net, | |||
size, | |||
mean=0.0, | |||
std=1.0, | |||
nms_method=None, | |||
iou_threshold=0.3, | |||
filter_threshold=0.85, | |||
candidate_size=200, | |||
sigma=0.5, | |||
device=None): | |||
self.net = net | |||
self.transform = PredictionTransform(size, mean, std) | |||
self.iou_threshold = iou_threshold | |||
self.filter_threshold = filter_threshold | |||
self.candidate_size = candidate_size | |||
self.nms_method = nms_method | |||
self.sigma = sigma | |||
if device: | |||
self.device = device | |||
else: | |||
self.device = torch.device( | |||
'cuda:0' if torch.cuda.is_available() else 'cpu') | |||
self.net.to(self.device) | |||
self.net.eval() | |||
def predict(self, image, top_k=-1, prob_threshold=None): | |||
height, width, _ = image.shape | |||
image = self.transform(image) | |||
images = image.unsqueeze(0) | |||
images = images.to(self.device) | |||
with torch.no_grad(): | |||
for i in range(1): | |||
scores, boxes = self.net.forward(images) | |||
boxes = boxes[0] | |||
scores = scores[0] | |||
if not prob_threshold: | |||
prob_threshold = self.filter_threshold | |||
# this version of nms is slower on GPU, so we move data to CPU. | |||
picked_box_probs = [] | |||
picked_labels = [] | |||
for class_index in range(1, scores.size(1)): | |||
probs = scores[:, class_index] | |||
mask = probs > prob_threshold | |||
probs = probs[mask] | |||
if probs.size(0) == 0: | |||
continue | |||
subset_boxes = boxes[mask, :] | |||
box_probs = torch.cat([subset_boxes, probs.reshape(-1, 1)], dim=1) | |||
box_probs = box_utils.nms( | |||
box_probs, | |||
self.nms_method, | |||
score_threshold=prob_threshold, | |||
iou_threshold=self.iou_threshold, | |||
sigma=self.sigma, | |||
top_k=top_k, | |||
candidate_size=self.candidate_size) | |||
picked_box_probs.append(box_probs) | |||
picked_labels.extend([class_index] * box_probs.size(0)) | |||
if not picked_box_probs: | |||
return torch.tensor([]), torch.tensor([]), torch.tensor([]) | |||
picked_box_probs = torch.cat(picked_box_probs) | |||
picked_box_probs[:, 0] *= width | |||
picked_box_probs[:, 1] *= height | |||
picked_box_probs[:, 2] *= width | |||
picked_box_probs[:, 3] *= height | |||
return picked_box_probs[:, :4], torch.tensor( | |||
picked_labels), picked_box_probs[:, 4] |
@@ -0,0 +1,129 @@ | |||
# The implementation is based on ULFD, available at | |||
# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB | |||
from collections import namedtuple | |||
from typing import List, Tuple | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from .. import box_utils | |||
GraphPath = namedtuple('GraphPath', ['s0', 'name', 's1']) | |||
class SSD(nn.Module): | |||
def __init__(self, | |||
num_classes: int, | |||
base_net: nn.ModuleList, | |||
source_layer_indexes: List[int], | |||
extras: nn.ModuleList, | |||
classification_headers: nn.ModuleList, | |||
regression_headers: nn.ModuleList, | |||
is_test=False, | |||
config=None, | |||
device=None): | |||
"""Compose a SSD model using the given components. | |||
""" | |||
super(SSD, self).__init__() | |||
self.num_classes = num_classes | |||
self.base_net = base_net | |||
self.source_layer_indexes = source_layer_indexes | |||
self.extras = extras | |||
self.classification_headers = classification_headers | |||
self.regression_headers = regression_headers | |||
self.is_test = is_test | |||
self.config = config | |||
# register layers in source_layer_indexes by adding them to a module list | |||
self.source_layer_add_ons = nn.ModuleList([ | |||
t[1] for t in source_layer_indexes | |||
if isinstance(t, tuple) and not isinstance(t, GraphPath) | |||
]) | |||
if device: | |||
self.device = device | |||
else: | |||
self.device = torch.device( | |||
'cuda:0' if torch.cuda.is_available() else 'cpu') | |||
if is_test: | |||
self.config = config | |||
self.priors = config.priors.to(self.device) | |||
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: | |||
confidences = [] | |||
locations = [] | |||
start_layer_index = 0 | |||
header_index = 0 | |||
end_layer_index = 0 | |||
for end_layer_index in self.source_layer_indexes: | |||
if isinstance(end_layer_index, GraphPath): | |||
path = end_layer_index | |||
end_layer_index = end_layer_index.s0 | |||
added_layer = None | |||
elif isinstance(end_layer_index, tuple): | |||
added_layer = end_layer_index[1] | |||
end_layer_index = end_layer_index[0] | |||
path = None | |||
else: | |||
added_layer = None | |||
path = None | |||
for layer in self.base_net[start_layer_index:end_layer_index]: | |||
x = layer(x) | |||
if added_layer: | |||
y = added_layer(x) | |||
else: | |||
y = x | |||
if path: | |||
sub = getattr(self.base_net[end_layer_index], path.name) | |||
for layer in sub[:path.s1]: | |||
x = layer(x) | |||
y = x | |||
for layer in sub[path.s1:]: | |||
x = layer(x) | |||
end_layer_index += 1 | |||
start_layer_index = end_layer_index | |||
confidence, location = self.compute_header(header_index, y) | |||
header_index += 1 | |||
confidences.append(confidence) | |||
locations.append(location) | |||
for layer in self.base_net[end_layer_index:]: | |||
x = layer(x) | |||
for layer in self.extras: | |||
x = layer(x) | |||
confidence, location = self.compute_header(header_index, x) | |||
header_index += 1 | |||
confidences.append(confidence) | |||
locations.append(location) | |||
confidences = torch.cat(confidences, 1) | |||
locations = torch.cat(locations, 1) | |||
if self.is_test: | |||
confidences = F.softmax(confidences, dim=2) | |||
boxes = box_utils.convert_locations_to_boxes( | |||
locations, self.priors, self.config.center_variance, | |||
self.config.size_variance) | |||
boxes = box_utils.center_form_to_corner_form(boxes) | |||
return confidences, boxes | |||
else: | |||
return confidences, locations | |||
def compute_header(self, i, x): | |||
confidence = self.classification_headers[i](x) | |||
confidence = confidence.permute(0, 2, 3, 1).contiguous() | |||
confidence = confidence.view(confidence.size(0), -1, self.num_classes) | |||
location = self.regression_headers[i](x) | |||
location = location.permute(0, 2, 3, 1).contiguous() | |||
location = location.view(location.size(0), -1, 4) | |||
return confidence, location | |||
def load(self, model): | |||
self.load_state_dict( | |||
torch.load(model, map_location=lambda storage, loc: storage)) |
@@ -0,0 +1,56 @@ | |||
# The implementation is based on ULFD, available at | |||
# https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB | |||
import types | |||
import cv2 | |||
import numpy as np | |||
import torch | |||
from numpy import random | |||
class Compose(object): | |||
"""Composes several augmentations together. | |||
Args: | |||
transforms (List[Transform]): list of transforms to compose. | |||
Example: | |||
>>> augmentations.Compose([ | |||
>>> transforms.CenterCrop(10), | |||
>>> transforms.ToTensor(), | |||
>>> ]) | |||
""" | |||
def __init__(self, transforms): | |||
self.transforms = transforms | |||
def __call__(self, img, boxes=None, labels=None): | |||
for t in self.transforms: | |||
img, boxes, labels = t(img, boxes, labels) | |||
return img, boxes, labels | |||
class SubtractMeans(object): | |||
def __init__(self, mean): | |||
self.mean = np.array(mean, dtype=np.float32) | |||
def __call__(self, image, boxes=None, labels=None): | |||
image = image.astype(np.float32) | |||
image -= self.mean | |||
return image.astype(np.float32), boxes, labels | |||
class Resize(object): | |||
def __init__(self, size=(300, 300)): | |||
self.size = size | |||
def __call__(self, image, boxes=None, labels=None): | |||
image = cv2.resize(image, (self.size[0], self.size[1])) | |||
return image, boxes, labels | |||
class ToTensor(object): | |||
def __call__(self, cvimage, boxes=None, labels=None): | |||
return torch.from_numpy(cvimage.astype(np.float32)).permute( | |||
2, 0, 1), boxes, labels |
@@ -1,3 +1,7 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
https://github.com/deepinsight/insightface/blob/master/python-package/insightface/utils/face_align.py | |||
""" | |||
import cv2 | |||
import numpy as np | |||
from skimage import transform as trans | |||
@@ -1,3 +1,5 @@ | |||
# The implementation is adapted from TFace, made publicly available under the Apache-2.0 license at | |||
# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone | |||
from .model_irse import (IR_18, IR_34, IR_50, IR_101, IR_152, IR_200, IR_SE_50, | |||
IR_SE_101, IR_SE_152, IR_SE_200) | |||
from .model_resnet import ResNet_50, ResNet_101, ResNet_152 | |||
@@ -1,3 +1,5 @@ | |||
# The implementation is adapted from TFace, made publicly available under the Apache-2.0 license at | |||
# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/common.py | |||
import torch | |||
import torch.nn as nn | |||
from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Linear, Module, ReLU, | |||
@@ -1,5 +1,5 @@ | |||
# based on: | |||
# https://github.com/ZhaoJ9014/face.evoLVe.PyTorch/blob/master/backbone/model_irse.py | |||
# The implementation is adapted from TFace, made publicly available under the Apache-2.0 license at | |||
# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/model_irse.py | |||
from collections import namedtuple | |||
from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, | |||
@@ -1,5 +1,5 @@ | |||
# based on: | |||
# https://github.com/ZhaoJ9014/face.evoLVe.PyTorch/blob/master/backbone/model_resnet.py | |||
# The implementation is adapted from TFace, made publicly available under the Apache-2.0 license at | |||
# https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/model_resnet.py | |||
import torch.nn as nn | |||
from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, | |||
MaxPool2d, Module, ReLU, Sequential) | |||
@@ -105,12 +105,12 @@ def get_img_ins_seg_result(img_seg_result=None, | |||
} | |||
for seg_result in img_seg_result: | |||
box = { | |||
'x': np.int(seg_result[0]), | |||
'y': np.int(seg_result[1]), | |||
'w': np.int(seg_result[2] - seg_result[0]), | |||
'h': np.int(seg_result[3] - seg_result[1]) | |||
} | |||
box = [ | |||
np.int(seg_result[0]), | |||
np.int(seg_result[1]), | |||
np.int(seg_result[2]), | |||
np.int(seg_result[3]) | |||
] | |||
score = np.float(seg_result[4]) | |||
category = seg_result[5] | |||
@@ -161,12 +161,10 @@ def show_result( | |||
np.random.random() * 255.0 | |||
]) | |||
x1 = int(box['x']) | |||
y1 = int(box['y']) | |||
w = int(box['w']) | |||
h = int(box['h']) | |||
x2 = x1 + w | |||
y2 = y1 + h | |||
x1 = int(box[0]) | |||
y1 = int(box[1]) | |||
x2 = int(box[2]) | |||
y2 = int(box[3]) | |||
if show_box: | |||
cv2.rectangle( | |||
@@ -1,4 +1,4 @@ | |||
# The implementation is also open-sourced by the authors as PASS-reID, and is available publicly on | |||
# The implementation is adapted from PASS-reID, made publicly available under the Apache-2.0 License at | |||
# https://github.com/CASIA-IVA-Lab/PASS-reID | |||
import os | |||
@@ -1,4 +1,4 @@ | |||
# The implementation is also open-sourced by the authors as PASS-reID, and is available publicly on | |||
# The implementation is adapted from PASS-reID, made publicly available under the Apache-2.0 License at | |||
# https://github.com/CASIA-IVA-Lab/PASS-reID | |||
import collections.abc as container_abcs | |||
@@ -552,7 +552,7 @@ class CLIPVisionTransformer(nn.Module): | |||
nn.GroupNorm(1, embed_dim), | |||
nn.ConvTranspose2d( | |||
embed_dim, embed_dim, kernel_size=2, stride=2), | |||
nn.SyncBatchNorm(embed_dim), | |||
nn.BatchNorm2d(embed_dim), | |||
nn.GELU(), | |||
nn.ConvTranspose2d( | |||
embed_dim, embed_dim, kernel_size=2, stride=2), | |||