@@ -7,7 +7,7 @@ gpus='0,1 2,3 4,5 6,7' | |||
cpu_sets='45-58 31-44 16-30 0-15' | |||
cpu_sets_arr=($cpu_sets) | |||
is_get_file_lock=false | |||
CI_COMMAND='bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml' | |||
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml} | |||
echo "ci command: $CI_COMMAND" | |||
idx=0 | |||
for gpu in $gpus | |||
@@ -1,3 +1,18 @@ | |||
<div align="center"> | |||
[](https://pypi.org/project/modelscope/) | |||
<!-- [](https://easy-cv.readthedocs.io/en/latest/) --> | |||
[](https://github.com/modelscope/modelscope/blob/master/LICENSE) | |||
[](https://github.com/modelscope/modelscope/issues) | |||
[](https://GitHub.com/modelscope/modelscope/pull/) | |||
[](https://GitHub.com/modelscope/modelscope/commit/) | |||
<!-- [](https://GitHub.com/modelscope/modelscope/graphs/contributors/) --> | |||
<!-- [](http://makeapullrequest.com) --> | |||
</div> | |||
# Introduction | |||
[ModelScope](https://www.modelscope.cn) is a “Model-as-a-Service” (MaaS) platform that seeks to bring together the most advanced machine learning models from the AI community, and to streamline the process of leveraging AI models in real-world applications. The core ModelScope library enables developers to perform inference, training and evaluation through rich layers of API design that provide a unified experience across state-of-the-art models from different AI domains. | |||
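To make the unified experience concrete, here is a minimal inference sketch with the `pipeline` API (the word-segmentation task is only an illustrative choice; any supported task name works the same way):

```python
# Minimal sketch of the unified pipeline API; the task shown here is
# just one illustrative choice among the supported tasks.
from modelscope.pipelines import pipeline

# Build an inference pipeline for a task; the matching default model is
# resolved and downloaded from the ModelScope hub on first use.
word_segmentation = pipeline('word-segmentation')
print(word_segmentation('今天天气不错，适合出去游玩'))
```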
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:8ce83bf2a8e6056aba3b3cdc92d2e04d23bdf15a2c1fde814cb091444d59a10b | |||
size 3180872 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:46dbc998c9d1d48111267c40741dd3200f2e5bcf4075f8c4c97f4451160dce50 | |||
size 134570 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:a8cf9fc5abc119f5b5e246143206c22f488c63e86e47f762585b9edd84e081ad | |||
size 618160 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:2bc50ef70bbbc46132710b69efa683cf0bf64aeb0990bb3ff411930831bbc17d | |||
size 619034 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:0b2882d3bcd9e8f8f9531ac34ac09c0208d86500b910d3e1ca34c022caa9be62 | |||
size 155874 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:4c713215f7fb4da5382c9137347ee52956a7a44d5979c4cffd3c9b6d1d7e878f | |||
size 19445 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:3b230497f6ca10be42aed92b86db435d74fd7306746a059b4ad1e0d6b0652806 | |||
size 35694 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:209f6ba7f15c9c34a02801b4c6ef33a979f3086702b5229d2e7975eb403c3e15 | |||
size 45819 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:e37106cf024efd1886b870fa45f69905fcea202db8a848debc4ccd359ea3b21c | |||
size 116248 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:700f7cb3c958fb710d6b863b3c9aa0549f6ab837dfbe3382f8f750f73cec46e3 | |||
size 116868 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:f4b7e23f02a35136707ac7862e0a8468797f239e89497351847cfacb2a9c24f6 | |||
size 202112 |
@@ -1,3 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:7d98ac11a4e9e2744a7402a5cc912da991a41938bbc5dd60f15ee5c6b3196030 | |||
size 63349 | |||
oid sha256:dc16ad72e753f751360dab82878ec0a31190fb5125632d8f4698f6537fae79cb | |||
size 40819 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:8e4ade7a6b119e20e82a641246199b4b530759166acc1f813d7cefee65b3e1e0 | |||
size 63944943 |
@@ -104,9 +104,9 @@ git lfs install | |||
``` | |||
For CentOS, please download the rpm package from the git-lfs GitHub [releases page](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0) | |||
and then execute: | |||
```bash | |||
wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm | |||
sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm | |||
sudo rpm -ivh your_rpm_file_name.rpm | |||
git lfs install | |||
``` | |||
@@ -7,7 +7,8 @@ from torch.utils.data.dataloader import default_collate | |||
from modelscope.exporters.builder import EXPORTERS | |||
from modelscope.exporters.torch_model_exporter import TorchModelExporter | |||
from modelscope.metainfo import Models | |||
from modelscope.preprocessors import Preprocessor, build_preprocessor | |||
from modelscope.preprocessors import ( | |||
TextClassificationTransformersPreprocessor, build_preprocessor) | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import ModeKeys, Tasks | |||
@@ -59,12 +60,13 @@ class SbertForSequenceClassificationExporter(TorchModelExporter): | |||
'mode': ModeKeys.TRAIN, | |||
**sequence_length | |||
}) | |||
preprocessor: Preprocessor = build_preprocessor(cfg, field_name) | |||
preprocessor: TextClassificationTransformersPreprocessor = build_preprocessor( | |||
cfg, field_name) | |||
if pair: | |||
first_sequence = preprocessor.tokenizer.unk_token | |||
second_sequence = preprocessor.tokenizer.unk_token | |||
first_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token | |||
second_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token | |||
else: | |||
first_sequence = preprocessor.tokenizer.unk_token | |||
first_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token | |||
second_sequence = None | |||
batched = [] | |||
@@ -17,7 +17,7 @@ from modelscope.utils.regress_test_utils import (compare_arguments_nested, | |||
numpify_tensor_nested) | |||
from .base import Exporter | |||
logger = get_logger(__name__) | |||
logger = get_logger() | |||
class TorchModelExporter(Exporter): | |||
@@ -2,6 +2,7 @@ | |||
# yapf: disable | |||
import datetime | |||
import functools | |||
import os | |||
import pickle | |||
import platform | |||
@@ -14,10 +15,12 @@ from http.cookiejar import CookieJar | |||
from os.path import expanduser | |||
from typing import Dict, List, Optional, Tuple, Union | |||
import requests | |||
from requests import Session | |||
from requests.adapters import HTTPAdapter, Retry | |||
from modelscope import __version__ | |||
from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, | |||
from modelscope.hub.constants import (API_HTTP_CLIENT_TIMEOUT, | |||
API_RESPONSE_FIELD_DATA, | |||
API_RESPONSE_FIELD_EMAIL, | |||
API_RESPONSE_FIELD_GIT_ACCESS_TOKEN, | |||
API_RESPONSE_FIELD_MESSAGE, | |||
@@ -25,7 +28,8 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, | |||
DEFAULT_CREDENTIALS_PATH, | |||
MODELSCOPE_CLOUD_ENVIRONMENT, | |||
MODELSCOPE_CLOUD_USERNAME, | |||
ONE_YEAR_SECONDS, Licenses, | |||
ONE_YEAR_SECONDS, | |||
REQUESTS_API_HTTP_METHOD, Licenses, | |||
ModelVisibility) | |||
from modelscope.hub.errors import (InvalidParameter, NotExistError, | |||
NotLoginException, NoValidRevisionError, | |||
@@ -54,6 +58,17 @@ class HubApi: | |||
def __init__(self, endpoint=None): | |||
self.endpoint = endpoint if endpoint is not None else get_endpoint() | |||
self.headers = {'user-agent': ModelScopeConfig.get_user_agent()} | |||
self.session = Session() | |||
retry = Retry(total=2, read=2, connect=2, backoff_factor=1, | |||
status_forcelist=(500, 502, 503, 504),) | |||
adapter = HTTPAdapter(max_retries=retry) | |||
self.session.mount('http://', adapter) | |||
self.session.mount('https://', adapter) | |||
# set http timeout | |||
for method in REQUESTS_API_HTTP_METHOD: | |||
setattr(self.session, | |||
method, | |||
functools.partial(getattr(self.session, method), timeout=API_HTTP_CLIENT_TIMEOUT)) | |||
def login( | |||
self, | |||
@@ -73,7 +88,7 @@ class HubApi: | |||
</Tip> | |||
""" | |||
path = f'{self.endpoint}/api/v1/login' | |||
r = requests.post( | |||
r = self.session.post( | |||
path, json={'AccessToken': access_token}, headers=self.headers) | |||
raise_for_http_status(r) | |||
d = r.json() | |||
@@ -129,7 +144,7 @@ class HubApi: | |||
'Visibility': visibility, # server check | |||
'License': license | |||
} | |||
r = requests.post( | |||
r = self.session.post( | |||
path, json=body, cookies=cookies, headers=self.headers) | |||
handle_http_post_error(r, path, body) | |||
raise_on_error(r.json()) | |||
@@ -150,7 +165,7 @@ class HubApi: | |||
raise ValueError('Token does not exist, please login first.') | |||
path = f'{self.endpoint}/api/v1/models/{model_id}' | |||
r = requests.delete(path, cookies=cookies, headers=self.headers) | |||
r = self.session.delete(path, cookies=cookies, headers=self.headers) | |||
raise_for_http_status(r) | |||
raise_on_error(r.json()) | |||
@@ -183,7 +198,7 @@ class HubApi: | |||
else: | |||
path = f'{self.endpoint}/api/v1/models/{owner_or_group}/{name}' | |||
r = requests.get(path, cookies=cookies, headers=self.headers) | |||
r = self.session.get(path, cookies=cookies, headers=self.headers) | |||
handle_http_response(r, logger, cookies, model_id) | |||
if r.status_code == HTTPStatus.OK: | |||
if is_ok(r.json()): | |||
@@ -311,7 +326,7 @@ class HubApi: | |||
""" | |||
cookies = ModelScopeConfig.get_cookies() | |||
path = f'{self.endpoint}/api/v1/models/' | |||
r = requests.put( | |||
r = self.session.put( | |||
path, | |||
data='{"Path":"%s", "PageNumber":%s, "PageSize": %s}' % | |||
(owner_or_group, page_number, page_size), | |||
@@ -360,7 +375,7 @@ class HubApi: | |||
if cutoff_timestamp is None: | |||
cutoff_timestamp = get_release_datetime() | |||
path = f'{self.endpoint}/api/v1/models/{model_id}/revisions?EndTime=%s' % cutoff_timestamp | |||
r = requests.get(path, cookies=cookies, headers=self.headers) | |||
r = self.session.get(path, cookies=cookies, headers=self.headers) | |||
handle_http_response(r, logger, cookies, model_id) | |||
d = r.json() | |||
raise_on_error(d) | |||
@@ -422,7 +437,7 @@ class HubApi: | |||
cookies = self._check_cookie(use_cookies) | |||
path = f'{self.endpoint}/api/v1/models/{model_id}/revisions' | |||
r = requests.get(path, cookies=cookies, headers=self.headers) | |||
r = self.session.get(path, cookies=cookies, headers=self.headers) | |||
handle_http_response(r, logger, cookies, model_id) | |||
d = r.json() | |||
raise_on_error(d) | |||
@@ -467,7 +482,7 @@ class HubApi: | |||
if root is not None: | |||
path = path + f'&Root={root}' | |||
r = requests.get( | |||
r = self.session.get( | |||
path, cookies=cookies, headers={ | |||
**headers, | |||
**self.headers | |||
@@ -488,7 +503,7 @@ class HubApi: | |||
def list_datasets(self): | |||
path = f'{self.endpoint}/api/v1/datasets' | |||
params = {} | |||
r = requests.get(path, params=params, headers=self.headers) | |||
r = self.session.get(path, params=params, headers=self.headers) | |||
raise_for_http_status(r) | |||
dataset_list = r.json()[API_RESPONSE_FIELD_DATA] | |||
return [x['Name'] for x in dataset_list] | |||
@@ -514,13 +529,13 @@ class HubApi: | |||
os.makedirs(cache_dir, exist_ok=True) | |||
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}' | |||
cookies = ModelScopeConfig.get_cookies() | |||
r = requests.get(datahub_url, cookies=cookies) | |||
r = self.session.get(datahub_url, cookies=cookies) | |||
resp = r.json() | |||
datahub_raise_on_error(datahub_url, resp) | |||
dataset_id = resp['Data']['Id'] | |||
dataset_type = resp['Data']['Type'] | |||
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' | |||
r = requests.get(datahub_url, cookies=cookies, headers=self.headers) | |||
r = self.session.get(datahub_url, cookies=cookies, headers=self.headers) | |||
resp = r.json() | |||
datahub_raise_on_error(datahub_url, resp) | |||
file_list = resp['Data'] | |||
@@ -539,7 +554,7 @@ class HubApi: | |||
if extension in dataset_meta_format: | |||
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ | |||
f'Revision={revision}&FilePath={file_path}' | |||
r = requests.get(datahub_url, cookies=cookies) | |||
r = self.session.get(datahub_url, cookies=cookies) | |||
raise_for_http_status(r) | |||
local_path = os.path.join(cache_dir, file_path) | |||
if os.path.exists(local_path): | |||
@@ -584,7 +599,7 @@ class HubApi: | |||
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ | |||
f'ststoken?Revision={revision}' | |||
r = requests.get(url=datahub_url, cookies=cookies, headers=self.headers) | |||
r = self.session.get(url=datahub_url, cookies=cookies, headers=self.headers) | |||
resp = r.json() | |||
raise_on_error(resp) | |||
return resp['Data'] | |||
@@ -595,7 +610,7 @@ class HubApi: | |||
f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}' | |||
cookies = ModelScopeConfig.get_cookies() | |||
resp = requests.get(url=url, cookies=cookies) | |||
resp = self.session.get(url=url, cookies=cookies) | |||
resp = resp.json() | |||
raise_on_error(resp) | |||
resp = resp['Data'] | |||
@@ -604,7 +619,7 @@ class HubApi: | |||
def on_dataset_download(self, dataset_name: str, namespace: str) -> None: | |||
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase' | |||
cookies = ModelScopeConfig.get_cookies() | |||
r = requests.post(url, cookies=cookies, headers=self.headers) | |||
r = self.session.post(url, cookies=cookies, headers=self.headers) | |||
raise_for_http_status(r) | |||
def delete_oss_dataset_object(self, object_name: str, dataset_name: str, | |||
@@ -615,7 +630,7 @@ class HubApi: | |||
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss?Path={object_name}&Revision={revision}' | |||
cookies = self.check_local_cookies(use_cookies=True) | |||
resp = requests.delete(url=url, cookies=cookies) | |||
resp = self.session.delete(url=url, cookies=cookies) | |||
resp = resp.json() | |||
raise_on_error(resp) | |||
resp = resp['Message'] | |||
@@ -630,16 +645,15 @@ class HubApi: | |||
f'&Revision={revision}' | |||
cookies = self.check_local_cookies(use_cookies=True) | |||
resp = requests.delete(url=url, cookies=cookies) | |||
resp = self.session.delete(url=url, cookies=cookies) | |||
resp = resp.json() | |||
raise_on_error(resp) | |||
resp = resp['Message'] | |||
return resp | |||
@staticmethod | |||
def datahub_remote_call(url): | |||
def datahub_remote_call(self, url): | |||
cookies = ModelScopeConfig.get_cookies() | |||
r = requests.get(url, cookies=cookies, headers={'user-agent': ModelScopeConfig.get_user_agent()}) | |||
r = self.session.get(url, cookies=cookies, headers={'user-agent': ModelScopeConfig.get_user_agent()}) | |||
resp = r.json() | |||
datahub_raise_on_error(url, resp) | |||
return resp['Data'] | |||
@@ -661,7 +675,7 @@ class HubApi: | |||
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}?user={user_name}' | |||
cookies = ModelScopeConfig.get_cookies() | |||
r = requests.post(url, cookies=cookies, headers=self.headers) | |||
r = self.session.post(url, cookies=cookies, headers=self.headers) | |||
resp = r.json() | |||
raise_on_error(resp) | |||
return resp['Message'] | |||
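Taken together, the api.py changes route every request through one `Session` configured once. A standalone sketch of the pattern, with the constant values as stand-ins for those in constants.py:

```python
# Standalone sketch of the shared-session pattern: mounted retries with
# exponential backoff, plus a default timeout bound onto every HTTP verb.
# The constant values below are stand-ins, not authoritative.
import functools

from requests import Session
from requests.adapters import HTTPAdapter, Retry

HTTP_TIMEOUT = 60  # seconds, mirrors API_HTTP_CLIENT_TIMEOUT
HTTP_METHODS = ['get', 'head', 'post', 'put', 'patch', 'delete']

session = Session()
retry = Retry(total=2, read=2, connect=2, backoff_factor=1,
              status_forcelist=(500, 502, 503, 504))
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

# Bind a default timeout so no call site can forget one; an explicit
# timeout=... passed at a call site still overrides the partial's default.
for method in HTTP_METHODS:
    setattr(session, method,
            functools.partial(getattr(session, method), timeout=HTTP_TIMEOUT))

r = session.get('https://www.modelscope.cn')  # uses the 60s default timeout
```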
@@ -11,7 +11,12 @@ MODEL_ID_SEPARATOR = '/' | |||
FILE_HASH = 'Sha256' | |||
LOGGER_NAME = 'ModelScopeHub' | |||
DEFAULT_CREDENTIALS_PATH = Path.home().joinpath('.modelscope', 'credentials') | |||
REQUESTS_API_HTTP_METHOD = ['get', 'head', 'post', 'put', 'patch', 'delete'] | |||
API_HTTP_CLIENT_TIMEOUT = 60 | |||
API_RESPONSE_FIELD_DATA = 'Data' | |||
API_FILE_DOWNLOAD_RETRY_TIMES = 5 | |||
API_FILE_DOWNLOAD_TIMEOUT = 60 * 5 | |||
API_FILE_DOWNLOAD_CHUNK_SIZE = 4096 | |||
API_RESPONSE_FIELD_GIT_ACCESS_TOKEN = 'AccessToken' | |||
API_RESPONSE_FIELD_USERNAME = 'Username' | |||
API_RESPONSE_FIELD_EMAIL = 'Email' | |||
@@ -9,13 +9,16 @@ from pathlib import Path | |||
from typing import Dict, Optional, Union | |||
import requests | |||
from requests.adapters import Retry | |||
from tqdm import tqdm | |||
from modelscope import __version__ | |||
from modelscope.hub.api import HubApi, ModelScopeConfig | |||
from modelscope.hub.constants import (API_FILE_DOWNLOAD_CHUNK_SIZE, | |||
API_FILE_DOWNLOAD_RETRY_TIMES, | |||
API_FILE_DOWNLOAD_TIMEOUT, FILE_HASH) | |||
from modelscope.utils.constant import DEFAULT_MODEL_REVISION | |||
from modelscope.utils.logger import get_logger | |||
from .constants import FILE_HASH | |||
from .errors import FileDownloadError, NotExistError | |||
from .utils.caching import ModelFileSystemCache | |||
from .utils.utils import (file_integrity_validation, get_cache_dir, | |||
@@ -184,10 +187,7 @@ def http_get_file( | |||
headers: Optional[Dict[str, str]] = None, | |||
): | |||
""" | |||
Download remote file. Do not gobble up errors. | |||
This method is only used by snapshot_download, since the behavior is quite different with single file download | |||
TODO: consolidate with http_get_file() to avoild duplicate code | |||
Download a remote file. Retries up to 5 times before giving up on errors. | |||
Args: | |||
url(`str`): | |||
actual download url of the file | |||
@@ -204,30 +204,46 @@ def http_get_file( | |||
total = -1 | |||
temp_file_manager = partial( | |||
tempfile.NamedTemporaryFile, mode='wb', dir=local_dir, delete=False) | |||
get_headers = {} if headers is None else copy.deepcopy(headers) | |||
with temp_file_manager() as temp_file: | |||
logger.info('downloading %s to %s', url, temp_file.name) | |||
headers = copy.deepcopy(headers) | |||
r = requests.get(url, stream=True, headers=headers, cookies=cookies) | |||
r.raise_for_status() | |||
content_length = r.headers.get('Content-Length') | |||
total = int(content_length) if content_length is not None else None | |||
progress = tqdm( | |||
unit='B', | |||
unit_scale=True, | |||
unit_divisor=1024, | |||
total=total, | |||
initial=0, | |||
desc='Downloading', | |||
) | |||
for chunk in r.iter_content(chunk_size=1024): | |||
if chunk: # filter out keep-alive new chunks | |||
progress.update(len(chunk)) | |||
temp_file.write(chunk) | |||
progress.close() | |||
# retry with exponential backoff between attempts | |||
retry = Retry( | |||
total=API_FILE_DOWNLOAD_RETRY_TIMES, | |||
backoff_factor=1, | |||
allowed_methods=['GET']) | |||
while True: | |||
try: | |||
downloaded_size = temp_file.tell() | |||
get_headers['Range'] = 'bytes=%d-' % downloaded_size | |||
r = requests.get( | |||
url, | |||
stream=True, | |||
headers=get_headers, | |||
cookies=cookies, | |||
timeout=API_FILE_DOWNLOAD_TIMEOUT) | |||
r.raise_for_status() | |||
content_length = r.headers.get('Content-Length') | |||
total = int( | |||
content_length) if content_length is not None else None | |||
progress = tqdm( | |||
unit='B', | |||
unit_scale=True, | |||
unit_divisor=1024, | |||
total=total, | |||
initial=downloaded_size, | |||
desc='Downloading', | |||
) | |||
for chunk in r.iter_content( | |||
chunk_size=API_FILE_DOWNLOAD_CHUNK_SIZE): | |||
if chunk: # filter out keep-alive new chunks | |||
progress.update(len(chunk)) | |||
temp_file.write(chunk) | |||
progress.close() | |||
break | |||
except Exception as e:  # no matter what happens, we will retry. | |||
retry = retry.increment('GET', url, error=e) | |||
retry.sleep() | |||
logger.info('storing %s in cache at %s', url, local_dir) | |||
downloaded_length = os.path.getsize(temp_file.name) | |||
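The rewritten loop resumes from `temp_file.tell()` with a `Range` header and uses urllib3's `Retry` object purely as a backoff counter: `increment()` hands back a `Retry` with one less attempt in the budget and raises `MaxRetryError` once it is spent, which is what ultimately stops retrying on persistent failures. A minimal sketch of that contract:

```python
# Minimal sketch of the Retry-as-counter contract used by the loop above.
from urllib3.exceptions import MaxRetryError
from urllib3.util.retry import Retry

retry = Retry(total=2, backoff_factor=1, allowed_methods=['GET'])
attempt = 0
while True:
    try:
        attempt += 1
        raise IOError('simulated network failure %d' % attempt)
    except Exception as e:
        try:
            # increment() returns a new Retry with a reduced budget...
            retry = retry.increment('GET', 'https://example.com', error=e)
        except MaxRetryError:
            # ...and raises once the budget is exhausted, ending the loop.
            print('gave up after %d attempts' % attempt)
            break
        retry.sleep()  # exponential backoff between attempts
```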
@@ -94,7 +94,7 @@ class GitCommandWrapper(metaclass=Singleton): | |||
return False | |||
def git_lfs_install(self, repo_dir): | |||
cmd = ['git', '-C', repo_dir, 'lfs', 'install'] | |||
cmd = ['-C', repo_dir, 'lfs', 'install'] | |||
try: | |||
self._run_git_command(*cmd) | |||
return True | |||
@@ -36,14 +36,20 @@ class Models(object): | |||
swinL_semantic_segmentation = 'swinL-semantic-segmentation' | |||
vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' | |||
text_driven_segmentation = 'text-driven-segmentation' | |||
newcrfs_depth_estimation = 'newcrfs-depth-estimation' | |||
resnet50_bert = 'resnet50-bert' | |||
referring_video_object_segmentation = 'swinT-referring-video-object-segmentation' | |||
fer = 'fer' | |||
fairface = 'fairface' | |||
retinaface = 'retinaface' | |||
shop_segmentation = 'shop-segmentation' | |||
mogface = 'mogface' | |||
mtcnn = 'mtcnn' | |||
ulfd = 'ulfd' | |||
arcface = 'arcface' | |||
facemask = 'facemask' | |||
flc = 'flc' | |||
tinymog = 'tinymog' | |||
video_inpainting = 'video-inpainting' | |||
human_wholebody_keypoint = 'human-wholebody-keypoint' | |||
hand_static = 'hand-static' | |||
@@ -51,6 +57,7 @@ class Models(object): | |||
face_emotion = 'face-emotion' | |||
product_segmentation = 'product-segmentation' | |||
image_body_reshaping = 'image-body-reshaping' | |||
video_human_matting = 'video-human-matting' | |||
# EasyCV models | |||
yolox = 'YOLOX' | |||
@@ -71,6 +78,7 @@ class Models(object): | |||
space_T_en = 'space-T-en' | |||
space_T_cn = 'space-T-cn' | |||
tcrf = 'transformer-crf' | |||
token_classification_for_ner = 'token-classification-for-ner' | |||
tcrf_wseg = 'transformer-crf-for-word-segmentation' | |||
transformer_softmax = 'transformer-softmax' | |||
lcrf = 'lstm-crf' | |||
@@ -78,14 +86,17 @@ class Models(object): | |||
gcnncrf = 'gcnn-crf' | |||
bart = 'bart' | |||
gpt3 = 'gpt3' | |||
gpt_moe = 'gpt-moe' | |||
gpt_neo = 'gpt-neo' | |||
plug = 'plug' | |||
bert_for_ds = 'bert-for-document-segmentation' | |||
ponet_for_ds = 'ponet-for-document-segmentation' | |||
ponet = 'ponet' | |||
T5 = 'T5' | |||
mglm = 'mglm' | |||
codegeex = 'codegeex' | |||
bloom = 'bloom' | |||
unite = 'unite' | |||
# audio models | |||
sambert_hifigan = 'sambert-hifigan' | |||
@@ -152,6 +163,8 @@ class Pipelines(object): | |||
image_denoise = 'nafnet-image-denoise' | |||
person_image_cartoon = 'unet-person-image-cartoon' | |||
ocr_detection = 'resnet18-ocr-detection' | |||
table_recognition = 'dla34-table-recognition' | |||
license_plate_detection = 'resnet18-license-plate-detection' | |||
action_recognition = 'TAdaConv_action-recognition' | |||
animal_recognition = 'resnet101-animal-recognition' | |||
general_recognition = 'resnet101-general-recognition' | |||
@@ -166,17 +179,23 @@ class Pipelines(object): | |||
easycv_segmentation = 'easycv-segmentation' | |||
face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment' | |||
salient_detection = 'u2net-salient-detection' | |||
salient_boudary_detection = 'res2net-salient-detection' | |||
camouflaged_detection = 'res2net-camouflaged-detection' | |||
image_classification = 'image-classification' | |||
face_detection = 'resnet-face-detection-scrfd10gkps' | |||
card_detection = 'resnet-card-detection-scrfd34gkps' | |||
ulfd_face_detection = 'manual-face-detection-ulfd' | |||
tinymog_face_detection = 'manual-face-detection-tinymog' | |||
facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' | |||
facial_landmark_confidence = 'manual-facial-landmark-confidence-flcm' | |||
face_attribute_recognition = 'resnet34-face-attribute-recognition-fairface' | |||
retina_face_detection = 'resnet50-face-detection-retinaface' | |||
mog_face_detection = 'resnet101-face-detection-cvpr22papermogface' | |||
mtcnn_face_detection = 'manual-face-detection-mtcnn' | |||
live_category = 'live-category' | |||
general_image_classification = 'vit-base_image-classification_ImageNet-labels' | |||
daily_image_classification = 'vit-base_image-classification_Dailylife-labels' | |||
nextvit_small_daily_image_classification = 'nextvit-small_image-classification_Dailylife-labels' | |||
image_color_enhance = 'csrnet-image-color-enhance' | |||
virtual_try_on = 'virtual-try-on' | |||
image_colorization = 'unet-image-colorization' | |||
@@ -187,6 +206,8 @@ class Pipelines(object): | |||
realtime_object_detection = 'cspnet_realtime-object-detection_yolox' | |||
realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo' | |||
face_recognition = 'ir101-face-recognition-cfglint' | |||
arc_face_recognition = 'ir50-face-recognition-arcface' | |||
mask_face_recognition = 'resnet-face-recognition-facemask' | |||
image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation' | |||
image2image_translation = 'image-to-image-translation' | |||
live_category = 'live-category' | |||
@@ -205,6 +226,7 @@ class Pipelines(object): | |||
video_summarization = 'googlenet_pgl_video_summarization' | |||
language_guided_video_summarization = 'clip-it-video-summarization' | |||
image_semantic_segmentation = 'image-semantic-segmentation' | |||
image_depth_estimation = 'image-depth-estimation' | |||
image_reid_person = 'passvitb-image-reid-person' | |||
image_inpainting = 'fft-inpainting' | |||
text_driven_segmentation = 'text-driven-segmentation' | |||
@@ -219,6 +241,7 @@ class Pipelines(object): | |||
product_segmentation = 'product-segmentation' | |||
image_body_reshaping = 'flow-based-body-reshaping' | |||
referring_video_object_segmentation = 'referring-video-object-segmentation' | |||
video_human_matting = 'video-human-matting' | |||
# nlp tasks | |||
automatic_post_editing = 'automatic-post-editing' | |||
@@ -248,6 +271,7 @@ class Pipelines(object): | |||
text_error_correction = 'text-error-correction' | |||
plug_generation = 'plug-generation' | |||
gpt3_generation = 'gpt3-generation' | |||
gpt_moe_generation = 'gpt-moe-generation' | |||
faq_question_answering = 'faq-question-answering' | |||
conversational_text_to_sql = 'conversational-text-to-sql' | |||
table_question_answering_pipeline = 'table-question-answering-pipeline' | |||
@@ -255,6 +279,7 @@ class Pipelines(object): | |||
text_ranking = 'text-ranking' | |||
relation_extraction = 'relation-extraction' | |||
document_segmentation = 'document-segmentation' | |||
extractive_summarization = 'extractive-summarization' | |||
feature_extraction = 'feature-extraction' | |||
mglm_text_summarization = 'mglm-text-summarization' | |||
codegeex_code_translation = 'codegeex-code-translation' | |||
@@ -263,6 +288,7 @@ class Pipelines(object): | |||
translation_en_to_ro = 'translation_en_to_ro' # keep it underscore | |||
translation_en_to_fr = 'translation_en_to_fr' # keep it underscore | |||
token_classification = 'token-classification' | |||
translation_evaluation = 'translation-evaluation' | |||
# audio tasks | |||
sambert_hifigan_tts = 'sambert-hifigan-tts' | |||
@@ -285,6 +311,7 @@ class Pipelines(object): | |||
video_multi_modal_embedding = 'video-multi-modal-embedding' | |||
image_text_retrieval = 'image-text-retrieval' | |||
ofa_ocr_recognition = 'ofa-ocr-recognition' | |||
ofa_asr = 'ofa-asr' | |||
# science tasks | |||
protein_structure = 'unifold-protein-structure' | |||
@@ -318,6 +345,7 @@ class Trainers(object): | |||
image_inpainting = 'image-inpainting' | |||
referring_video_object_segmentation = 'referring-video-object-segmentation' | |||
image_classification_team = 'image-classification-team' | |||
image_classification = 'image-classification' | |||
# nlp trainers | |||
bert_sentiment_analysis = 'bert-sentiment-analysis' | |||
@@ -327,6 +355,8 @@ class Trainers(object): | |||
nlp_veco_trainer = 'nlp-veco-trainer' | |||
nlp_text_ranking_trainer = 'nlp-text-ranking-trainer' | |||
text_generation_trainer = 'text-generation-trainer' | |||
nlp_plug_trainer = 'nlp-plug-trainer' | |||
gpt3_trainer = 'nlp-gpt3-trainer' | |||
# audio trainers | |||
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' | |||
@@ -352,6 +382,7 @@ class Preprocessors(object): | |||
image_portrait_enhancement_preprocessor = 'image-portrait-enhancement-preprocessor' | |||
video_summarization_preprocessor = 'video-summarization-preprocessor' | |||
movie_scene_segmentation_preprocessor = 'movie-scene-segmentation-preprocessor' | |||
image_classification_bypass_preprocessor = 'image-classification-bypass-preprocessor' | |||
# nlp preprocessor | |||
sen_sim_tokenizer = 'sen-sim-tokenizer' | |||
@@ -388,6 +419,7 @@ class Preprocessors(object): | |||
feature_extraction = 'feature-extraction' | |||
mglm_summarization = 'mglm-summarization' | |||
sentence_piece = 'sentence-piece' | |||
translation_evaluation = 'translation-evaluation-preprocessor' | |||
# audio preprocessor | |||
linear_aec_fbank = 'linear-aec-fbank' | |||
@@ -489,6 +521,10 @@ class Hooks(object): | |||
# CLIP logit_scale clamp | |||
ClipClampLogitScaleHook = 'ClipClampLogitScaleHook' | |||
# train | |||
EarlyStopHook = 'EarlyStopHook' | |||
DeepspeedHook = 'DeepspeedHook' | |||
class LR_Schedulers(object): | |||
"""learning rate scheduler is defined here | |||
@@ -19,18 +19,27 @@ from .builder import METRICS, MetricKeys | |||
class SequenceClassificationMetric(Metric): | |||
"""The metric computation class for sequence classification tasks. | |||
This metric class calculates accuracy of the whole input batches. | |||
This metric class calculates the accuracy/F1 over all input batches. | |||
Args: | |||
label_name: The key of the label column in the 'inputs' arg. | |||
logit_name: The key of the logits column in the 'outputs' arg. | |||
""" | |||
def __init__(self, *args, **kwargs): | |||
def __init__(self, | |||
label_name=OutputKeys.LABELS, | |||
logit_name=OutputKeys.LOGITS, | |||
*args, | |||
**kwargs): | |||
super().__init__(*args, **kwargs) | |||
self.preds = [] | |||
self.labels = [] | |||
self.label_name = label_name | |||
self.logit_name = logit_name | |||
def add(self, outputs: Dict, inputs: Dict): | |||
label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS | |||
ground_truths = inputs[label_name] | |||
eval_results = outputs[OutputKeys.LOGITS] | |||
ground_truths = inputs[self.label_name] | |||
eval_results = outputs[self.logit_name] | |||
self.preds.append( | |||
torch_nested_numpify(torch_nested_detach(eval_results))) | |||
self.labels.append( | |||
@@ -18,16 +18,22 @@ class TextGenerationMetric(Metric): | |||
"""The metric computation class for text generation classes. | |||
This metric class calculates the ROUGE F1 score over the whole evaluation dataset. | |||
Args: | |||
target_text: The key of the target text column in the `inputs` arg. | |||
pred_text: The key of the predicted text column in the `outputs` arg. | |||
""" | |||
def __init__(self): | |||
def __init__(self, target_text='tgts', pred_text='preds'): | |||
self.preds: List[str] = [] | |||
self.tgts: List[str] = [] | |||
self.rouge = Rouge() | |||
self.target_text = target_text | |||
self.pred_text = pred_text | |||
def add(self, outputs: Dict[str, List[str]], inputs: Dict[str, List[str]]): | |||
ground_truths = inputs['tgts'] | |||
eval_results = outputs['preds'] | |||
ground_truths = inputs[self.target_text] | |||
eval_results = outputs[self.pred_text] | |||
for truth in ground_truths: | |||
self.tgts.append(rebuild_chinese_str(truth)) | |||
for result in eval_results: | |||
@@ -38,7 +44,7 @@ class TextGenerationMetric(Metric): | |||
def remove_useless(string: str) -> str: | |||
return string.replace(' ', '').replace('.', '') | |||
return remove_useless(pred) and remove_useless(tgt) | |||
return len(remove_useless(pred)) != 0 and len(remove_useless(tgt)) != 0 | |||
def evaluate(self): | |||
assert self.preds, 'preds in TextGenerationMetric must not be empty!' | |||
@@ -21,20 +21,16 @@ class TokenClassificationMetric(Metric): | |||
This metric class uses seqeval to calculate the scores. | |||
Args: | |||
return_entity_level_metrics (bool, *optional*): | |||
label_name(str, `optional`): The key of the label column in the 'inputs' arg. | |||
logit_name(str, `optional`): The key of the logits column in the 'outputs' arg. | |||
return_entity_level_metrics (bool, `optional`): | |||
Whether to return every label's detail metrics, default False. | |||
label2id(dict, `optional`): The label2id information to get the token labels. | |||
""" | |||
def add(self, outputs: Dict, inputs: Dict): | |||
label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS | |||
ground_truths = inputs[label_name] | |||
eval_results = outputs[OutputKeys.LOGITS] | |||
self.preds.append( | |||
torch_nested_numpify(torch_nested_detach(eval_results))) | |||
self.labels.append( | |||
torch_nested_numpify(torch_nested_detach(ground_truths))) | |||
def __init__(self, | |||
label_name=OutputKeys.LABELS, | |||
logit_name=OutputKeys.LOGITS, | |||
return_entity_level_metrics=False, | |||
label2id=None, | |||
*args, | |||
@@ -44,6 +40,16 @@ class TokenClassificationMetric(Metric): | |||
self.preds = [] | |||
self.labels = [] | |||
self.label2id = label2id | |||
self.label_name = label_name | |||
self.logit_name = logit_name | |||
def add(self, outputs: Dict, inputs: Dict): | |||
ground_truths = inputs[self.label_name] | |||
eval_results = outputs[self.logit_name] | |||
self.preds.append( | |||
torch_nested_numpify(torch_nested_detach(eval_results))) | |||
self.labels.append( | |||
torch_nested_numpify(torch_nested_detach(ground_truths))) | |||
def evaluate(self): | |||
label2id = self.label2id | |||
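All three metrics above follow the same accumulation contract: `add()` stashes numpified predictions and labels batch by batch, and `evaluate()` reduces them at the end. A hedged single-batch usage sketch with the new configurable keys (construction details beyond this diff are assumptions):

```python
import torch

# Hedged example of the add()/evaluate() contract; the default key names
# ('labels'/'logits') and the no-arg construction are assumptions based
# on the defaults introduced in this diff.
metric = SequenceClassificationMetric()
inputs = {'labels': torch.tensor([1, 0])}
outputs = {'logits': torch.tensor([[0.1, 0.9], [0.8, 0.2]])}
metric.add(outputs, inputs)
print(metric.evaluate())  # e.g. {'accuracy': 1.0, ...}
```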
@@ -5,10 +5,11 @@ from abc import ABC, abstractmethod | |||
from typing import Any, Callable, Dict, List, Optional, Union | |||
from modelscope.hub.snapshot_download import snapshot_download | |||
from modelscope.models.builder import MODELS, build_model | |||
from modelscope.utils.checkpoint import save_checkpoint, save_pretrained | |||
from modelscope.models.builder import build_model | |||
from modelscope.utils.checkpoint import (save_checkpoint, save_configuration, | |||
save_pretrained) | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile, Tasks | |||
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, ModelFile | |||
from modelscope.utils.device import verify_device | |||
from modelscope.utils.logger import get_logger | |||
@@ -94,6 +95,10 @@ class Model(ABC): | |||
if prefetched is not None: | |||
kwargs.pop('model_prefetched') | |||
invoked_by = kwargs.get(Invoke.KEY) | |||
if invoked_by is not None: | |||
kwargs.pop(Invoke.KEY) | |||
if osp.exists(model_name_or_path): | |||
local_model_dir = model_name_or_path | |||
else: | |||
@@ -101,7 +106,13 @@ class Model(ABC): | |||
raise RuntimeError( | |||
'Expecting model is pre-fetched locally, but is not found.' | |||
) | |||
local_model_dir = snapshot_download(model_name_or_path, revision) | |||
if invoked_by is not None: | |||
invoked_by = '%s/%s' % (Invoke.KEY, invoked_by) | |||
else: | |||
invoked_by = '%s/%s' % (Invoke.KEY, Invoke.PRETRAINED) | |||
local_model_dir = snapshot_download( | |||
model_name_or_path, revision, user_agent=invoked_by) | |||
logger.info(f'initialize model from {local_model_dir}') | |||
if cfg_dict is not None: | |||
cfg = cfg_dict | |||
@@ -119,11 +130,9 @@ class Model(ABC): | |||
model_cfg[k] = v | |||
if device is not None: | |||
model_cfg.device = device | |||
model = build_model( | |||
model_cfg, task_name=task_name, default_args=kwargs) | |||
model = build_model(model_cfg, task_name=task_name) | |||
else: | |||
model = build_model( | |||
model_cfg, task_name=task_name, default_args=kwargs) | |||
model = build_model(model_cfg, task_name=task_name) | |||
# dynamically add pipeline info to model for pipeline inference | |||
if hasattr(cfg, 'pipeline'): | |||
@@ -132,7 +141,9 @@ class Model(ABC): | |||
if not hasattr(model, 'cfg'): | |||
model.cfg = cfg | |||
model_cfg.pop('model_dir', None) | |||
model.name = model_name_or_path | |||
model.model_dir = local_model_dir | |||
return model | |||
def save_pretrained(self, | |||
@@ -140,6 +151,7 @@ class Model(ABC): | |||
save_checkpoint_names: Union[str, List[str]] = None, | |||
save_function: Callable = save_checkpoint, | |||
config: Optional[dict] = None, | |||
save_config_function: Callable = save_configuration, | |||
**kwargs): | |||
"""save the pretrained model, its configuration and other related files to a directory, | |||
so that it can be re-loaded | |||
@@ -157,18 +169,15 @@ class Model(ABC): | |||
config (Optional[dict], optional): | |||
The config for the configuration.json, might not be identical with model.config | |||
save_config_function (Callable, optional): | |||
The function to use to save the configuration. | |||
""" | |||
if config is None and hasattr(self, 'cfg'): | |||
config = self.cfg | |||
assert config is not None, 'Cannot save the model because the model config is empty.' | |||
if isinstance(config, Config): | |||
config = config.to_dict() | |||
if 'preprocessor' in config and config['preprocessor'] is not None: | |||
if 'mode' in config['preprocessor']: | |||
config['preprocessor']['mode'] = 'inference' | |||
elif 'val' in config['preprocessor'] and 'mode' in config[ | |||
'preprocessor']['val']: | |||
config['preprocessor']['val']['mode'] = 'inference' | |||
if config is not None: | |||
save_config_function(target_folder, config) | |||
save_pretrained(self, target_folder, save_checkpoint_names, | |||
save_function, config, **kwargs) | |||
save_function, **kwargs) |
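A hedged usage sketch of the updated flow (the model id and target path are placeholders):

```python
from modelscope.models import Model

# Placeholders: any valid ModelScope model id and writable directory work.
model = Model.from_pretrained(
    'damo/nlp_structbert_sentence-similarity_chinese-base')
# Configuration saving is now delegated to save_config_function, which
# defaults to save_configuration, so a plain call keeps the old behaviour.
model.save_pretrained('./my_finetuned_model')
```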
@@ -6,7 +6,7 @@ import torch | |||
from modelscope.models.base.base_head import Head | |||
from modelscope.utils.logger import get_logger | |||
logger = get_logger(__name__) | |||
logger = get_logger() | |||
class TorchHead(Head, torch.nn.Module): | |||
@@ -6,10 +6,11 @@ import torch | |||
from torch import nn | |||
from modelscope.utils.file_utils import func_receive_dict_inputs | |||
from modelscope.utils.hub import parse_label_mapping | |||
from modelscope.utils.logger import get_logger | |||
from .base_model import Model | |||
logger = get_logger(__name__) | |||
logger = get_logger() | |||
class TorchModel(Model, torch.nn.Module): | |||
@@ -5,11 +5,14 @@ import os.path as osp | |||
import shutil | |||
import subprocess | |||
import uuid | |||
from tempfile import TemporaryDirectory | |||
from urllib.parse import urlparse | |||
import cv2 | |||
import numpy as np | |||
import onnxruntime as rt | |||
from modelscope.hub.file_download import http_get_file | |||
from modelscope.models import Model | |||
from modelscope.utils.constant import Devices | |||
from modelscope.utils.device import verify_device | |||
@@ -22,8 +25,9 @@ class ActionDetONNX(Model): | |||
model_file = osp.join(config['model_file']) | |||
device_type, device_id = verify_device(self._device_name) | |||
options = rt.SessionOptions() | |||
options.intra_op_num_threads = 1 | |||
options.inter_op_num_threads = 1 | |||
op_num_threads = config.get('op_num_threads', 1) | |||
options.intra_op_num_threads = op_num_threads | |||
options.inter_op_num_threads = op_num_threads | |||
if device_type == Devices.gpu: | |||
sess = rt.InferenceSession( | |||
model_file, | |||
@@ -84,37 +88,43 @@ class ActionDetONNX(Model): | |||
def forward_video(self, video_name, scale): | |||
min_size, max_size = self._get_sizes(scale) | |||
tmp_dir = osp.join( | |||
self.tmp_dir, | |||
str(uuid.uuid1()) + '_' + osp.basename(video_name)[:-4]) | |||
if osp.exists(tmp_dir): | |||
shutil.rmtree(tmp_dir) | |||
os.makedirs(tmp_dir) | |||
url_parsed = urlparse(video_name) | |||
frame_rate = 2 | |||
cmd = f'ffmpeg -y -loglevel quiet -ss 0 -t {self.video_length_limit}' + \ | |||
f' -i {video_name} -r {frame_rate} -f image2 {tmp_dir}/%06d.jpg' | |||
cmd = cmd.split(' ') | |||
subprocess.call(cmd) | |||
frame_names = [ | |||
osp.join(tmp_dir, name) for name in sorted(os.listdir(tmp_dir)) | |||
if name.endswith('.jpg') | |||
] | |||
frame_names = [ | |||
frame_names[i:i + frame_rate * 2] | |||
for i in range(0, | |||
len(frame_names) - frame_rate * 2 + 1, frame_rate | |||
* self.temporal_stride) | |||
] | |||
timestamp = list( | |||
range(1, | |||
len(frame_names) * self.temporal_stride, | |||
self.temporal_stride)) | |||
batch_imgs = [self.parse_frames(names) for names in frame_names] | |||
shutil.rmtree(tmp_dir) | |||
with TemporaryDirectory() as temporary_cache_dir: | |||
if url_parsed.scheme in ('file', '') and osp.exists( | |||
url_parsed.path): | |||
local_video_name = video_name | |||
else: | |||
random_str = str(uuid.uuid1()) | |||
http_get_file( | |||
url=video_name, | |||
local_dir=temporary_cache_dir, | |||
file_name=random_str, | |||
headers={}, | |||
cookies=None) | |||
local_video_name = osp.join(temporary_cache_dir, random_str) | |||
cmd = f'ffmpeg -y -loglevel quiet -ss 0 -t {self.video_length_limit}' + \ | |||
f' -i {local_video_name} -r {frame_rate} -f' + \ | |||
f' image2 {temporary_cache_dir}/%06d_out.jpg' | |||
cmd = cmd.split(' ') | |||
subprocess.call(cmd) | |||
frame_names = [ | |||
osp.join(temporary_cache_dir, name) | |||
for name in sorted(os.listdir(temporary_cache_dir)) | |||
if name.endswith('_out.jpg') | |||
] | |||
frame_names = [ | |||
frame_names[i:i + frame_rate * 2] | |||
for i in range(0, | |||
len(frame_names) - frame_rate * 2 | |||
+ 1, frame_rate * self.temporal_stride) | |||
] | |||
timestamp = list( | |||
range(1, | |||
len(frame_names) * self.temporal_stride, | |||
self.temporal_stride)) | |||
batch_imgs = [self.parse_frames(names) for names in frame_names] | |||
N, _, T, H, W = batch_imgs[0].shape | |||
scale_min = min_size / min(H, W) | |||
h, w = min(int(scale_min * H), | |||
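The thread-count change swaps hard-coded session threading for a config knob. A standalone sketch of the same onnxruntime pattern (the model path and config dict are placeholders):

```python
import onnxruntime as rt

# Standalone sketch of the configurable threading pattern from the diff;
# 'model.onnx' and the config dict are placeholders.
config = {'op_num_threads': 4}
options = rt.SessionOptions()
op_num_threads = config.get('op_num_threads', 1)  # default keeps the old behaviour
options.intra_op_num_threads = op_num_threads  # parallelism inside one operator
options.inter_op_num_threads = op_num_threads  # parallelism across operators
sess = rt.InferenceSession(
    'model.onnx', sess_options=options, providers=['CPUExecutionProvider'])
```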
@@ -224,8 +224,8 @@ class BodyKeypointsDetection3D(TorchModel): | |||
lst_pose2d_cannoical.append(pose2d_canonical[:, | |||
i - pad:i + pad + 1]) | |||
input_pose2d_rr = torch.concat(lst_pose2d_cannoical, axis=0) | |||
input_pose2d_cannoical = torch.concat(lst_pose2d_cannoical, axis=0) | |||
input_pose2d_rr = torch.cat(lst_pose2d_cannoical, axis=0) | |||
input_pose2d_cannoical = torch.cat(lst_pose2d_cannoical, axis=0) | |||
if self.cfg.model.MODEL.USE_CANONICAL_COORDS: | |||
input_pose2d_abs = input_pose2d_cannoical.clone() | |||
@@ -0,0 +1,20 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .fair_face import FaceAttributeRecognition | |||
else: | |||
_import_structure = {'fair_face': ['FaceAttributeRecognition']} | |||
import sys | |||
sys.modules[__name__] = LazyImportModule( | |||
__name__, | |||
globals()['__file__'], | |||
_import_structure, | |||
module_spec=__spec__, | |||
extra_objects={}, | |||
) |
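This is the repository's standard lazy-import boilerplate: the `TYPE_CHECKING` branch keeps static analysis working, while at runtime `fair_face` is imported only when the attribute is first touched. Usage looks identical to an eager import (the package path below is inferred from context, since the diff omits file names):

```python
# The submodule is loaded only at this access, not at package import time.
# Package path inferred from context; the diff does not show file names.
from modelscope.models.cv.face_attribute_recognition import FaceAttributeRecognition
```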
@@ -0,0 +1,2 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .face_attribute_recognition import FaceAttributeRecognition |
@@ -0,0 +1,79 @@ | |||
# The implementation is based on FairFace, available at | |||
# https://github.com/dchen236/FairFace | |||
import os | |||
import cv2 | |||
import numpy as np | |||
import torch | |||
import torch.backends.cudnn as cudnn | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import torchvision | |||
from PIL import Image | |||
from torch.autograd import Variable | |||
from torchvision import datasets, models, transforms | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base import Tensor, TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
@MODELS.register_module( | |||
Tasks.face_attribute_recognition, module_name=Models.fairface) | |||
class FaceAttributeRecognition(TorchModel): | |||
def __init__(self, model_path, device='cuda'): | |||
super().__init__(model_path) | |||
cudnn.benchmark = True | |||
self.model_path = model_path | |||
self.device = device | |||
self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE, | |||
ModelFile.CONFIGURATION) | |||
fair_face = torchvision.models.resnet34(pretrained=False) | |||
fair_face.fc = nn.Linear(fair_face.fc.in_features, 18) | |||
self.net = fair_face | |||
self.load_model() | |||
self.net = self.net.to(device) | |||
self.trans = transforms.Compose([ | |||
transforms.ToPILImage(), | |||
transforms.Resize((224, 224)), | |||
transforms.ToTensor(), | |||
transforms.Normalize( | |||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |||
]) | |||
def load_model(self, load_to_cpu=False): | |||
pretrained_dict = torch.load( | |||
self.model_path, map_location=torch.device('cpu')) | |||
self.net.load_state_dict(pretrained_dict, strict=True) | |||
self.net.eval() | |||
def forward(self, img): | |||
""" FariFace model forward process. | |||
Args: | |||
img: [h, w, c] | |||
Return: | |||
list of attribute result: [gender_score, age_score] | |||
""" | |||
img = cv2.cvtColor(img.cpu().numpy(), cv2.COLOR_BGR2RGB) | |||
img = img.astype(np.uint8) | |||
inputs = self.trans(img) | |||
c, h, w = inputs.shape | |||
inputs = inputs.view(-1, c, h, w) | |||
inputs = inputs.to(self.device) | |||
with torch.no_grad():  # Variable(volatile=True) is deprecated; no_grad is the modern equivalent | |||
outputs = self.net(inputs)[0] | |||
gender_outputs = outputs[7:9] | |||
age_outputs = outputs[9:18] | |||
gender_score = F.softmax(gender_outputs, dim=0).cpu().tolist() | |||
age_score = F.softmax(age_outputs, dim=0).cpu().tolist() | |||
return [gender_score, age_score] |
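Callers still need to turn the two softmax vectors into labels. A hedged decoding sketch using the buckets published by the upstream FairFace repo (the label order is an assumption, since the diff never names the classes):

```python
import numpy as np

# Hedged decoding sketch; label order follows the upstream FairFace repo
# and is an assumption, not something this diff defines.
GENDERS = ['Male', 'Female']
AGE_BUCKETS = ['0-2', '3-9', '10-19', '20-29', '30-39',
               '40-49', '50-59', '60-69', '70+']

def decode(gender_score, age_score):
    gender = GENDERS[int(np.argmax(gender_score))]
    age = AGE_BUCKETS[int(np.argmax(age_score))]
    return gender, age
```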
@@ -9,13 +9,14 @@ if TYPE_CHECKING: | |||
from .retinaface import RetinaFaceDetection | |||
from .ulfd_slim import UlfdFaceDetector | |||
from .scrfd import ScrfdDetect | |||
from .scrfd import TinyMogDetect | |||
else: | |||
_import_structure = { | |||
'ulfd_slim': ['UlfdFaceDetector'], | |||
'retinaface': ['RetinaFaceDetection'], | |||
'mtcnn': ['MtcnnFaceDetector'], | |||
'mogface': ['MogFaceDetector'], | |||
'scrfd': ['ScrfdDetect'] | |||
'scrfd': ['TinyMogDetect', 'ScrfdDetect'], | |||
} | |||
import sys | |||
@@ -1,2 +1,3 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .scrfd_detect import ScrfdDetect | |||
from .tinymog_detect import TinyMogDetect |
@@ -2,6 +2,7 @@ | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones | |||
""" | |||
from .mobilenet import MobileNetV1 | |||
from .resnet import ResNetV1e | |||
__all__ = ['ResNetV1e'] | |||
__all__ = ['ResNetV1e', 'MobileNetV1'] |
@@ -0,0 +1,99 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones/mobilenet.py | |||
""" | |||
import torch | |||
import torch.nn as nn | |||
from mmcv.cnn import (build_conv_layer, build_norm_layer, build_plugin_layer, | |||
constant_init, kaiming_init) | |||
from mmcv.runner import load_checkpoint | |||
from mmdet.models.builder import BACKBONES | |||
from mmdet.utils import get_root_logger | |||
from torch.nn.modules.batchnorm import _BatchNorm | |||
@BACKBONES.register_module() | |||
class MobileNetV1(nn.Module): | |||
def __init__(self, | |||
in_channels=3, | |||
block_cfg=None, | |||
num_stages=4, | |||
out_indices=(0, 1, 2, 3)): | |||
super(MobileNetV1, self).__init__() | |||
self.out_indices = out_indices | |||
def conv_bn(inp, oup, stride): | |||
return nn.Sequential( | |||
nn.Conv2d(inp, oup, 3, stride, 1, bias=False), | |||
nn.BatchNorm2d(oup), nn.ReLU(inplace=True)) | |||
def conv_dw(inp, oup, stride): | |||
return nn.Sequential( | |||
nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), | |||
nn.BatchNorm2d(inp), | |||
nn.ReLU(inplace=True), | |||
nn.Conv2d(inp, oup, 1, 1, 0, bias=False), | |||
nn.BatchNorm2d(oup), | |||
nn.ReLU(inplace=True), | |||
) | |||
if block_cfg is None: | |||
stage_planes = [8, 16, 32, 64, 128, 256] | |||
stage_blocks = [2, 4, 4, 2] | |||
else: | |||
stage_planes = block_cfg['stage_planes'] | |||
stage_blocks = block_cfg['stage_blocks'] | |||
assert len(stage_planes) == 6 | |||
assert len(stage_blocks) == 4 | |||
self.stem = nn.Sequential( | |||
conv_bn(3, stage_planes[0], 2), | |||
conv_dw(stage_planes[0], stage_planes[1], 1), | |||
) | |||
self.stage_layers = [] | |||
for i, num_blocks in enumerate(stage_blocks): | |||
_layers = [] | |||
for n in range(num_blocks): | |||
if n == 0: | |||
_layer = conv_dw(stage_planes[i + 1], stage_planes[i + 2], | |||
2) | |||
else: | |||
_layer = conv_dw(stage_planes[i + 2], stage_planes[i + 2], | |||
1) | |||
_layers.append(_layer) | |||
_block = nn.Sequential(*_layers) | |||
layer_name = f'layer{i + 1}' | |||
self.add_module(layer_name, _block) | |||
self.stage_layers.append(layer_name) | |||
def forward(self, x): | |||
output = [] | |||
x = self.stem(x) | |||
for i, layer_name in enumerate(self.stage_layers): | |||
stage_layer = getattr(self, layer_name) | |||
x = stage_layer(x) | |||
if i in self.out_indices: | |||
output.append(x) | |||
return tuple(output) | |||
def init_weights(self, pretrained=None): | |||
"""Initialize the weights in backbone. | |||
Args: | |||
pretrained (str, optional): Path to pre-trained weights. | |||
Defaults to None. | |||
""" | |||
if isinstance(pretrained, str): | |||
logger = get_root_logger() | |||
load_checkpoint(self, pretrained, strict=False, logger=logger) | |||
elif pretrained is None: | |||
for m in self.modules(): | |||
if isinstance(m, nn.Conv2d): | |||
kaiming_init(m) | |||
elif isinstance(m, (_BatchNorm, nn.GroupNorm)): | |||
constant_init(m, 1) | |||
else: | |||
raise TypeError('pretrained must be a str or None') |
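`conv_dw` is the classic MobileNetV1 depthwise-separable factorization: a 3x3 per-channel convolution followed by a 1x1 pointwise convolution, which shrinks per-pixel multiply-adds to roughly 1/oup + 1/9 of a dense 3x3 convolution. A quick check of that ratio:

```python
# Quick arithmetic check of the depthwise-separable saving for one layer.
inp, oup, k = 64, 128, 3
dense = k * k * inp * oup            # dense 3x3 conv, per output pixel
separable = k * k * inp + inp * oup  # depthwise 3x3 + pointwise 1x1
print(separable / dense)             # ~= 1/oup + 1/(k*k) ~= 0.119
```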
@@ -3,5 +3,6 @@ The implementation here is modified based on insightface, originally MIT license | |||
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors | |||
""" | |||
from .scrfd import SCRFD | |||
from .tinymog import TinyMog | |||
__all__ = ['SCRFD'] | |||
__all__ = ['SCRFD', 'TinyMog'] |
@@ -0,0 +1,148 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors/scrfd.py | |||
""" | |||
import numpy as np  # used below for the keypoint-variance computation | |||
import torch | |||
from mmdet.models.builder import DETECTORS | |||
from mmdet.models.detectors.single_stage import SingleStageDetector | |||
from ....mmdet_patch.core.bbox import bbox2result | |||
@DETECTORS.register_module() | |||
class TinyMog(SingleStageDetector): | |||
def __init__(self, | |||
backbone, | |||
neck, | |||
bbox_head, | |||
train_cfg=None, | |||
test_cfg=None, | |||
pretrained=None): | |||
super(TinyMog, self).__init__(backbone, neck, bbox_head, train_cfg, | |||
test_cfg, pretrained) | |||
def forward_train(self, | |||
img, | |||
img_metas, | |||
gt_bboxes, | |||
gt_labels, | |||
gt_keypointss=None, | |||
gt_bboxes_ignore=None): | |||
""" | |||
Args: | |||
img (Tensor): Input images of shape (N, C, H, W). | |||
Typically these should be mean centered and std scaled. | |||
img_metas (list[dict]): A List of image info dict where each dict | |||
has: 'img_shape', 'scale_factor', 'flip', and may also contain | |||
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
For details on the values of these keys see | |||
:class:`mmdet.datasets.pipelines.Collect`. | |||
gt_bboxes (list[Tensor]): Each item is the ground-truth boxes for one | |||
image in [tl_x, tl_y, br_x, br_y] format. | |||
gt_labels (list[Tensor]): Class indices corresponding to each box. | |||
gt_keypointss (None | list[Tensor]): Ground-truth keypoints for each | |||
image, if available. | |||
gt_bboxes_ignore (None | list[Tensor]): Specify which bounding | |||
boxes can be ignored when computing the loss. | |||
Returns: | |||
dict[str, Tensor]: A dictionary of loss components. | |||
""" | |||
super(SingleStageDetector, self).forward_train(img, img_metas) | |||
x = self.extract_feat(img) | |||
losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, | |||
gt_labels, gt_keypointss, | |||
gt_bboxes_ignore) | |||
return losses | |||
def simple_test(self, | |||
img, | |||
img_metas, | |||
rescale=False, | |||
repeat_head=1, | |||
output_kps_var=0, | |||
output_results=1): | |||
"""Test function without test time augmentation. | |||
Args: | |||
img (torch.Tensor): Input images of shape (N, C, H, W). | |||
img_metas (list[dict]): List of image information. | |||
rescale (bool, optional): Whether to rescale the results. | |||
Defaults to False. | |||
repeat_head (int): number of times to repeat inference in the head | |||
output_kps_var (int): whether to output keypoint variance as a quality estimate | |||
output_results (int): 0: nothing, 1: bbox only, 2: both bbox and kps | |||
Returns: | |||
list[list[np.ndarray]]: BBox results of each image and classes. | |||
The outer list corresponds to each image. The inner list | |||
corresponds to each class. | |||
""" | |||
x = self.extract_feat(img) | |||
assert repeat_head >= 1 | |||
kps_out0 = [] | |||
kps_out1 = [] | |||
kps_out2 = [] | |||
for i in range(repeat_head): | |||
outs = self.bbox_head(x) | |||
kps_out0 += [outs[2][0].detach().cpu().numpy()] | |||
kps_out1 += [outs[2][1].detach().cpu().numpy()] | |||
kps_out2 += [outs[2][2].detach().cpu().numpy()] | |||
if output_kps_var: | |||
var0 = np.var(np.vstack(kps_out0), axis=0).mean() | |||
var1 = np.var(np.vstack(kps_out1), axis=0).mean() | |||
var2 = np.var(np.vstack(kps_out2), axis=0).mean() | |||
var = np.mean([var0, var1, var2]) | |||
else: | |||
var = None | |||
if output_results > 0: | |||
if torch.onnx.is_in_onnx_export(): | |||
cls_score, bbox_pred, kps_pred = outs | |||
for c in cls_score: | |||
print(c.shape) | |||
for c in bbox_pred: | |||
print(c.shape) | |||
if self.bbox_head.use_kps: | |||
for c in kps_pred: | |||
print(c.shape) | |||
return (cls_score, bbox_pred, kps_pred) | |||
else: | |||
return (cls_score, bbox_pred) | |||
bbox_list = self.bbox_head.get_bboxes( | |||
*outs, img_metas, rescale=rescale) | |||
# return kps if use_kps | |||
if len(bbox_list[0]) == 2: | |||
bbox_results = [ | |||
bbox2result(det_bboxes, det_labels, | |||
self.bbox_head.num_classes) | |||
for det_bboxes, det_labels in bbox_list | |||
] | |||
elif len(bbox_list[0]) == 3: | |||
if output_results == 2: | |||
bbox_results = [ | |||
bbox2result( | |||
det_bboxes, | |||
det_labels, | |||
self.bbox_head.num_classes, | |||
kps=det_kps, | |||
num_kps=self.bbox_head.NK) | |||
for det_bboxes, det_labels, det_kps in bbox_list | |||
] | |||
elif output_results == 1: | |||
bbox_results = [ | |||
bbox2result(det_bboxes, det_labels, | |||
self.bbox_head.num_classes) | |||
for det_bboxes, det_labels, _ in bbox_list | |||
] | |||
else: | |||
bbox_results = None | |||
if var is not None: | |||
return bbox_results, var | |||
else: | |||
return bbox_results | |||
def feature_test(self, img): | |||
x = self.extract_feat(img) | |||
outs = self.bbox_head(x) | |||
return outs |
@@ -0,0 +1,67 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os.path as osp | |||
from copy import deepcopy | |||
from typing import Any, Dict | |||
import torch | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.outputs import OutputKeys | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from modelscope.utils.logger import get_logger | |||
logger = get_logger() | |||
__all__ = ['TinyMogDetect'] | |||
@MODELS.register_module(Tasks.face_detection, module_name=Models.tinymog) | |||
class TinyMogDetect(TorchModel): | |||
def __init__(self, model_dir, *args, **kwargs): | |||
""" | |||
Initialize the TinyMog face detection model from the `model_dir` path. | |||
""" | |||
super().__init__(model_dir) | |||
from mmcv import Config | |||
from mmcv.parallel import MMDataParallel | |||
from mmcv.runner import load_checkpoint | |||
from mmdet.models import build_detector | |||
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset | |||
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop | |||
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e | |||
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead | |||
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD | |||
cfg = Config.fromfile(osp.join(model_dir, 'mmcv_tinymog.py')) | |||
ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
cfg.model.test_cfg.score_thr = kwargs.get('score_thr', 0.3) | |||
detector = build_detector(cfg.model) | |||
logger.info(f'loading model from {ckpt_path}') | |||
load_checkpoint(detector, ckpt_path, map_location='cpu') | |||
detector = MMDataParallel(detector) | |||
detector.eval() | |||
self.detector = detector | |||
logger.info('load model done') | |||
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
result = self.detector( | |||
return_loss=False, | |||
rescale=True, | |||
img=[input['img'][0].unsqueeze(0)], | |||
img_metas=[[dict(input['img_metas'][0].data)]], | |||
output_results=2) | |||
assert result is not None | |||
result = result[0][0] | |||
bboxes = result[:, :4].tolist() | |||
kpss = result[:, 5:].tolist() | |||
scores = result[:, 4].tolist() | |||
return { | |||
OutputKeys.SCORES: scores, | |||
OutputKeys.BOXES: bboxes, | |||
OutputKeys.KEYPOINTS: kpss | |||
} | |||
def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: | |||
return input |
@@ -0,0 +1,200 @@ | |||
# The implementation is adapted from TFace, made publicly available under the Apache-2.0 license at | |||
# https://github.com/deepinsight/insightface/blob/master/recognition/arcface_torch/backbones/iresnet.py | |||
import torch | |||
from torch import nn | |||
from torch.utils.checkpoint import checkpoint | |||
using_ckpt = False | |||
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): | |||
"""3x3 convolution with padding""" | |||
return nn.Conv2d( | |||
in_planes, | |||
out_planes, | |||
kernel_size=3, | |||
stride=stride, | |||
padding=dilation, | |||
groups=groups, | |||
bias=False, | |||
dilation=dilation) | |||
def conv1x1(in_planes, out_planes, stride=1): | |||
"""1x1 convolution""" | |||
return nn.Conv2d( | |||
in_planes, out_planes, kernel_size=1, stride=stride, bias=False) | |||
class IBasicBlock(nn.Module): | |||
expansion = 1 | |||
def __init__(self, | |||
inplanes, | |||
planes, | |||
stride=1, | |||
downsample=None, | |||
groups=1, | |||
base_width=64, | |||
dilation=1): | |||
super(IBasicBlock, self).__init__() | |||
if groups != 1 or base_width != 64: | |||
raise ValueError( | |||
'BasicBlock only supports groups=1 and base_width=64') | |||
if dilation > 1: | |||
raise NotImplementedError( | |||
'Dilation > 1 not supported in BasicBlock') | |||
self.bn1 = nn.BatchNorm2d( | |||
inplanes, | |||
eps=1e-05, | |||
) | |||
self.conv1 = conv3x3(inplanes, planes) | |||
self.bn2 = nn.BatchNorm2d( | |||
planes, | |||
eps=1e-05, | |||
) | |||
self.prelu = nn.PReLU(planes) | |||
self.conv2 = conv3x3(planes, planes, stride) | |||
self.bn3 = nn.BatchNorm2d( | |||
planes, | |||
eps=1e-05, | |||
) | |||
self.downsample = downsample | |||
self.stride = stride | |||
def forward(self, x): | |||
identity = x | |||
out = self.bn1(x) | |||
out = self.conv1(out) | |||
out = self.bn2(out) | |||
out = self.prelu(out) | |||
out = self.conv2(out) | |||
out = self.bn3(out) | |||
if self.downsample is not None: | |||
identity = self.downsample(x) | |||
out += identity | |||
return out | |||
class IResNet(nn.Module): | |||
fc_scale = 7 * 7 | |||
def __init__(self, | |||
block, | |||
layers, | |||
dropout=0, | |||
num_features=512, | |||
zero_init_residual=False, | |||
groups=1, | |||
width_per_group=64, | |||
replace_stride_with_dilation=None, | |||
fp16=False): | |||
super(IResNet, self).__init__() | |||
self.extra_gflops = 0.0 | |||
self.fp16 = fp16 | |||
self.inplanes = 64 | |||
self.dilation = 1 | |||
if replace_stride_with_dilation is None: | |||
replace_stride_with_dilation = [False, False, False] | |||
if len(replace_stride_with_dilation) != 3: | |||
raise ValueError('replace_stride_with_dilation should be None ' | |||
'or a 3-element tuple, got {}'.format( | |||
replace_stride_with_dilation)) | |||
self.groups = groups | |||
self.base_width = width_per_group | |||
self.conv1 = nn.Conv2d( | |||
3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) | |||
self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05) | |||
self.prelu = nn.PReLU(self.inplanes) | |||
self.layer1 = self._make_layer(block, 64, layers[0], stride=2) | |||
self.layer2 = self._make_layer( | |||
block, | |||
128, | |||
layers[1], | |||
stride=2, | |||
dilate=replace_stride_with_dilation[0]) | |||
self.layer3 = self._make_layer( | |||
block, | |||
256, | |||
layers[2], | |||
stride=2, | |||
dilate=replace_stride_with_dilation[1]) | |||
self.layer4 = self._make_layer( | |||
block, | |||
512, | |||
layers[3], | |||
stride=2, | |||
dilate=replace_stride_with_dilation[2]) | |||
self.bn2 = nn.BatchNorm2d( | |||
512 * block.expansion, | |||
eps=1e-05, | |||
) | |||
self.dropout = nn.Dropout(p=dropout, inplace=True) | |||
self.fc = nn.Linear(512 * block.expansion * self.fc_scale, | |||
num_features) | |||
self.features = nn.BatchNorm1d(num_features, eps=1e-05) | |||
nn.init.constant_(self.features.weight, 1.0) | |||
self.features.weight.requires_grad = False | |||
for m in self.modules(): | |||
if isinstance(m, nn.Conv2d): | |||
nn.init.normal_(m.weight, 0, 0.1) | |||
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): | |||
nn.init.constant_(m.weight, 1) | |||
nn.init.constant_(m.bias, 0) | |||
if zero_init_residual: | |||
for m in self.modules(): | |||
if isinstance(m, IBasicBlock): | |||
nn.init.constant_(m.bn2.weight, 0) | |||
def _make_layer(self, block, planes, blocks, stride=1, dilate=False): | |||
downsample = None | |||
previous_dilation = self.dilation | |||
if dilate: | |||
self.dilation *= stride | |||
stride = 1 | |||
if stride != 1 or self.inplanes != planes * block.expansion: | |||
downsample = nn.Sequential( | |||
conv1x1(self.inplanes, planes * block.expansion, stride), | |||
nn.BatchNorm2d( | |||
planes * block.expansion, | |||
eps=1e-05, | |||
), | |||
) | |||
layers = [] | |||
layers.append( | |||
block(self.inplanes, planes, stride, downsample, self.groups, | |||
self.base_width, previous_dilation)) | |||
self.inplanes = planes * block.expansion | |||
for _ in range(1, blocks): | |||
layers.append( | |||
block( | |||
self.inplanes, | |||
planes, | |||
groups=self.groups, | |||
base_width=self.base_width, | |||
dilation=self.dilation)) | |||
return nn.Sequential(*layers) | |||
def forward(self, x): | |||
with torch.cuda.amp.autocast(self.fp16): | |||
x = self.conv1(x) | |||
x = self.bn1(x) | |||
x = self.prelu(x) | |||
x = self.layer1(x) | |||
x = self.layer2(x) | |||
x = self.layer3(x) | |||
x = self.layer4(x) | |||
x = self.bn2(x) | |||
x = torch.flatten(x, 1) | |||
x = self.dropout(x) | |||
x = self.fc(x.float() if self.fp16 else x) | |||
x = self.features(x) | |||
return x | |||
def _iresnet(arch, layers): | |||
model = IResNet(IBasicBlock, layers) | |||
return model |
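if __name__ == '__main__':
    # minimal smoke-test sketch (hypothetical depths; `_iresnet` ignores `arch`
    # and only uses `layers`): four stride-2 stages reduce 112 -> 7, matching
    # fc_scale = 7 * 7, so a 112x112 face crop maps to a 512-d embedding
    model = _iresnet('iresnet18', [2, 2, 2, 2]).eval()
    emb = model(torch.randn(1, 3, 112, 112))
    assert emb.shape == (1, 512)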
@@ -0,0 +1,213 @@ | |||
# The implementation is adopted from InsightFace, made publicly available under the Apache-2.0 license at
# https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py | |||
from collections import namedtuple | |||
import torch | |||
import torch.nn.functional as F | |||
from torch import nn | |||
from torch.nn import (AdaptiveAvgPool2d, AvgPool2d, BatchNorm1d, BatchNorm2d, | |||
Conv2d, Dropout, Dropout2d, Linear, MaxPool2d, Module, | |||
Parameter, PReLU, ReLU, Sequential, Sigmoid) | |||
class Flatten(Module): | |||
def forward(self, input): | |||
return input.view(input.size(0), -1) | |||
class SEModule(Module): | |||
def __init__(self, channels, reduction): | |||
super(SEModule, self).__init__() | |||
self.avg_pool = AdaptiveAvgPool2d(1) | |||
self.fc1 = Conv2d( | |||
channels, | |||
channels // reduction, | |||
kernel_size=1, | |||
padding=0, | |||
bias=False) | |||
self.relu = ReLU(inplace=True) | |||
self.fc2 = Conv2d( | |||
channels // reduction, | |||
channels, | |||
kernel_size=1, | |||
padding=0, | |||
bias=False) | |||
self.sigmoid = Sigmoid() | |||
def forward(self, x): | |||
module_input = x | |||
x = self.avg_pool(x) | |||
x = self.fc1(x) | |||
x = self.relu(x) | |||
x = self.fc2(x) | |||
x = self.sigmoid(x) | |||
return module_input * x | |||
class BottleneckIR(Module): | |||
def __init__(self, in_channel, depth, stride): | |||
super(BottleneckIR, self).__init__() | |||
if in_channel == depth: | |||
self.shortcut_layer = MaxPool2d(1, stride) | |||
else: | |||
self.shortcut_layer = Sequential( | |||
Conv2d(in_channel, depth, (1, 1), stride, bias=False), | |||
BatchNorm2d(depth)) | |||
self.res_layer = Sequential( | |||
BatchNorm2d(in_channel), | |||
Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), | |||
PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), | |||
BatchNorm2d(depth)) | |||
def forward(self, x): | |||
shortcut = self.shortcut_layer(x) | |||
res = self.res_layer(x) | |||
return res + shortcut | |||
class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): | |||
'''A named tuple describing a ResNet block.''' | |||
def get_block(in_channel, depth, num_units, stride=2): | |||
return [Bottleneck(in_channel, depth, stride) | |||
] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] | |||
def get_blocks(num_layers): | |||
if num_layers == 50: | |||
blocks = [ | |||
get_block(in_channel=64, depth=64, num_units=3), | |||
get_block(in_channel=64, depth=128, num_units=4), | |||
get_block(in_channel=128, depth=256, num_units=14), | |||
get_block(in_channel=256, depth=512, num_units=3) | |||
] | |||
elif num_layers == 100: | |||
blocks = [ | |||
get_block(in_channel=64, depth=64, num_units=3), | |||
get_block(in_channel=64, depth=128, num_units=13), | |||
get_block(in_channel=128, depth=256, num_units=30), | |||
get_block(in_channel=256, depth=512, num_units=3) | |||
] | |||
elif num_layers == 152: | |||
blocks = [ | |||
get_block(in_channel=64, depth=64, num_units=3), | |||
get_block(in_channel=64, depth=128, num_units=8), | |||
get_block(in_channel=128, depth=256, num_units=36), | |||
get_block(in_channel=256, depth=512, num_units=3) | |||
] | |||
elif num_layers == 252: | |||
blocks = [ | |||
get_block(in_channel=64, depth=64, num_units=6), | |||
get_block(in_channel=64, depth=128, num_units=21), | |||
get_block(in_channel=128, depth=256, num_units=66), | |||
get_block(in_channel=256, depth=512, num_units=6) | |||
] | |||
return blocks | |||
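# e.g. get_blocks(100) yields 3 + 13 + 30 + 3 = 49 BottleneckIR units (the
# IR-100 layout), and get_blocks(252) yields 6 + 21 + 66 + 6 = 99 units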
class IResNet(Module): | |||
def __init__(self, | |||
dropout=0, | |||
num_features=512, | |||
zero_init_residual=False, | |||
groups=1, | |||
width_per_group=64, | |||
replace_stride_with_dilation=None, | |||
fp16=False, | |||
with_wcd=False, | |||
wrs_M=400, | |||
wrs_q=0.9): | |||
super(IResNet, self).__init__() | |||
num_layers = 252 | |||
mode = 'ir' | |||
        assert num_layers in [50, 100, 152,
                              252], 'num_layers should be 50, 100, 152 or 252'
assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se' | |||
self.fc_scale = 7 * 7 | |||
num_features = 512 | |||
self.fp16 = fp16 | |||
drop_ratio = 0.0 | |||
self.with_wcd = with_wcd | |||
if self.with_wcd: | |||
self.wrs_M = wrs_M | |||
self.wrs_q = wrs_q | |||
blocks = get_blocks(num_layers) | |||
if mode == 'ir': | |||
unit_module = BottleneckIR | |||
self.input_layer = Sequential( | |||
Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64), | |||
PReLU(64)) | |||
self.bn2 = nn.BatchNorm2d( | |||
512, | |||
eps=1e-05, | |||
) | |||
self.dropout = nn.Dropout(p=drop_ratio, inplace=True) | |||
self.fc = nn.Linear(512 * self.fc_scale, num_features) | |||
self.features = nn.BatchNorm1d(num_features, eps=1e-05) | |||
nn.init.constant_(self.features.weight, 1.0) | |||
self.features.weight.requires_grad = False | |||
modules = [] | |||
for block in blocks: | |||
for bottleneck in block: | |||
modules.append( | |||
unit_module(bottleneck.in_channel, bottleneck.depth, | |||
bottleneck.stride)) | |||
self.body = Sequential(*modules) | |||
def forward(self, x): | |||
with torch.cuda.amp.autocast(self.fp16): | |||
x = self.input_layer(x) | |||
x = self.body(x) | |||
x = self.bn2(x) | |||
if self.with_wcd: | |||
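                # weighted channel dropout: keep wrs_M of C channels per
                # sample, drawn with probability proportional to the channel's
                # mean absolute activation (weighted reservoir sampling via
                # key = r ** (1 / score)), randomly re-drop with rate
                # 1 - wrs_q, then rescale by alpha so the expected feature
                # magnitude is preserved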
B = x.size()[0] | |||
C = x.size()[1] | |||
x_abs = torch.abs(x) | |||
score = torch.nn.functional.adaptive_avg_pool2d(x_abs, | |||
1).reshape( | |||
(B, C)) | |||
r = torch.rand((B, C), device=x.device) | |||
key = torch.pow(r, 1. / score) | |||
_, topidx = torch.topk(key, self.wrs_M, dim=1) | |||
mask = torch.zeros_like(key, dtype=torch.float32) | |||
mask.scatter_(1, topidx, 1.) | |||
maskq = torch.rand((B, C), device=x.device) | |||
maskq_ones = torch.ones_like(maskq, dtype=torch.float32) | |||
maskq_zeros = torch.zeros_like(maskq, dtype=torch.float32) | |||
maskq_m = torch.where(maskq < self.wrs_q, maskq_ones, | |||
maskq_zeros) | |||
new_mask = mask * maskq_m | |||
score_sum = torch.sum(score, dim=1, keepdim=True) | |||
selected_score_sum = torch.sum( | |||
new_mask * score, dim=1, keepdim=True) | |||
alpha = score_sum / (selected_score_sum + 1e-6) | |||
alpha = alpha.reshape((B, 1, 1, 1)) | |||
new_mask = new_mask.reshape((B, C, 1, 1)) | |||
x = x * new_mask * alpha | |||
x = torch.flatten(x, 1) | |||
x = self.dropout(x) | |||
x = self.fc(x.float() if self.fp16 else x) | |||
x = self.features(x) | |||
return x | |||
def iresnet286(pretrained=False, progress=True, **kwargs): | |||
model = IResNet( | |||
dropout=0, | |||
num_features=512, | |||
zero_init_residual=False, | |||
groups=1, | |||
width_per_group=64, | |||
replace_stride_with_dilation=None, | |||
fp16=False, | |||
with_wcd=False, | |||
wrs_M=400, | |||
wrs_q=0.9) | |||
return model |
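if __name__ == '__main__':
    # minimal smoke-test sketch: the fixed 252-layer IR configuration stacks
    # 6 + 21 + 66 + 6 = 99 BottleneckIR units and maps a 112x112 face crop to
    # a 512-d embedding (112 is halved by the four stride-2 groups to 7)
    net = iresnet286().eval()
    emb = net(torch.randn(1, 3, 112, 112))
    assert emb.shape == (1, 512)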
@@ -0,0 +1,20 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .flc import FacialLandmarkConfidence | |||
else: | |||
_import_structure = {'flc': ['FacialLandmarkConfidence']} | |||
import sys | |||
sys.modules[__name__] = LazyImportModule( | |||
__name__, | |||
globals()['__file__'], | |||
_import_structure, | |||
module_spec=__spec__, | |||
extra_objects={}, | |||
) |
@@ -0,0 +1,2 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .facial_landmark_confidence import FacialLandmarkConfidence |
@@ -0,0 +1,94 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import cv2 | |||
import numpy as np | |||
import torch | |||
import torch.backends.cudnn as cudnn | |||
import torch.nn.functional as F | |||
from PIL import Image | |||
from torch.autograd import Variable | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base import Tensor, TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from .manual_landmark_net import LandmarkConfidence | |||
@MODELS.register_module( | |||
Tasks.facial_landmark_confidence, module_name=Models.flc) | |||
class FacialLandmarkConfidence(TorchModel): | |||
def __init__(self, model_path, device='cuda'): | |||
super().__init__(model_path) | |||
cudnn.benchmark = True | |||
self.model_path = model_path | |||
self.device = device | |||
self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE, | |||
ModelFile.CONFIGURATION) | |||
self.landmark_count = 5 | |||
self.net = LandmarkConfidence(landmark_count=self.landmark_count) | |||
self.load_model() | |||
self.net = self.net.to(device) | |||
def load_model(self, load_to_cpu=False): | |||
pretrained_dict = torch.load( | |||
self.model_path, map_location=torch.device('cpu'))['state_dict'] | |||
pretrained_dict['rp_net.binary_cls.weight'] = 32.0 * F.normalize( | |||
pretrained_dict['rp_net.binary_cls.weight'], dim=1).t() | |||
self.net.load_state_dict(pretrained_dict, strict=True) | |||
self.net.eval() | |||
def forward(self, input): | |||
img_org = input['orig_img'] | |||
bbox = input['bbox'] | |||
img_org = img_org.cpu().numpy() | |||
image_height = img_org.shape[0] | |||
image_width = img_org.shape[1] | |||
x1 = max(0, int(bbox[0])) | |||
y1 = max(0, int(bbox[1])) | |||
x2 = min(image_width, int(bbox[2])) | |||
y2 = min(image_height, int(bbox[3])) | |||
box_w = x2 - x1 + 1 | |||
box_h = y2 - y1 + 1 | |||
if box_h > box_w: | |||
delta = box_h - box_w | |||
dy = edy = 0 | |||
dx = delta // 2 | |||
edx = delta - dx | |||
else: | |||
dx = edx = 0 | |||
delta = box_w - box_h | |||
dy = delta // 2 | |||
edy = delta - dy | |||
cv_img = img_org[y1:y2, x1:x2] | |||
if dx > 0 or dy > 0 or edx > 0 or edy > 0: | |||
            cv_img = cv2.copyMakeBorder(cv_img, dy, edy, dx, edx,
                                        cv2.BORDER_CONSTANT, value=0)
inter_x = cv_img.shape[1] | |||
inter_y = cv_img.shape[0] | |||
cv_img = cv2.resize(cv_img, (120, 120)) | |||
cv_img = cv_img.transpose((2, 0, 1)) | |||
input_blob = torch.from_numpy(cv_img[np.newaxis, :, :, :].astype( | |||
np.float32)) | |||
tmp_conf_lms, tmp_feat, tmp_conf_resp, tmp_nose = self.net( | |||
input_blob.to(self.device)) | |||
conf_lms = tmp_conf_lms.cpu().numpy().squeeze() | |||
feat = tmp_feat.cpu().numpy().squeeze() | |||
pts5pt = [] | |||
for i in range(feat.shape[0]): | |||
if i < self.landmark_count: | |||
pts5pt.append(feat[i] * inter_x - dx + x1) | |||
else: | |||
pts5pt.append(feat[i] * inter_y - dy + y1) | |||
lm5pt = np.array(pts5pt).reshape(2, 5).T | |||
return lm5pt, conf_lms |
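# Usage sketch (hypothetical paths; `orig_img` is an HWC uint8 image tensor
# and `bbox` an [x1, y1, x2, y2] face box, normally supplied by the pipeline):
#
#   model = FacialLandmarkConfidence('/path/to/pytorch_model.pt', device='cpu')
#   lm5pt, conf = model({'orig_img': img_tensor, 'bbox': [40, 40, 200, 200]})
#   # lm5pt: (5, 2) landmarks in original-image coordinates,
#   # conf: landmark confidence score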
@@ -0,0 +1,152 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import math | |||
import torch | |||
import torch.nn.functional as F | |||
from torch.nn import (AdaptiveAvgPool2d, BatchNorm2d, Conv2d, Linear, | |||
MaxPool2d, Module, Parameter, ReLU, Sequential) | |||
class LandmarkConfidence(Module): | |||
def __init__(self, landmark_count=5): | |||
super(LandmarkConfidence, self).__init__() | |||
self.landmark_net = LandmarkNetD(landmark_count) | |||
self.landmark_net.eval() | |||
self.cls_net = ClassNet() | |||
self.cls_net.eval() | |||
self.rp_net = RespiratorNet() | |||
def forward(self, x): | |||
feat, nose_feat, lms = self.landmark_net(x) | |||
cls_respirator, nose = self.rp_net(feat, nose_feat) | |||
confidence = self.cls_net(feat) | |||
return confidence, lms, cls_respirator, nose | |||
class FC(Module): | |||
def __init__(self, feat_dim=256, num_class=2): | |||
super(FC, self).__init__() | |||
self.weight = Parameter( | |||
torch.zeros(num_class, feat_dim, dtype=torch.float32)) | |||
def forward(self, x): | |||
cos_theta = F.linear(x, self.weight) | |||
return F.softmax(cos_theta, dim=1) | |||
class Flatten(Module): | |||
def forward(self, x): | |||
return torch.flatten(x, 1) | |||
class RespiratorNet(Module): | |||
def __init__(self): | |||
super(RespiratorNet, self).__init__() | |||
self.conv1 = Sequential( | |||
Conv2d(48, 48, 3, 2, 1), BatchNorm2d(48), ReLU(True)) | |||
self.conv2 = AdaptiveAvgPool2d( | |||
(1, 1) | |||
) # Sequential(Conv2d(48, 48, 5, 1, 0), BatchNorm2d(48), ReLU(True)) | |||
self.binary_cls = FC(feat_dim=48, num_class=2) | |||
self.nose_layer = Sequential( | |||
Conv2d(48, 64, 3, 1, 0), BatchNorm2d(64), ReLU(True), | |||
Conv2d(64, 64, 3, 1, 0), BatchNorm2d(64), ReLU(True), Flatten(), | |||
Linear(64, 96), ReLU(True), Linear(96, 6)) | |||
def train(self, mode=True): | |||
self.conv1.train(mode) | |||
self.conv2.train(mode) | |||
# self.nose_feat.train(mode) | |||
self.nose_layer.train(mode) | |||
self.binary_cls.train(mode) | |||
def forward(self, x, y): | |||
x = self.conv1(x) | |||
x = self.conv2(x) | |||
cls = self.binary_cls(torch.flatten(x, 1)) | |||
# loc = self.nose_feat(y) | |||
loc = self.nose_layer(y) | |||
return cls, loc | |||
class ClassNet(Module): | |||
def __init__(self): | |||
super(ClassNet, self).__init__() | |||
self.conv1 = Sequential( | |||
Conv2d(48, 48, 3, 1, 1), BatchNorm2d(48), ReLU(True)) | |||
self.conv2 = Sequential( | |||
Conv2d(48, 54, 3, 2, 1), BatchNorm2d(54), ReLU(True)) | |||
self.conv3 = Sequential( | |||
Conv2d(54, 54, 5, 1, 0), BatchNorm2d(54), ReLU(True)) | |||
self.fc1 = Sequential(Flatten(), Linear(54, 54), ReLU(True)) | |||
self.fc2 = Linear(54, 1) | |||
def forward(self, x): | |||
y = self.conv1(x) | |||
y = self.conv2(y) | |||
y = self.conv3(y) | |||
y = self.fc1(y) | |||
y = self.fc2(y) | |||
return y | |||
class LandmarkNetD(Module): | |||
def __init__(self, landmark_count=5): | |||
super(LandmarkNetD, self).__init__() | |||
self.conv_pre = Sequential( | |||
Conv2d(3, 16, 5, 2, 0), BatchNorm2d(16), ReLU(True)) | |||
self.pool_pre = MaxPool2d(2, 2) # output is 29 | |||
self.conv1 = Sequential( | |||
Conv2d(16, 32, 3, 1, 1), BatchNorm2d(32), ReLU(True), | |||
Conv2d(32, 32, 3, 1, 1), BatchNorm2d(32), ReLU(True)) | |||
self.pool1 = MaxPool2d(2, 2) # 14 | |||
self.conv2 = Sequential( | |||
Conv2d(32, 48, 3, 1, 0), BatchNorm2d(48), ReLU(True), | |||
Conv2d(48, 48, 3, 1, 0), BatchNorm2d(48), ReLU(True)) | |||
self.pool2 = MaxPool2d(2, 2) # 5 | |||
self.conv3 = Sequential( | |||
Conv2d(48, 80, 3, 1, 0), BatchNorm2d(80), ReLU(True), | |||
Conv2d(80, 80, 3, 1, 0), BatchNorm2d(80), ReLU(True)) | |||
self.fc1 = Sequential(Linear(80, 128), ReLU(True)) | |||
self.fc2 = Sequential(Linear(128, 128), ReLU(True)) | |||
self.output = Linear(128, landmark_count * 2) | |||
def _initialize_weights(self): | |||
for m in self.modules(): | |||
if isinstance(m, Conv2d): | |||
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels | |||
m.weight.data.normal_(0, math.sqrt(2. / n)) | |||
if m.bias is not None: | |||
m.bias.data.zero_() | |||
elif isinstance(m, BatchNorm2d): | |||
m.weight.data.fill_(1) | |||
m.bias.data.zero_() | |||
elif isinstance(m, Linear): | |||
n = m.weight.size(1) | |||
m.weight.data.normal_(0, 0.01) | |||
m.bias.data.zero_() | |||
def forward(self, x): | |||
y = self.conv_pre(x) | |||
y = self.pool_pre(y) | |||
y = self.conv1(y) | |||
y = self.pool1(y[:, :, :28, :28]) | |||
feat = self.conv2(y) | |||
y2 = self.pool2(feat) | |||
y = self.conv3(y2) | |||
y = torch.flatten(y, 1) | |||
y = self.fc1(y) | |||
y = self.fc2(y) | |||
y = self.output(y) | |||
return feat, y2, y |
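if __name__ == '__main__':
    # shape-trace sketch for the expected 120x120 crop: conv_pre/pool_pre give
    # 29x29, conv1 output is cropped to 28 and pooled to 14, conv2 (two valid
    # 3x3) gives the 10x10 `feat`, pool2 the 5x5 `y2`, and conv3 collapses to
    # 1x1 before the 128-d FC head regresses 2 * landmark_count coordinates
    net = LandmarkNetD(landmark_count=5).eval()
    feat, y2, lms = net(torch.randn(1, 3, 120, 120))
    assert feat.shape == (1, 48, 10, 10)
    assert y2.shape == (1, 48, 5, 5)
    assert lms.shape == (1, 10)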
@@ -0,0 +1,2 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .nextvit import NextViT |
@@ -0,0 +1,541 @@ | |||
# Part of the implementation is borrowed and modified from Next-ViT, | |||
# publicly available at https://github.com/bytedance/Next-ViT | |||
import collections.abc | |||
import itertools | |||
import math | |||
import os | |||
import warnings | |||
from functools import partial | |||
from typing import Dict, Sequence | |||
import torch | |||
import torch.nn as nn | |||
from einops import rearrange | |||
from mmcls.models.backbones.base_backbone import BaseBackbone | |||
from mmcls.models.builder import BACKBONES | |||
from mmcv.cnn.bricks import DropPath, build_activation_layer, build_norm_layer | |||
from mmcv.runner import BaseModule | |||
from torch.nn.modules.batchnorm import _BatchNorm | |||
NORM_EPS = 1e-5 | |||
def _no_grad_trunc_normal_(tensor, mean, std, a, b): | |||
# Cut & paste from PyTorch official master until it's in a few official releases - RW | |||
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf | |||
def norm_cdf(x): | |||
# Computes standard normal cumulative distribution function | |||
return (1. + math.erf(x / math.sqrt(2.))) / 2. | |||
if (mean < a - 2 * std) or (mean > b + 2 * std): | |||
warnings.warn( | |||
'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' | |||
'The distribution of values may be incorrect.', | |||
stacklevel=2) | |||
with torch.no_grad(): | |||
# Values are generated by using a truncated uniform distribution and | |||
# then using the inverse CDF for the normal distribution. | |||
# Get upper and lower cdf values | |||
ll = norm_cdf((a - mean) / std) | |||
u = norm_cdf((b - mean) / std) | |||
# Uniformly fill tensor with values from [ll, u], then translate to | |||
# [2ll-1, 2u-1]. | |||
tensor.uniform_(2 * ll - 1, 2 * u - 1) | |||
# Use inverse cdf transform for normal distribution to get truncated | |||
# standard normal | |||
tensor.erfinv_() | |||
# Transform to proper mean, std | |||
tensor.mul_(std * math.sqrt(2.)) | |||
tensor.add_(mean) | |||
# Clamp to ensure it's in the proper range | |||
tensor.clamp_(min=a, max=b) | |||
return tensor | |||
def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): | |||
return _no_grad_trunc_normal_(tensor, mean, std, a, b) | |||
class ConvBNReLU(nn.Module): | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
kernel_size, | |||
stride, | |||
groups=1): | |||
super(ConvBNReLU, self).__init__() | |||
self.conv = nn.Conv2d( | |||
in_channels, | |||
out_channels, | |||
kernel_size=kernel_size, | |||
stride=stride, | |||
padding=1, | |||
groups=groups, | |||
bias=False) | |||
self.norm = nn.BatchNorm2d(out_channels, eps=NORM_EPS) | |||
self.act = nn.ReLU(inplace=True) | |||
def forward(self, x): | |||
x = self.conv(x) | |||
x = self.norm(x) | |||
x = self.act(x) | |||
return x | |||
def _make_divisible(v, divisor, min_value=None): | |||
if min_value is None: | |||
min_value = divisor | |||
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) | |||
# Make sure that round down does not go down by more than 10%. | |||
if new_v < 0.9 * v: | |||
new_v += divisor | |||
return new_v | |||
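# e.g. _make_divisible(48, 32) == 64 and _make_divisible(192, 32) == 192:
# round to the nearest multiple of `divisor`, then round up once more if the
# result fell below 90% of the original value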
class PatchEmbed(nn.Module): | |||
def __init__(self, in_channels, out_channels, stride=1): | |||
super(PatchEmbed, self).__init__() | |||
norm_layer = partial(nn.BatchNorm2d, eps=NORM_EPS) | |||
if stride == 2: | |||
self.avgpool = nn.AvgPool2d((2, 2), | |||
stride=2, | |||
ceil_mode=True, | |||
count_include_pad=False) | |||
self.conv = nn.Conv2d( | |||
in_channels, out_channels, kernel_size=1, stride=1, bias=False) | |||
self.norm = norm_layer(out_channels) | |||
elif in_channels != out_channels: | |||
self.avgpool = nn.Identity() | |||
self.conv = nn.Conv2d( | |||
in_channels, out_channels, kernel_size=1, stride=1, bias=False) | |||
self.norm = norm_layer(out_channels) | |||
else: | |||
self.avgpool = nn.Identity() | |||
self.conv = nn.Identity() | |||
self.norm = nn.Identity() | |||
def forward(self, x): | |||
return self.norm(self.conv(self.avgpool(x))) | |||
class MHCA(nn.Module): | |||
""" | |||
Multi-Head Convolutional Attention | |||
""" | |||
def __init__(self, out_channels, head_dim): | |||
super(MHCA, self).__init__() | |||
norm_layer = partial(nn.BatchNorm2d, eps=NORM_EPS) | |||
self.group_conv3x3 = nn.Conv2d( | |||
out_channels, | |||
out_channels, | |||
kernel_size=3, | |||
stride=1, | |||
padding=1, | |||
groups=out_channels // head_dim, | |||
bias=False) | |||
self.norm = norm_layer(out_channels) | |||
self.act = nn.ReLU(inplace=True) | |||
self.projection = nn.Conv2d( | |||
out_channels, out_channels, kernel_size=1, bias=False) | |||
def forward(self, x): | |||
out = self.group_conv3x3(x) | |||
out = self.norm(out) | |||
out = self.act(out) | |||
out = self.projection(out) | |||
return out | |||
class Mlp(nn.Module): | |||
def __init__(self, | |||
in_features, | |||
out_features=None, | |||
mlp_ratio=None, | |||
drop=0., | |||
bias=True): | |||
super().__init__() | |||
out_features = out_features or in_features | |||
hidden_dim = _make_divisible(in_features * mlp_ratio, 32) | |||
self.conv1 = nn.Conv2d( | |||
in_features, hidden_dim, kernel_size=1, bias=bias) | |||
self.act = nn.ReLU(inplace=True) | |||
self.conv2 = nn.Conv2d( | |||
hidden_dim, out_features, kernel_size=1, bias=bias) | |||
self.drop = nn.Dropout(drop) | |||
def forward(self, x): | |||
x = self.conv1(x) | |||
x = self.act(x) | |||
x = self.drop(x) | |||
x = self.conv2(x) | |||
x = self.drop(x) | |||
return x | |||
class NCB(nn.Module): | |||
""" | |||
Next Convolution Block | |||
""" | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
stride=1, | |||
path_dropout=0, | |||
drop=0, | |||
head_dim=32, | |||
mlp_ratio=3): | |||
super(NCB, self).__init__() | |||
self.in_channels = in_channels | |||
self.out_channels = out_channels | |||
norm_layer = partial(nn.BatchNorm2d, eps=NORM_EPS) | |||
assert out_channels % head_dim == 0 | |||
self.patch_embed = PatchEmbed(in_channels, out_channels, stride) | |||
self.mhca = MHCA(out_channels, head_dim) | |||
self.attention_path_dropout = DropPath(path_dropout) | |||
self.norm = norm_layer(out_channels) | |||
self.mlp = Mlp(out_channels, mlp_ratio=mlp_ratio, drop=drop, bias=True) | |||
self.mlp_path_dropout = DropPath(path_dropout) | |||
self.is_bn_merged = False | |||
def forward(self, x): | |||
x = self.patch_embed(x) | |||
x = x + self.attention_path_dropout(self.mhca(x)) | |||
if not torch.onnx.is_in_onnx_export() and not self.is_bn_merged: | |||
out = self.norm(x) | |||
else: | |||
out = x | |||
x = x + self.mlp_path_dropout(self.mlp(out)) | |||
return x | |||
class E_MHSA(nn.Module): | |||
""" | |||
Efficient Multi-Head Self Attention | |||
""" | |||
def __init__(self, | |||
dim, | |||
out_dim=None, | |||
head_dim=32, | |||
qkv_bias=True, | |||
qk_scale=None, | |||
attn_drop=0, | |||
proj_drop=0., | |||
sr_ratio=1): | |||
super().__init__() | |||
self.dim = dim | |||
self.out_dim = out_dim if out_dim is not None else dim | |||
self.num_heads = self.dim // head_dim | |||
self.scale = qk_scale or head_dim**-0.5 | |||
self.q = nn.Linear(dim, self.dim, bias=qkv_bias) | |||
self.k = nn.Linear(dim, self.dim, bias=qkv_bias) | |||
self.v = nn.Linear(dim, self.dim, bias=qkv_bias) | |||
self.proj = nn.Linear(self.dim, self.out_dim) | |||
self.attn_drop = nn.Dropout(attn_drop) | |||
self.proj_drop = nn.Dropout(proj_drop) | |||
self.sr_ratio = sr_ratio | |||
self.N_ratio = sr_ratio**2 | |||
if sr_ratio > 1: | |||
self.sr = nn.AvgPool1d( | |||
kernel_size=self.N_ratio, stride=self.N_ratio) | |||
self.norm = nn.BatchNorm1d(dim, eps=NORM_EPS) | |||
self.is_bn_merge = False | |||
def forward(self, x): | |||
B, N, C = x.shape | |||
q = self.q(x) | |||
q = q.reshape(B, N, self.num_heads, | |||
int(C // self.num_heads)).permute(0, 2, 1, 3) | |||
if self.sr_ratio > 1: | |||
x_ = x.transpose(1, 2) | |||
x_ = self.sr(x_) | |||
if not torch.onnx.is_in_onnx_export() and not self.is_bn_merge: | |||
x_ = self.norm(x_) | |||
x_ = x_.transpose(1, 2) | |||
k = self.k(x_) | |||
k = k.reshape(B, -1, self.num_heads, | |||
int(C // self.num_heads)).permute(0, 2, 3, 1) | |||
v = self.v(x_) | |||
v = v.reshape(B, -1, self.num_heads, | |||
int(C // self.num_heads)).permute(0, 2, 1, 3) | |||
else: | |||
k = self.k(x) | |||
k = k.reshape(B, -1, self.num_heads, | |||
int(C // self.num_heads)).permute(0, 2, 3, 1) | |||
v = self.v(x) | |||
v = v.reshape(B, -1, self.num_heads, | |||
int(C // self.num_heads)).permute(0, 2, 1, 3) | |||
attn = (q @ k) * self.scale | |||
attn = attn.softmax(dim=-1) | |||
attn = self.attn_drop(attn) | |||
x = (attn @ v).transpose(1, 2).reshape(B, N, C) | |||
x = self.proj(x) | |||
x = self.proj_drop(x) | |||
return x | |||
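# Sketch: with sr_ratio=2, keys/values are computed on a 4x-pooled token set.
# e.g. E_MHSA(dim=64, head_dim=32, sr_ratio=2).eval() applied to a
# (2, 196, 64) input attends 196 queries over only 49 K/V tokens, cutting the
# attention cost roughly by sr_ratio**2 while the output stays (2, 196, 64).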
class NTB(nn.Module): | |||
""" | |||
Next Transformer Block | |||
""" | |||
def __init__( | |||
self, | |||
in_channels, | |||
out_channels, | |||
path_dropout, | |||
stride=1, | |||
sr_ratio=1, | |||
mlp_ratio=2, | |||
head_dim=32, | |||
mix_block_ratio=0.75, | |||
attn_drop=0, | |||
drop=0, | |||
): | |||
super(NTB, self).__init__() | |||
self.in_channels = in_channels | |||
self.out_channels = out_channels | |||
self.mix_block_ratio = mix_block_ratio | |||
norm_func = partial(nn.BatchNorm2d, eps=NORM_EPS) | |||
self.mhsa_out_channels = _make_divisible( | |||
int(out_channels * mix_block_ratio), 32) | |||
self.mhca_out_channels = out_channels - self.mhsa_out_channels | |||
self.patch_embed = PatchEmbed(in_channels, self.mhsa_out_channels, | |||
stride) | |||
self.norm1 = norm_func(self.mhsa_out_channels) | |||
self.e_mhsa = E_MHSA( | |||
self.mhsa_out_channels, | |||
head_dim=head_dim, | |||
sr_ratio=sr_ratio, | |||
attn_drop=attn_drop, | |||
proj_drop=drop) | |||
self.mhsa_path_dropout = DropPath(path_dropout * mix_block_ratio) | |||
self.projection = PatchEmbed( | |||
self.mhsa_out_channels, self.mhca_out_channels, stride=1) | |||
self.mhca = MHCA(self.mhca_out_channels, head_dim=head_dim) | |||
self.mhca_path_dropout = DropPath(path_dropout * (1 - mix_block_ratio)) | |||
self.norm2 = norm_func(out_channels) | |||
self.mlp = Mlp(out_channels, mlp_ratio=mlp_ratio, drop=drop) | |||
self.mlp_path_dropout = DropPath(path_dropout) | |||
self.is_bn_merged = False | |||
def forward(self, x): | |||
x = self.patch_embed(x) | |||
B, C, H, W = x.shape | |||
if not torch.onnx.is_in_onnx_export() and not self.is_bn_merged: | |||
out = self.norm1(x) | |||
else: | |||
out = x | |||
out = rearrange(out, 'b c h w -> b (h w) c') # b n c | |||
out = self.mhsa_path_dropout(self.e_mhsa(out)) | |||
x = x + rearrange(out, 'b (h w) c -> b c h w', h=H) | |||
out = self.projection(x) | |||
out = out + self.mhca_path_dropout(self.mhca(out)) | |||
x = torch.cat([x, out], dim=1) | |||
if not torch.onnx.is_in_onnx_export() and not self.is_bn_merged: | |||
out = self.norm2(x) | |||
else: | |||
out = x | |||
x = x + self.mlp_path_dropout(self.mlp(out)) | |||
return x | |||
@BACKBONES.register_module() | |||
class NextViT(BaseBackbone): | |||
stem_chs = { | |||
'x_small': [64, 32, 64], | |||
'small': [64, 32, 64], | |||
'base': [64, 32, 64], | |||
'large': [64, 32, 64], | |||
} | |||
depths = { | |||
'x_small': [1, 1, 5, 1], | |||
'small': [3, 4, 10, 3], | |||
'base': [3, 4, 20, 3], | |||
'large': [3, 4, 30, 3], | |||
} | |||
def __init__(self, | |||
arch='small', | |||
path_dropout=0.2, | |||
attn_drop=0, | |||
drop=0, | |||
strides=[1, 2, 2, 2], | |||
sr_ratios=[8, 4, 2, 1], | |||
head_dim=32, | |||
mix_block_ratio=0.75, | |||
resume='', | |||
with_extra_norm=True, | |||
norm_eval=False, | |||
norm_cfg=None, | |||
out_indices=-1, | |||
frozen_stages=-1, | |||
init_cfg=None): | |||
super().__init__(init_cfg=init_cfg) | |||
stem_chs = self.stem_chs[arch] | |||
depths = self.depths[arch] | |||
self.frozen_stages = frozen_stages | |||
self.with_extra_norm = with_extra_norm | |||
self.norm_eval = norm_eval | |||
self.stage1_out_channels = [96] * (depths[0]) | |||
self.stage2_out_channels = [192] * (depths[1] - 1) + [256] | |||
self.stage3_out_channels = [384, 384, 384, 384, 512] * (depths[2] // 5) | |||
self.stage4_out_channels = [768] * (depths[3] - 1) + [1024] | |||
self.stage_out_channels = [ | |||
self.stage1_out_channels, self.stage2_out_channels, | |||
self.stage3_out_channels, self.stage4_out_channels | |||
] | |||
# Next Hybrid Strategy | |||
self.stage1_block_types = [NCB] * depths[0] | |||
self.stage2_block_types = [NCB] * (depths[1] - 1) + [NTB] | |||
self.stage3_block_types = [NCB, NCB, NCB, NCB, NTB] * (depths[2] // 5) | |||
self.stage4_block_types = [NCB] * (depths[3] - 1) + [NTB] | |||
self.stage_block_types = [ | |||
self.stage1_block_types, self.stage2_block_types, | |||
self.stage3_block_types, self.stage4_block_types | |||
] | |||
self.stem = nn.Sequential( | |||
ConvBNReLU(3, stem_chs[0], kernel_size=3, stride=2), | |||
ConvBNReLU(stem_chs[0], stem_chs[1], kernel_size=3, stride=1), | |||
ConvBNReLU(stem_chs[1], stem_chs[2], kernel_size=3, stride=1), | |||
ConvBNReLU(stem_chs[2], stem_chs[2], kernel_size=3, stride=2), | |||
) | |||
input_channel = stem_chs[-1] | |||
features = [] | |||
idx = 0 | |||
dpr = [x.item() for x in torch.linspace(0, path_dropout, sum(depths)) | |||
] # stochastic depth decay rule | |||
for stage_id in range(len(depths)): | |||
numrepeat = depths[stage_id] | |||
output_channels = self.stage_out_channels[stage_id] | |||
block_types = self.stage_block_types[stage_id] | |||
for block_id in range(numrepeat): | |||
if strides[stage_id] == 2 and block_id == 0: | |||
stride = 2 | |||
else: | |||
stride = 1 | |||
output_channel = output_channels[block_id] | |||
block_type = block_types[block_id] | |||
if block_type is NCB: | |||
layer = NCB( | |||
input_channel, | |||
output_channel, | |||
stride=stride, | |||
path_dropout=dpr[idx + block_id], | |||
drop=drop, | |||
head_dim=head_dim) | |||
features.append(layer) | |||
elif block_type is NTB: | |||
layer = NTB( | |||
input_channel, | |||
output_channel, | |||
path_dropout=dpr[idx + block_id], | |||
stride=stride, | |||
sr_ratio=sr_ratios[stage_id], | |||
head_dim=head_dim, | |||
mix_block_ratio=mix_block_ratio, | |||
attn_drop=attn_drop, | |||
drop=drop) | |||
features.append(layer) | |||
input_channel = output_channel | |||
idx += numrepeat | |||
self.features = nn.Sequential(*features) | |||
self.norm = nn.BatchNorm2d(output_channel, eps=NORM_EPS) | |||
if isinstance(out_indices, int): | |||
out_indices = [out_indices] | |||
        assert isinstance(out_indices, Sequence), \
            f'"out_indices" must be a sequence or int, ' \
            f'got {type(out_indices)} instead.'
for i, index in enumerate(out_indices): | |||
if index < 0: | |||
out_indices[i] = sum(depths) + index | |||
assert out_indices[i] >= 0, f'Invalid out_indices {index}' | |||
self.stage_out_idx = out_indices | |||
if norm_cfg is not None: | |||
self = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self) | |||
def init_weights(self): | |||
super(NextViT, self).init_weights() | |||
if (isinstance(self.init_cfg, dict) | |||
and self.init_cfg['type'] == 'Pretrained'): | |||
# Suppress default init if use pretrained model. | |||
return | |||
self._initialize_weights() | |||
def _initialize_weights(self): | |||
for n, m in self.named_modules(): | |||
if isinstance(m, (nn.BatchNorm2d, | |||
nn.BatchNorm1d)): # nn.GroupNorm, nn.LayerNorm, | |||
nn.init.constant_(m.weight, 1.0) | |||
nn.init.constant_(m.bias, 0) | |||
elif isinstance(m, nn.Linear): | |||
trunc_normal_(m.weight, std=.02) | |||
if hasattr(m, 'bias') and m.bias is not None: | |||
nn.init.constant_(m.bias, 0) | |||
elif isinstance(m, nn.Conv2d): | |||
trunc_normal_(m.weight, std=.02) | |||
if hasattr(m, 'bias') and m.bias is not None: | |||
nn.init.constant_(m.bias, 0) | |||
def forward(self, x): | |||
outputs = list() | |||
x = self.stem(x) | |||
stage_id = 0 | |||
for idx, layer in enumerate(self.features): | |||
x = layer(x) | |||
if idx == self.stage_out_idx[stage_id]: | |||
if self.with_extra_norm: | |||
x = self.norm(x) | |||
outputs.append(x) | |||
stage_id += 1 | |||
return tuple(outputs) | |||
def _freeze_stages(self): | |||
if self.frozen_stages > 0: | |||
self.stem.eval() | |||
for param in self.stem.parameters(): | |||
param.requires_grad = False | |||
for idx, layer in enumerate(self.features): | |||
if idx <= self.stage_out_idx[self.frozen_stages - 1]: | |||
layer.eval() | |||
for param in layer.parameters(): | |||
param.requires_grad = False | |||
def train(self, mode=True): | |||
super(NextViT, self).train(mode) | |||
self._freeze_stages() | |||
if mode and self.norm_eval: | |||
for m in self.modules(): | |||
                # trick: eval has effect on BatchNorm only
if isinstance(m, _BatchNorm): | |||
m.eval() |
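if __name__ == '__main__':
    # minimal smoke-test sketch, assuming the mmcls/mmcv modules imported
    # above are available: the default out_indices=-1 returns only the last
    # stage, a 1024-channel map at 1/32 resolution for the 'small' arch
    model = NextViT(arch='small')
    model.eval()
    feats = model(torch.randn(1, 3, 224, 224))
    assert feats[0].shape == (1, 1024, 7, 7)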
@@ -1,9 +1,10 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base.base_torch_model import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.constant import Tasks | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
@MODELS.register_module( | |||
@@ -13,16 +14,25 @@ class ClassificationModel(TorchModel): | |||
def __init__(self, model_dir: str, **kwargs): | |||
import mmcv | |||
from mmcls.models import build_classifier | |||
import modelscope.models.cv.image_classification.backbones | |||
from modelscope.utils.hub import read_config | |||
super().__init__(model_dir) | |||
mm_config = os.path.join(model_dir, 'config.py') | |||
if os.path.exists(mm_config): | |||
cfg = mmcv.Config.fromfile(mm_config) | |||
cfg.model.pretrained = None | |||
self.cls_model = build_classifier(cfg.model) | |||
self.config_type = 'mmcv_config' | |||
else: | |||
cfg = read_config(model_dir) | |||
cfg.model.mm_model.pretrained = None | |||
self.cls_model = build_classifier(cfg.model.mm_model) | |||
self.config_type = 'ms_config' | |||
self.cfg = cfg | |||
self.ms_model_dir = model_dir | |||
self.load_pretrained_checkpoint() | |||
@@ -33,7 +43,13 @@ class ClassificationModel(TorchModel): | |||
def load_pretrained_checkpoint(self): | |||
import mmcv | |||
if os.path.exists( | |||
os.path.join(self.ms_model_dir, ModelFile.TORCH_MODEL_FILE)): | |||
checkpoint_path = os.path.join(self.ms_model_dir, | |||
ModelFile.TORCH_MODEL_FILE) | |||
else: | |||
checkpoint_path = os.path.join(self.ms_model_dir, | |||
'checkpoints.pth') | |||
if os.path.exists(checkpoint_path): | |||
checkpoint = mmcv.runner.load_checkpoint( | |||
self.cls_model, checkpoint_path, map_location='cpu') | |||
@@ -0,0 +1,100 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os.path as osp | |||
import numpy as np | |||
from mmcls.datasets.base_dataset import BaseDataset | |||
def get_trained_checkpoints_name(work_path):
    import os
    file_list = os.listdir(work_path)
    last = 0
    model_name = None
    # prefer the best-metric checkpoint, e.g. best_xxx_12.pth
    for f_name in file_list:
        if 'best_' in f_name and f_name.endswith('.pth'):
            best_epoch = f_name.replace('.pth', '').split('_')[-1]
            if best_epoch.isdigit():
                last = int(best_epoch)
                model_name = f_name
    if model_name is not None:
        return model_name
    # otherwise fall back to the latest epoch checkpoint, e.g. epoch_12.pth
    for f_name in file_list:
        if 'epoch_' in f_name and f_name.endswith('.pth'):
            epoch_num = f_name.replace('epoch_', '').replace('.pth', '')
            if not epoch_num.isdigit():
                continue
            ind = int(epoch_num)
            if ind > last:
                last = ind
                model_name = f_name
    return model_name
def preprocess_transform(cfgs): | |||
if cfgs is None: | |||
return None | |||
for i, cfg in enumerate(cfgs): | |||
if cfg.type == 'Resize': | |||
if isinstance(cfg.size, list): | |||
cfgs[i].size = tuple(cfg.size) | |||
return cfgs | |||
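# e.g. a config entry dict(type='Resize', size=[224, 256]) becomes
# size=(224, 256): mmcv config files deserialize tuples as lists, while the
# mmcls Resize pipeline expects a tuple for fixed (h, w) sizes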
def get_ms_dataset_root(ms_dataset): | |||
if ms_dataset is None or len(ms_dataset) < 1: | |||
return None | |||
try: | |||
data_root = ms_dataset[0]['image:FILE'].split('extracted')[0] | |||
path_post = ms_dataset[0]['image:FILE'].split('extracted')[1].split( | |||
'/') | |||
extracted_data_root = osp.join(data_root, 'extracted', path_post[1], | |||
path_post[2]) | |||
return extracted_data_root | |||
    except Exception as e:
        raise ValueError(f'Dataset Error: {e}')
def get_classes(classes=None): | |||
import mmcv | |||
if isinstance(classes, str): | |||
# take it as a file path | |||
class_names = mmcv.list_from_file(classes) | |||
elif isinstance(classes, (tuple, list)): | |||
class_names = classes | |||
else: | |||
raise ValueError(f'Unsupported type {type(classes)} of classes.') | |||
return class_names | |||
class MmDataset(BaseDataset): | |||
def __init__(self, ms_dataset, pipeline, classes=None, test_mode=False): | |||
self.ms_dataset = ms_dataset | |||
if len(self.ms_dataset) < 1: | |||
raise ValueError('Dataset Error: dataset is empty') | |||
super(MmDataset, self).__init__( | |||
data_prefix='', | |||
pipeline=pipeline, | |||
classes=classes, | |||
test_mode=test_mode) | |||
def load_annotations(self): | |||
        if self.CLASSES is None:
            raise ValueError(
                f'Dataset Error: classesname.txt not found: {self.CLASSES}')
data_infos = [] | |||
for data_info in self.ms_dataset: | |||
filename = data_info['image:FILE'] | |||
gt_label = data_info['category'] | |||
info = {'img_prefix': self.data_prefix} | |||
info['img_info'] = {'filename': filename} | |||
info['gt_label'] = np.array(gt_label, dtype=np.int64) | |||
data_infos.append(info) | |||
return data_infos |
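# Usage sketch (hypothetical records; real ones come from an MsDataset dump):
#
#   records = [{'image:FILE': '/cache/extracted/xxx/train/cat/0.jpg',
#               'category': 0}]
#   ds = MmDataset(records, pipeline=[], classes=['cat', 'dog'])
#   ds.data_infos[0]['gt_label']   # -> array(0, dtype=int64)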
@@ -0,0 +1 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. |
@@ -0,0 +1 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. |
@@ -0,0 +1,215 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from .newcrf_layers import NewCRF | |||
from .swin_transformer import SwinTransformer | |||
from .uper_crf_head import PSP | |||
class NewCRFDepth(nn.Module): | |||
""" | |||
Depth network based on neural window FC-CRFs architecture. | |||
""" | |||
def __init__(self, | |||
version=None, | |||
inv_depth=False, | |||
pretrained=None, | |||
frozen_stages=-1, | |||
min_depth=0.1, | |||
max_depth=100.0, | |||
**kwargs): | |||
super().__init__() | |||
self.inv_depth = inv_depth | |||
self.with_auxiliary_head = False | |||
self.with_neck = False | |||
norm_cfg = dict(type='BN', requires_grad=True) | |||
# norm_cfg = dict(type='GN', requires_grad=True, num_groups=8) | |||
window_size = int(version[-2:]) | |||
if version[:-2] == 'base': | |||
embed_dim = 128 | |||
depths = [2, 2, 18, 2] | |||
num_heads = [4, 8, 16, 32] | |||
in_channels = [128, 256, 512, 1024] | |||
elif version[:-2] == 'large': | |||
embed_dim = 192 | |||
depths = [2, 2, 18, 2] | |||
num_heads = [6, 12, 24, 48] | |||
in_channels = [192, 384, 768, 1536] | |||
        elif version[:-2] == 'tiny':
            embed_dim = 96
            depths = [2, 2, 6, 2]
            num_heads = [3, 6, 12, 24]
            in_channels = [96, 192, 384, 768]
        else:
            raise ValueError(
                f'Unsupported version: {version}, expected one of '
                "'tiny', 'base' or 'large' plus a two-digit window size")
backbone_cfg = dict( | |||
embed_dim=embed_dim, | |||
depths=depths, | |||
num_heads=num_heads, | |||
window_size=window_size, | |||
ape=False, | |||
drop_path_rate=0.3, | |||
patch_norm=True, | |||
use_checkpoint=False, | |||
frozen_stages=frozen_stages) | |||
embed_dim = 512 | |||
decoder_cfg = dict( | |||
in_channels=in_channels, | |||
in_index=[0, 1, 2, 3], | |||
pool_scales=(1, 2, 3, 6), | |||
channels=embed_dim, | |||
dropout_ratio=0.0, | |||
num_classes=32, | |||
norm_cfg=norm_cfg, | |||
align_corners=False) | |||
self.backbone = SwinTransformer(**backbone_cfg) | |||
# v_dim = decoder_cfg['num_classes'] * 4 | |||
win = 7 | |||
crf_dims = [128, 256, 512, 1024] | |||
v_dims = [64, 128, 256, embed_dim] | |||
self.crf3 = NewCRF( | |||
input_dim=in_channels[3], | |||
embed_dim=crf_dims[3], | |||
window_size=win, | |||
v_dim=v_dims[3], | |||
num_heads=32) | |||
self.crf2 = NewCRF( | |||
input_dim=in_channels[2], | |||
embed_dim=crf_dims[2], | |||
window_size=win, | |||
v_dim=v_dims[2], | |||
num_heads=16) | |||
self.crf1 = NewCRF( | |||
input_dim=in_channels[1], | |||
embed_dim=crf_dims[1], | |||
window_size=win, | |||
v_dim=v_dims[1], | |||
num_heads=8) | |||
self.crf0 = NewCRF( | |||
input_dim=in_channels[0], | |||
embed_dim=crf_dims[0], | |||
window_size=win, | |||
v_dim=v_dims[0], | |||
num_heads=4) | |||
self.decoder = PSP(**decoder_cfg) | |||
self.disp_head1 = DispHead(input_dim=crf_dims[0]) | |||
self.up_mode = 'bilinear' | |||
if self.up_mode == 'mask': | |||
self.mask_head = nn.Sequential( | |||
nn.Conv2d(crf_dims[0], 64, 3, padding=1), | |||
nn.ReLU(inplace=True), nn.Conv2d(64, 16 * 9, 1, padding=0)) | |||
self.min_depth = min_depth | |||
self.max_depth = max_depth | |||
self.init_weights(pretrained=pretrained) | |||
def init_weights(self, pretrained=None): | |||
"""Initialize the weights in backbone and heads. | |||
Args: | |||
pretrained (str, optional): Path to pre-trained weights. | |||
Defaults to None. | |||
""" | |||
# print(f'== Load encoder backbone from: {pretrained}') | |||
self.backbone.init_weights(pretrained=pretrained) | |||
self.decoder.init_weights() | |||
if self.with_auxiliary_head: | |||
if isinstance(self.auxiliary_head, nn.ModuleList): | |||
for aux_head in self.auxiliary_head: | |||
aux_head.init_weights() | |||
else: | |||
self.auxiliary_head.init_weights() | |||
def upsample_mask(self, disp, mask): | |||
""" Upsample disp [H/4, W/4, 1] -> [H, W, 1] using convex combination """ | |||
N, _, H, W = disp.shape | |||
mask = mask.view(N, 1, 9, 4, 4, H, W) | |||
mask = torch.softmax(mask, dim=2) | |||
up_disp = F.unfold(disp, kernel_size=3, padding=1) | |||
up_disp = up_disp.view(N, 1, 9, 1, 1, H, W) | |||
up_disp = torch.sum(mask * up_disp, dim=2) | |||
up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) | |||
return up_disp.reshape(N, 1, 4 * H, 4 * W) | |||
def forward(self, imgs): | |||
feats = self.backbone(imgs) | |||
if self.with_neck: | |||
feats = self.neck(feats) | |||
ppm_out = self.decoder(feats) | |||
e3 = self.crf3(feats[3], ppm_out) | |||
e3 = nn.PixelShuffle(2)(e3) | |||
e2 = self.crf2(feats[2], e3) | |||
e2 = nn.PixelShuffle(2)(e2) | |||
e1 = self.crf1(feats[1], e2) | |||
e1 = nn.PixelShuffle(2)(e1) | |||
e0 = self.crf0(feats[0], e1) | |||
if self.up_mode == 'mask': | |||
mask = self.mask_head(e0) | |||
d1 = self.disp_head1(e0, 1) | |||
d1 = self.upsample_mask(d1, mask) | |||
else: | |||
d1 = self.disp_head1(e0, 4) | |||
depth = d1 * self.max_depth | |||
return depth | |||
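    # decoder resolution trace for an H x W input: backbone features arrive at
    # [H/4, H/8, H/16, H/32]; PSP summarizes the H/32 level; each NewCRF stage
    # refines one level and PixelShuffle(2) lifts it to the next, ending at
    # H/4; DispHead then predicts a sigmoid disparity and upsamples x4 (or
    # uses convex 'mask' upsampling), scaled by max_depth to metric depth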
class DispHead(nn.Module): | |||
def __init__(self, input_dim=100): | |||
super(DispHead, self).__init__() | |||
# self.norm1 = nn.BatchNorm2d(input_dim) | |||
self.conv1 = nn.Conv2d(input_dim, 1, 3, padding=1) | |||
# self.relu = nn.ReLU(inplace=True) | |||
self.sigmoid = nn.Sigmoid() | |||
def forward(self, x, scale): | |||
# x = self.relu(self.norm1(x)) | |||
x = self.sigmoid(self.conv1(x)) | |||
if scale > 1: | |||
x = upsample(x, scale_factor=scale) | |||
return x | |||
class DispUnpack(nn.Module): | |||
def __init__(self, input_dim=100, hidden_dim=128): | |||
super(DispUnpack, self).__init__() | |||
self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) | |||
self.conv2 = nn.Conv2d(hidden_dim, 16, 3, padding=1) | |||
self.relu = nn.ReLU(inplace=True) | |||
self.sigmoid = nn.Sigmoid() | |||
self.pixel_shuffle = nn.PixelShuffle(4) | |||
def forward(self, x, output_size): | |||
x = self.relu(self.conv1(x)) | |||
x = self.sigmoid(self.conv2(x)) # [b, 16, h/4, w/4] | |||
# x = torch.reshape(x, [x.shape[0], 1, x.shape[2]*4, x.shape[3]*4]) | |||
x = self.pixel_shuffle(x) | |||
return x | |||
def upsample(x, scale_factor=2, mode='bilinear', align_corners=False):
    """Upsample input tensor by the given scale factor (default: 2).
    """
return F.interpolate( | |||
x, scale_factor=scale_factor, mode=mode, align_corners=align_corners) |
@@ -0,0 +1,504 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import torch.utils.checkpoint as checkpoint | |||
from timm.models.layers import DropPath, to_2tuple, trunc_normal_ | |||
class Mlp(nn.Module): | |||
""" Multilayer perceptron.""" | |||
def __init__(self, | |||
in_features, | |||
hidden_features=None, | |||
out_features=None, | |||
act_layer=nn.GELU, | |||
drop=0.): | |||
super().__init__() | |||
out_features = out_features or in_features | |||
hidden_features = hidden_features or in_features | |||
self.fc1 = nn.Linear(in_features, hidden_features) | |||
self.act = act_layer() | |||
self.fc2 = nn.Linear(hidden_features, out_features) | |||
self.drop = nn.Dropout(drop) | |||
def forward(self, x): | |||
x = self.fc1(x) | |||
x = self.act(x) | |||
x = self.drop(x) | |||
x = self.fc2(x) | |||
x = self.drop(x) | |||
return x | |||
def window_partition(x, window_size): | |||
""" | |||
Args: | |||
x: (B, H, W, C) | |||
window_size (int): window size | |||
Returns: | |||
windows: (num_windows*B, window_size, window_size, C) | |||
""" | |||
B, H, W, C = x.shape | |||
x = x.view(B, H // window_size, window_size, W // window_size, window_size, | |||
C) | |||
windows = x.permute(0, 1, 3, 2, 4, | |||
5).contiguous().view(-1, window_size, window_size, C) | |||
return windows | |||
def window_reverse(windows, window_size, H, W): | |||
""" | |||
Args: | |||
windows: (num_windows*B, window_size, window_size, C) | |||
window_size (int): Window size | |||
H (int): Height of image | |||
W (int): Width of image | |||
Returns: | |||
x: (B, H, W, C) | |||
""" | |||
B = int(windows.shape[0] / (H * W / window_size / window_size)) | |||
x = windows.view(B, H // window_size, W // window_size, window_size, | |||
window_size, -1) | |||
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) | |||
return x | |||
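if __name__ == '__main__':
    # sketch: window_partition and window_reverse are exact inverses when H
    # and W are multiples of the window size
    x = torch.randn(2, 14, 14, 32)   # (B, H, W, C)
    w = window_partition(x, 7)       # -> (8, 7, 7, 32): four windows per image
    assert torch.equal(window_reverse(w, 7, 14, 14), x)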
class WindowAttention(nn.Module): | |||
""" Window based multi-head self attention (W-MSA) module with relative position bias. | |||
It supports both of shifted and non-shifted window. | |||
Args: | |||
dim (int): Number of input channels. | |||
window_size (tuple[int]): The height and width of the window. | |||
num_heads (int): Number of attention heads. | |||
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set | |||
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 | |||
proj_drop (float, optional): Dropout ratio of output. Default: 0.0 | |||
""" | |||
def __init__(self, | |||
dim, | |||
window_size, | |||
num_heads, | |||
v_dim, | |||
qkv_bias=True, | |||
qk_scale=None, | |||
attn_drop=0., | |||
proj_drop=0.): | |||
super().__init__() | |||
self.dim = dim | |||
self.window_size = window_size # Wh, Ww | |||
self.num_heads = num_heads | |||
head_dim = dim // num_heads | |||
self.scale = qk_scale or head_dim**-0.5 | |||
# define a parameter table of relative position bias | |||
self.relative_position_bias_table = nn.Parameter( | |||
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), | |||
num_heads)) # 2*Wh-1 * 2*Ww-1, nH | |||
# get pair-wise relative position index for each token inside the window | |||
coords_h = torch.arange(self.window_size[0]) | |||
coords_w = torch.arange(self.window_size[1]) | |||
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww | |||
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww | |||
relative_coords = coords_flatten[:, :, | |||
None] - coords_flatten[:, | |||
None, :] # 2, Wh*Ww, Wh*Ww | |||
relative_coords = relative_coords.permute( | |||
1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 | |||
relative_coords[:, :, | |||
0] += self.window_size[0] - 1 # shift to start from 0 | |||
relative_coords[:, :, 1] += self.window_size[1] - 1 | |||
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 | |||
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww | |||
self.register_buffer('relative_position_index', | |||
relative_position_index) | |||
self.qk = nn.Linear(dim, dim * 2, bias=qkv_bias) | |||
self.attn_drop = nn.Dropout(attn_drop) | |||
self.proj = nn.Linear(v_dim, v_dim) | |||
self.proj_drop = nn.Dropout(proj_drop) | |||
trunc_normal_(self.relative_position_bias_table, std=.02) | |||
self.softmax = nn.Softmax(dim=-1) | |||
def forward(self, x, v, mask=None): | |||
""" Forward function. | |||
Args: | |||
x: input features with shape of (num_windows*B, N, C) | |||
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None | |||
""" | |||
B_, N, C = x.shape | |||
qk = self.qk(x).reshape(B_, N, 2, self.num_heads, | |||
C // self.num_heads).permute(2, 0, 3, 1, 4) | |||
q, k = qk[0], qk[ | |||
1] # make torchscript happy (cannot use tensor as tuple) | |||
q = q * self.scale | |||
attn = (q @ k.transpose(-2, -1)) | |||
relative_position_bias = self.relative_position_bias_table[ | |||
self.relative_position_index.view(-1)].view( | |||
self.window_size[0] * self.window_size[1], | |||
self.window_size[0] * self.window_size[1], | |||
-1) # Wh*Ww,Wh*Ww,nH | |||
relative_position_bias = relative_position_bias.permute( | |||
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww | |||
attn = attn + relative_position_bias.unsqueeze(0) | |||
if mask is not None: | |||
nW = mask.shape[0] | |||
attn = attn.view(B_ // nW, nW, self.num_heads, N, | |||
N) + mask.unsqueeze(1).unsqueeze(0) | |||
attn = attn.view(-1, self.num_heads, N, N) | |||
attn = self.softmax(attn) | |||
else: | |||
attn = self.softmax(attn) | |||
attn = self.attn_drop(attn) | |||
# assert self.dim % v.shape[-1] == 0, "self.dim % v.shape[-1] != 0" | |||
# repeat_num = self.dim // v.shape[-1] | |||
# v = v.view(B_, N, self.num_heads // repeat_num, -1).transpose(1, 2).repeat(1, repeat_num, 1, 1) | |||
assert self.dim == v.shape[-1], 'self.dim != v.shape[-1]' | |||
v = v.view(B_, N, self.num_heads, -1).transpose(1, 2) | |||
x = (attn @ v).transpose(1, 2).reshape(B_, N, C) | |||
x = self.proj(x) | |||
x = self.proj_drop(x) | |||
return x | |||
class CRFBlock(nn.Module): | |||
""" CRF Block. | |||
Args: | |||
dim (int): Number of input channels. | |||
num_heads (int): Number of attention heads. | |||
window_size (int): Window size. | |||
shift_size (int): Shift size for SW-MSA. | |||
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. | |||
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. | |||
drop (float, optional): Dropout rate. Default: 0.0 | |||
attn_drop (float, optional): Attention dropout rate. Default: 0.0 | |||
drop_path (float, optional): Stochastic depth rate. Default: 0.0 | |||
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU | |||
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
""" | |||
def __init__(self, | |||
dim, | |||
num_heads, | |||
v_dim, | |||
window_size=7, | |||
shift_size=0, | |||
mlp_ratio=4., | |||
qkv_bias=True, | |||
qk_scale=None, | |||
drop=0., | |||
attn_drop=0., | |||
drop_path=0., | |||
act_layer=nn.GELU, | |||
norm_layer=nn.LayerNorm): | |||
super().__init__() | |||
self.dim = dim | |||
self.num_heads = num_heads | |||
self.v_dim = v_dim | |||
self.window_size = window_size | |||
self.shift_size = shift_size | |||
self.mlp_ratio = mlp_ratio | |||
        assert 0 <= self.shift_size < self.window_size, 'shift_size must be in [0, window_size)'
self.norm1 = norm_layer(dim) | |||
self.attn = WindowAttention( | |||
dim, | |||
window_size=to_2tuple(self.window_size), | |||
num_heads=num_heads, | |||
v_dim=v_dim, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
attn_drop=attn_drop, | |||
proj_drop=drop) | |||
self.drop_path = DropPath( | |||
drop_path) if drop_path > 0. else nn.Identity() | |||
self.norm2 = norm_layer(v_dim) | |||
mlp_hidden_dim = int(v_dim * mlp_ratio) | |||
self.mlp = Mlp( | |||
in_features=v_dim, | |||
hidden_features=mlp_hidden_dim, | |||
act_layer=act_layer, | |||
drop=drop) | |||
self.H = None | |||
self.W = None | |||
def forward(self, x, v, mask_matrix): | |||
""" Forward function. | |||
        Args:
            x: Input feature, tensor size (B, H*W, C).
            v: Value feature, tensor size (B, H, W, C).
            H, W: Spatial resolution of the input feature.
            mask_matrix: Attention mask for cyclic shift.
""" | |||
B, L, C = x.shape | |||
H, W = self.H, self.W | |||
assert L == H * W, 'input feature has wrong size' | |||
shortcut = x | |||
x = self.norm1(x) | |||
x = x.view(B, H, W, C) | |||
# pad feature maps to multiples of window size | |||
pad_l = pad_t = 0 | |||
pad_r = (self.window_size - W % self.window_size) % self.window_size | |||
pad_b = (self.window_size - H % self.window_size) % self.window_size | |||
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) | |||
v = F.pad(v, (0, 0, pad_l, pad_r, pad_t, pad_b)) | |||
_, Hp, Wp, _ = x.shape | |||
# cyclic shift | |||
if self.shift_size > 0: | |||
shifted_x = torch.roll( | |||
x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) | |||
shifted_v = torch.roll( | |||
v, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) | |||
attn_mask = mask_matrix | |||
else: | |||
shifted_x = x | |||
shifted_v = v | |||
attn_mask = None | |||
# partition windows | |||
x_windows = window_partition( | |||
shifted_x, self.window_size) # nW*B, window_size, window_size, C | |||
x_windows = x_windows.view(-1, self.window_size * self.window_size, | |||
C) # nW*B, window_size*window_size, C | |||
v_windows = window_partition( | |||
shifted_v, self.window_size) # nW*B, window_size, window_size, C | |||
v_windows = v_windows.view( | |||
-1, self.window_size * self.window_size, | |||
v_windows.shape[-1]) # nW*B, window_size*window_size, C | |||
# W-MSA/SW-MSA | |||
attn_windows = self.attn( | |||
x_windows, v_windows, | |||
mask=attn_mask) # nW*B, window_size*window_size, C | |||
# merge windows | |||
attn_windows = attn_windows.view(-1, self.window_size, | |||
self.window_size, self.v_dim) | |||
shifted_x = window_reverse(attn_windows, self.window_size, Hp, | |||
Wp) # B H' W' C | |||
# reverse cyclic shift | |||
if self.shift_size > 0: | |||
x = torch.roll( | |||
shifted_x, | |||
shifts=(self.shift_size, self.shift_size), | |||
dims=(1, 2)) | |||
else: | |||
x = shifted_x | |||
if pad_r > 0 or pad_b > 0: | |||
x = x[:, :H, :W, :].contiguous() | |||
x = x.view(B, H * W, self.v_dim) | |||
# FFN | |||
x = shortcut + self.drop_path(x) | |||
x = x + self.drop_path(self.mlp(self.norm2(x))) | |||
return x | |||
class BasicCRFLayer(nn.Module): | |||
""" A basic NeWCRFs layer for one stage. | |||
Args: | |||
dim (int): Number of feature channels | |||
depth (int): Depths of this stage. | |||
        num_heads (int): Number of attention heads.
        v_dim (int): Number of channels of the value (v) branch.
        window_size (int): Local window size. Default: 7.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. | |||
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. | |||
drop (float, optional): Dropout rate. Default: 0.0 | |||
attn_drop (float, optional): Attention dropout rate. Default: 0.0 | |||
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 | |||
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None | |||
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. | |||
""" | |||
def __init__(self, | |||
dim, | |||
depth, | |||
num_heads, | |||
v_dim, | |||
window_size=7, | |||
mlp_ratio=4., | |||
qkv_bias=True, | |||
qk_scale=None, | |||
drop=0., | |||
attn_drop=0., | |||
drop_path=0., | |||
norm_layer=nn.LayerNorm, | |||
downsample=None, | |||
use_checkpoint=False): | |||
super().__init__() | |||
self.window_size = window_size | |||
self.shift_size = window_size // 2 | |||
self.depth = depth | |||
self.use_checkpoint = use_checkpoint | |||
# build blocks | |||
self.blocks = nn.ModuleList([ | |||
CRFBlock( | |||
dim=dim, | |||
num_heads=num_heads, | |||
v_dim=v_dim, | |||
window_size=window_size, | |||
shift_size=0 if (i % 2 == 0) else window_size // 2, | |||
mlp_ratio=mlp_ratio, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
drop=drop, | |||
attn_drop=attn_drop, | |||
drop_path=drop_path[i] | |||
if isinstance(drop_path, list) else drop_path, | |||
norm_layer=norm_layer) for i in range(depth) | |||
]) | |||
# patch merging layer | |||
if downsample is not None: | |||
self.downsample = downsample(dim=dim, norm_layer=norm_layer) | |||
else: | |||
self.downsample = None | |||
def forward(self, x, v, H, W): | |||
""" Forward function. | |||
Args: | |||
            x: Input feature, tensor size (B, H*W, C).
            v: Value feature, tensor size (B, H, W, C).
            H, W: Spatial resolution of the input feature.
""" | |||
# calculate attention mask for SW-MSA | |||
Hp = int(np.ceil(H / self.window_size)) * self.window_size | |||
Wp = int(np.ceil(W / self.window_size)) * self.window_size | |||
img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 | |||
h_slices = (slice(0, -self.window_size), | |||
slice(-self.window_size, | |||
-self.shift_size), slice(-self.shift_size, None)) | |||
w_slices = (slice(0, -self.window_size), | |||
slice(-self.window_size, | |||
-self.shift_size), slice(-self.shift_size, None)) | |||
cnt = 0 | |||
for h in h_slices: | |||
for w in w_slices: | |||
img_mask[:, h, w, :] = cnt | |||
cnt += 1 | |||
mask_windows = window_partition( | |||
img_mask, self.window_size) # nW, window_size, window_size, 1 | |||
mask_windows = mask_windows.view(-1, | |||
self.window_size * self.window_size) | |||
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) | |||
attn_mask = attn_mask.masked_fill(attn_mask != 0, | |||
float(-100.0)).masked_fill( | |||
attn_mask == 0, float(0.0)) | |||
for blk in self.blocks: | |||
blk.H, blk.W = H, W | |||
if self.use_checkpoint: | |||
                # the block also needs the value feature v when checkpointing
                x = checkpoint.checkpoint(blk, x, v, attn_mask)
else: | |||
x = blk(x, v, attn_mask) | |||
if self.downsample is not None: | |||
x_down = self.downsample(x, H, W) | |||
Wh, Ww = (H + 1) // 2, (W + 1) // 2 | |||
return x, H, W, x_down, Wh, Ww | |||
else: | |||
return x, H, W, x, H, W | |||
class NewCRF(nn.Module): | |||
def __init__(self, | |||
input_dim=96, | |||
embed_dim=96, | |||
v_dim=64, | |||
window_size=7, | |||
num_heads=4, | |||
depth=2, | |||
patch_size=4, | |||
in_chans=3, | |||
norm_layer=nn.LayerNorm, | |||
patch_norm=True): | |||
super().__init__() | |||
self.embed_dim = embed_dim | |||
self.patch_norm = patch_norm | |||
if input_dim != embed_dim: | |||
self.proj_x = nn.Conv2d(input_dim, embed_dim, 3, padding=1) | |||
else: | |||
self.proj_x = None | |||
        if v_dim != embed_dim:
            # project the value branch to the embedding dimension as well,
            # so the assert below holds for any v_dim
            self.proj_v = nn.Conv2d(v_dim, embed_dim, 3, padding=1)
            v_dim = embed_dim
        else:
            self.proj_v = None
        assert v_dim == embed_dim
self.crf_layer = BasicCRFLayer( | |||
dim=embed_dim, | |||
depth=depth, | |||
num_heads=num_heads, | |||
v_dim=v_dim, | |||
window_size=window_size, | |||
mlp_ratio=4., | |||
qkv_bias=True, | |||
qk_scale=None, | |||
drop=0., | |||
attn_drop=0., | |||
drop_path=0., | |||
norm_layer=norm_layer, | |||
downsample=None, | |||
use_checkpoint=False) | |||
layer = norm_layer(embed_dim) | |||
layer_name = 'norm_crf' | |||
self.add_module(layer_name, layer) | |||
def forward(self, x, v): | |||
if self.proj_x is not None: | |||
x = self.proj_x(x) | |||
if self.proj_v is not None: | |||
v = self.proj_v(v) | |||
Wh, Ww = x.size(2), x.size(3) | |||
x = x.flatten(2).transpose(1, 2) | |||
v = v.transpose(1, 2).transpose(2, 3) | |||
x_out, H, W, x, Wh, Ww = self.crf_layer(x, v, Wh, Ww) | |||
norm_layer = getattr(self, 'norm_crf') | |||
x_out = norm_layer(x_out) | |||
out = x_out.view(-1, H, W, self.embed_dim).permute(0, 3, 1, | |||
2).contiguous() | |||
return out |
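# A minimal shape-check sketch (added for illustration, not part of the
# original file): NewCRF fuses an input feature map x with a value feature
# map v and returns a map with `embed_dim` channels at the input resolution.
# The dimensions below are arbitrary; proj_x/proj_v handle the channel
# mismatches.
def _demo_newcrf_shapes():
    crf = NewCRF(input_dim=192, embed_dim=96, v_dim=64, window_size=7,
                 num_heads=4, depth=2)
    x = torch.randn(2, 192, 28, 28)  # backbone feature, (B, input_dim, H, W)
    v = torch.randn(2, 64, 28, 28)   # value feature, (B, v_dim, H, W)
    out = crf(x, v)
    assert out.shape == (2, 96, 28, 28)  # (B, embed_dim, H, W)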
@@ -0,0 +1,272 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import os.path as osp | |||
import pkgutil | |||
import warnings | |||
from collections import OrderedDict | |||
from importlib import import_module | |||
import torch | |||
import torch.nn as nn | |||
import torchvision | |||
from torch import distributed as dist | |||
from torch.nn import functional as F | |||
from torch.nn.parallel import DataParallel, DistributedDataParallel | |||
from torch.utils import model_zoo | |||
TORCH_VERSION = torch.__version__ | |||
def resize(input, | |||
size=None, | |||
scale_factor=None, | |||
mode='nearest', | |||
align_corners=None, | |||
warning=True): | |||
if warning: | |||
if size is not None and align_corners: | |||
input_h, input_w = tuple(int(x) for x in input.shape[2:]) | |||
output_h, output_w = tuple(int(x) for x in size) | |||
            if output_h > input_h or output_w > input_w:
if ((output_h > 1 and output_w > 1 and input_h > 1 | |||
and input_w > 1) and (output_h - 1) % (input_h - 1) | |||
and (output_w - 1) % (input_w - 1)): | |||
warnings.warn( | |||
f'When align_corners={align_corners}, ' | |||
                        'the output would be more aligned if '
f'input size {(input_h, input_w)} is `x+1` and ' | |||
f'out size {(output_h, output_w)} is `nx+1`') | |||
if isinstance(size, torch.Size): | |||
size = tuple(int(x) for x in size) | |||
return F.interpolate(input, size, scale_factor, mode, align_corners) | |||
def normal_init(module, mean=0, std=1, bias=0): | |||
if hasattr(module, 'weight') and module.weight is not None: | |||
nn.init.normal_(module.weight, mean, std) | |||
if hasattr(module, 'bias') and module.bias is not None: | |||
nn.init.constant_(module.bias, bias) | |||
def is_module_wrapper(module): | |||
module_wrappers = (DataParallel, DistributedDataParallel) | |||
return isinstance(module, module_wrappers) | |||
def get_dist_info(): | |||
if TORCH_VERSION < '1.0': | |||
initialized = dist._initialized | |||
else: | |||
if dist.is_available(): | |||
initialized = dist.is_initialized() | |||
else: | |||
initialized = False | |||
if initialized: | |||
rank = dist.get_rank() | |||
world_size = dist.get_world_size() | |||
else: | |||
rank = 0 | |||
world_size = 1 | |||
return rank, world_size | |||
def load_state_dict(module, state_dict, strict=False, logger=None): | |||
"""Load state_dict to a module. | |||
This method is modified from :meth:`torch.nn.Module.load_state_dict`. | |||
Default value for ``strict`` is set to ``False`` and the message for | |||
param mismatch will be shown even if strict is False. | |||
Args: | |||
module (Module): Module that receives the state_dict. | |||
state_dict (OrderedDict): Weights. | |||
strict (bool): whether to strictly enforce that the keys | |||
in :attr:`state_dict` match the keys returned by this module's | |||
:meth:`~torch.nn.Module.state_dict` function. Default: ``False``. | |||
logger (:obj:`logging.Logger`, optional): Logger to log the error | |||
message. If not specified, print function will be used. | |||
""" | |||
unexpected_keys = [] | |||
all_missing_keys = [] | |||
err_msg = [] | |||
metadata = getattr(state_dict, '_metadata', None) | |||
state_dict = state_dict.copy() | |||
if metadata is not None: | |||
state_dict._metadata = metadata | |||
# use _load_from_state_dict to enable checkpoint version control | |||
def load(module, prefix=''): | |||
# recursively check parallel module in case that the model has a | |||
# complicated structure, e.g., nn.Module(nn.Module(DDP)) | |||
if is_module_wrapper(module): | |||
module = module.module | |||
local_metadata = {} if metadata is None else metadata.get( | |||
prefix[:-1], {}) | |||
module._load_from_state_dict(state_dict, prefix, local_metadata, True, | |||
all_missing_keys, unexpected_keys, | |||
err_msg) | |||
for name, child in module._modules.items(): | |||
if child is not None: | |||
load(child, prefix + name + '.') | |||
load(module) | |||
load = None # break load->load reference cycle | |||
# ignore "num_batches_tracked" of BN layers | |||
missing_keys = [ | |||
key for key in all_missing_keys if 'num_batches_tracked' not in key | |||
] | |||
if unexpected_keys: | |||
err_msg.append('unexpected key in source ' | |||
f'state_dict: {", ".join(unexpected_keys)}\n') | |||
if missing_keys: | |||
err_msg.append( | |||
f'missing keys in source state_dict: {", ".join(missing_keys)}\n') | |||
rank, _ = get_dist_info() | |||
if len(err_msg) > 0 and rank == 0: | |||
err_msg.insert( | |||
0, 'The model and loaded state dict do not match exactly\n') | |||
err_msg = '\n'.join(err_msg) | |||
if strict: | |||
raise RuntimeError(err_msg) | |||
elif logger is not None: | |||
logger.warning(err_msg) | |||
else: | |||
print(err_msg) | |||
def load_url_dist(url, model_dir=None): | |||
"""In distributed setting, this function only download checkpoint at local | |||
rank 0.""" | |||
rank, world_size = get_dist_info() | |||
rank = int(os.environ.get('LOCAL_RANK', rank)) | |||
if rank == 0: | |||
checkpoint = model_zoo.load_url(url, model_dir=model_dir) | |||
if world_size > 1: | |||
torch.distributed.barrier() | |||
if rank > 0: | |||
checkpoint = model_zoo.load_url(url, model_dir=model_dir) | |||
return checkpoint | |||
def get_torchvision_models(): | |||
model_urls = dict() | |||
for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): | |||
if ispkg: | |||
continue | |||
_zoo = import_module(f'torchvision.models.{name}') | |||
if hasattr(_zoo, 'model_urls'): | |||
_urls = getattr(_zoo, 'model_urls') | |||
model_urls.update(_urls) | |||
return model_urls | |||
def _load_checkpoint(filename, map_location=None): | |||
"""Load checkpoint from somewhere (modelzoo, file, url). | |||
Args: | |||
filename (str): Accept local filepath, URL, ``torchvision://xxx``, | |||
``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for | |||
details. | |||
map_location (str | None): Same as :func:`torch.load`. Default: None. | |||
Returns: | |||
dict | OrderedDict: The loaded checkpoint. It can be either an | |||
OrderedDict storing model weights or a dict containing other | |||
information, which depends on the checkpoint. | |||
""" | |||
if filename.startswith('modelzoo://'): | |||
warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' | |||
'use "torchvision://" instead') | |||
model_urls = get_torchvision_models() | |||
model_name = filename[11:] | |||
checkpoint = load_url_dist(model_urls[model_name]) | |||
else: | |||
if not osp.isfile(filename): | |||
raise IOError(f'{filename} is not a checkpoint file') | |||
checkpoint = torch.load(filename, map_location=map_location) | |||
return checkpoint | |||
def load_checkpoint(model, | |||
filename, | |||
map_location='cpu', | |||
strict=False, | |||
logger=None): | |||
"""Load checkpoint from a file or URI. | |||
Args: | |||
model (Module): Module to load checkpoint. | |||
filename (str): Accept local filepath, URL, ``torchvision://xxx``, | |||
``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for | |||
details. | |||
map_location (str): Same as :func:`torch.load`. | |||
strict (bool): Whether to allow different params for the model and | |||
checkpoint. | |||
logger (:mod:`logging.Logger` or None): The logger for error message. | |||
Returns: | |||
dict or OrderedDict: The loaded checkpoint. | |||
""" | |||
checkpoint = _load_checkpoint(filename, map_location) | |||
# OrderedDict is a subclass of dict | |||
if not isinstance(checkpoint, dict): | |||
raise RuntimeError( | |||
f'No state_dict found in checkpoint file {filename}') | |||
# get state_dict from checkpoint | |||
if 'state_dict' in checkpoint: | |||
state_dict = checkpoint['state_dict'] | |||
elif 'model' in checkpoint: | |||
state_dict = checkpoint['model'] | |||
else: | |||
state_dict = checkpoint | |||
# strip prefix of state_dict | |||
if list(state_dict.keys())[0].startswith('module.'): | |||
state_dict = {k[7:]: v for k, v in state_dict.items()} | |||
# for MoBY, load model of online branch | |||
if sorted(list(state_dict.keys()))[0].startswith('encoder'): | |||
state_dict = { | |||
k.replace('encoder.', ''): v | |||
for k, v in state_dict.items() if k.startswith('encoder.') | |||
} | |||
# reshape absolute position embedding | |||
if state_dict.get('absolute_pos_embed') is not None: | |||
absolute_pos_embed = state_dict['absolute_pos_embed'] | |||
N1, L, C1 = absolute_pos_embed.size() | |||
N2, C2, H, W = model.absolute_pos_embed.size() | |||
        if N1 != N2 or C1 != C2 or L != H * W:
            # logger may be None here, so fall back to a plain warning
            if logger is not None:
                logger.warning('Error in loading absolute_pos_embed, pass')
            else:
                warnings.warn('Error in loading absolute_pos_embed, pass')
else: | |||
state_dict['absolute_pos_embed'] = absolute_pos_embed.view( | |||
N2, H, W, C2).permute(0, 3, 1, 2) | |||
# interpolate position bias table if needed | |||
relative_position_bias_table_keys = [ | |||
k for k in state_dict.keys() if 'relative_position_bias_table' in k | |||
] | |||
for table_key in relative_position_bias_table_keys: | |||
table_pretrained = state_dict[table_key] | |||
table_current = model.state_dict()[table_key] | |||
L1, nH1 = table_pretrained.size() | |||
L2, nH2 = table_current.size() | |||
        if nH1 != nH2:
            # logger may be None here, so fall back to a plain warning
            if logger is not None:
                logger.warning(f'Error in loading {table_key}, pass')
            else:
                warnings.warn(f'Error in loading {table_key}, pass')
else: | |||
if L1 != L2: | |||
S1 = int(L1**0.5) | |||
S2 = int(L2**0.5) | |||
table_pretrained_resized = F.interpolate( | |||
table_pretrained.permute(1, 0).view(1, nH1, S1, S1), | |||
size=(S2, S2), | |||
mode='bicubic') | |||
state_dict[table_key] = table_pretrained_resized.view( | |||
nH2, L2).permute(1, 0) | |||
# load state_dict | |||
load_state_dict(model, state_dict, strict, logger) | |||
return checkpoint |
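# A runnable self-check sketch (added for illustration, not in the original
# file): round-trip a state_dict through a temporary file and load it back
# with the tolerant loader above. Assumes a POSIX tempfile that can be
# reopened while still open.
def _demo_load_checkpoint():
    import tempfile
    model = nn.Linear(4, 2)
    with tempfile.NamedTemporaryFile(suffix='.pth') as f:
        torch.save({'state_dict': model.state_dict()}, f.name)
        loaded = load_checkpoint(model, f.name, map_location='cpu')
    assert 'state_dict' in loaded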
@@ -0,0 +1,706 @@ | |||
# The implementation is adopted from Swin Transformer | |||
# made publicly available under the MIT License at https://github.com/microsoft/Swin-Transformer | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import torch.utils.checkpoint as checkpoint | |||
from timm.models.layers import DropPath, to_2tuple, trunc_normal_ | |||
from .newcrf_utils import load_checkpoint | |||
class Mlp(nn.Module): | |||
""" Multilayer perceptron.""" | |||
def __init__(self, | |||
in_features, | |||
hidden_features=None, | |||
out_features=None, | |||
act_layer=nn.GELU, | |||
drop=0.): | |||
super().__init__() | |||
out_features = out_features or in_features | |||
hidden_features = hidden_features or in_features | |||
self.fc1 = nn.Linear(in_features, hidden_features) | |||
self.act = act_layer() | |||
self.fc2 = nn.Linear(hidden_features, out_features) | |||
self.drop = nn.Dropout(drop) | |||
def forward(self, x): | |||
x = self.fc1(x) | |||
x = self.act(x) | |||
x = self.drop(x) | |||
x = self.fc2(x) | |||
x = self.drop(x) | |||
return x | |||
def window_partition(x, window_size): | |||
""" | |||
Args: | |||
x: (B, H, W, C) | |||
window_size (int): window size | |||
Returns: | |||
windows: (num_windows*B, window_size, window_size, C) | |||
""" | |||
B, H, W, C = x.shape | |||
x = x.view(B, H // window_size, window_size, W // window_size, window_size, | |||
C) | |||
windows = x.permute(0, 1, 3, 2, 4, | |||
5).contiguous().view(-1, window_size, window_size, C) | |||
return windows | |||
def window_reverse(windows, window_size, H, W): | |||
""" | |||
Args: | |||
windows: (num_windows*B, window_size, window_size, C) | |||
window_size (int): Window size | |||
H (int): Height of image | |||
W (int): Width of image | |||
Returns: | |||
x: (B, H, W, C) | |||
""" | |||
B = int(windows.shape[0] / (H * W / window_size / window_size)) | |||
x = windows.view(B, H // window_size, W // window_size, window_size, | |||
window_size, -1) | |||
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) | |||
return x | |||
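# A tiny self-check sketch (added for illustration, not in the original
# file): window_partition and window_reverse are exact inverses once H and W
# are multiples of window_size, which is what the padding in the blocks
# guarantees.
def _demo_window_roundtrip():
    x = torch.randn(2, 14, 14, 96)      # (B, H, W, C)
    windows = window_partition(x, 7)    # (2 * 2 * 2, 7, 7, 96)
    assert windows.shape == (8, 7, 7, 96)
    assert torch.equal(window_reverse(windows, 7, 14, 14), x)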
class WindowAttention(nn.Module): | |||
""" Window based multi-head self attention (W-MSA) module with relative position bias. | |||
It supports both of shifted and non-shifted window. | |||
Args: | |||
dim (int): Number of input channels. | |||
window_size (tuple[int]): The height and width of the window. | |||
num_heads (int): Number of attention heads. | |||
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set | |||
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 | |||
proj_drop (float, optional): Dropout ratio of output. Default: 0.0 | |||
""" | |||
def __init__(self, | |||
dim, | |||
window_size, | |||
num_heads, | |||
qkv_bias=True, | |||
qk_scale=None, | |||
attn_drop=0., | |||
proj_drop=0.): | |||
super().__init__() | |||
self.dim = dim | |||
self.window_size = window_size # Wh, Ww | |||
self.num_heads = num_heads | |||
head_dim = dim // num_heads | |||
self.scale = qk_scale or head_dim**-0.5 | |||
# define a parameter table of relative position bias | |||
self.relative_position_bias_table = nn.Parameter( | |||
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), | |||
num_heads)) # 2*Wh-1 * 2*Ww-1, nH | |||
# get pair-wise relative position index for each token inside the window | |||
coords_h = torch.arange(self.window_size[0]) | |||
coords_w = torch.arange(self.window_size[1]) | |||
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww | |||
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww | |||
relative_coords = coords_flatten[:, :, | |||
None] - coords_flatten[:, | |||
None, :] # 2, Wh*Ww, Wh*Ww | |||
relative_coords = relative_coords.permute( | |||
1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 | |||
relative_coords[:, :, | |||
0] += self.window_size[0] - 1 # shift to start from 0 | |||
relative_coords[:, :, 1] += self.window_size[1] - 1 | |||
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 | |||
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww | |||
self.register_buffer('relative_position_index', | |||
relative_position_index) | |||
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) | |||
self.attn_drop = nn.Dropout(attn_drop) | |||
self.proj = nn.Linear(dim, dim) | |||
self.proj_drop = nn.Dropout(proj_drop) | |||
trunc_normal_(self.relative_position_bias_table, std=.02) | |||
self.softmax = nn.Softmax(dim=-1) | |||
def forward(self, x, mask=None): | |||
""" Forward function. | |||
Args: | |||
x: input features with shape of (num_windows*B, N, C) | |||
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None | |||
""" | |||
B_, N, C = x.shape | |||
qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, | |||
C // self.num_heads).permute(2, 0, 3, 1, 4) | |||
q, k, v = qkv[0], qkv[1], qkv[ | |||
2] # make torchscript happy (cannot use tensor as tuple) | |||
q = q * self.scale | |||
attn = (q @ k.transpose(-2, -1)) | |||
relative_position_bias = self.relative_position_bias_table[ | |||
self.relative_position_index.view(-1)].view( | |||
self.window_size[0] * self.window_size[1], | |||
self.window_size[0] * self.window_size[1], | |||
-1) # Wh*Ww,Wh*Ww,nH | |||
relative_position_bias = relative_position_bias.permute( | |||
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww | |||
attn = attn + relative_position_bias.unsqueeze(0) | |||
if mask is not None: | |||
nW = mask.shape[0] | |||
attn = attn.view(B_ // nW, nW, self.num_heads, N, | |||
N) + mask.unsqueeze(1).unsqueeze(0) | |||
attn = attn.view(-1, self.num_heads, N, N) | |||
attn = self.softmax(attn) | |||
else: | |||
attn = self.softmax(attn) | |||
attn = self.attn_drop(attn) | |||
x = (attn @ v).transpose(1, 2).reshape(B_, N, C) | |||
x = self.proj(x) | |||
x = self.proj_drop(x) | |||
return x | |||
class SwinTransformerBlock(nn.Module): | |||
""" Swin Transformer Block. | |||
Args: | |||
dim (int): Number of input channels. | |||
num_heads (int): Number of attention heads. | |||
window_size (int): Window size. | |||
shift_size (int): Shift size for SW-MSA. | |||
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. | |||
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. | |||
drop (float, optional): Dropout rate. Default: 0.0 | |||
attn_drop (float, optional): Attention dropout rate. Default: 0.0 | |||
drop_path (float, optional): Stochastic depth rate. Default: 0.0 | |||
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU | |||
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
""" | |||
def __init__(self, | |||
dim, | |||
num_heads, | |||
window_size=7, | |||
shift_size=0, | |||
mlp_ratio=4., | |||
qkv_bias=True, | |||
qk_scale=None, | |||
drop=0., | |||
attn_drop=0., | |||
drop_path=0., | |||
act_layer=nn.GELU, | |||
norm_layer=nn.LayerNorm): | |||
super().__init__() | |||
self.dim = dim | |||
self.num_heads = num_heads | |||
self.window_size = window_size | |||
self.shift_size = shift_size | |||
self.mlp_ratio = mlp_ratio | |||
        assert 0 <= self.shift_size < self.window_size, 'shift_size must be in [0, window_size)'
self.norm1 = norm_layer(dim) | |||
self.attn = WindowAttention( | |||
dim, | |||
window_size=to_2tuple(self.window_size), | |||
num_heads=num_heads, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
attn_drop=attn_drop, | |||
proj_drop=drop) | |||
self.drop_path = DropPath( | |||
drop_path) if drop_path > 0. else nn.Identity() | |||
self.norm2 = norm_layer(dim) | |||
mlp_hidden_dim = int(dim * mlp_ratio) | |||
self.mlp = Mlp( | |||
in_features=dim, | |||
hidden_features=mlp_hidden_dim, | |||
act_layer=act_layer, | |||
drop=drop) | |||
self.H = None | |||
self.W = None | |||
def forward(self, x, mask_matrix): | |||
""" Forward function. | |||
Args: | |||
x: Input feature, tensor size (B, H*W, C). | |||
H, W: Spatial resolution of the input feature. | |||
mask_matrix: Attention mask for cyclic shift. | |||
""" | |||
B, L, C = x.shape | |||
H, W = self.H, self.W | |||
assert L == H * W, 'input feature has wrong size' | |||
shortcut = x | |||
x = self.norm1(x) | |||
x = x.view(B, H, W, C) | |||
# pad feature maps to multiples of window size | |||
pad_l = pad_t = 0 | |||
pad_r = (self.window_size - W % self.window_size) % self.window_size | |||
pad_b = (self.window_size - H % self.window_size) % self.window_size | |||
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) | |||
_, Hp, Wp, _ = x.shape | |||
# cyclic shift | |||
if self.shift_size > 0: | |||
shifted_x = torch.roll( | |||
x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) | |||
attn_mask = mask_matrix | |||
else: | |||
shifted_x = x | |||
attn_mask = None | |||
# partition windows | |||
x_windows = window_partition( | |||
shifted_x, self.window_size) # nW*B, window_size, window_size, C | |||
x_windows = x_windows.view(-1, self.window_size * self.window_size, | |||
C) # nW*B, window_size*window_size, C | |||
# W-MSA/SW-MSA | |||
attn_windows = self.attn( | |||
x_windows, mask=attn_mask) # nW*B, window_size*window_size, C | |||
# merge windows | |||
attn_windows = attn_windows.view(-1, self.window_size, | |||
self.window_size, C) | |||
shifted_x = window_reverse(attn_windows, self.window_size, Hp, | |||
Wp) # B H' W' C | |||
# reverse cyclic shift | |||
if self.shift_size > 0: | |||
x = torch.roll( | |||
shifted_x, | |||
shifts=(self.shift_size, self.shift_size), | |||
dims=(1, 2)) | |||
else: | |||
x = shifted_x | |||
if pad_r > 0 or pad_b > 0: | |||
x = x[:, :H, :W, :].contiguous() | |||
x = x.view(B, H * W, C) | |||
# FFN | |||
x = shortcut + self.drop_path(x) | |||
x = x + self.drop_path(self.mlp(self.norm2(x))) | |||
return x | |||
class PatchMerging(nn.Module): | |||
""" Patch Merging Layer | |||
Args: | |||
dim (int): Number of input channels. | |||
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
""" | |||
def __init__(self, dim, norm_layer=nn.LayerNorm): | |||
super().__init__() | |||
self.dim = dim | |||
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) | |||
self.norm = norm_layer(4 * dim) | |||
def forward(self, x, H, W): | |||
""" Forward function. | |||
Args: | |||
x: Input feature, tensor size (B, H*W, C). | |||
H, W: Spatial resolution of the input feature. | |||
""" | |||
B, L, C = x.shape | |||
assert L == H * W, 'input feature has wrong size' | |||
x = x.view(B, H, W, C) | |||
# padding | |||
pad_input = (H % 2 == 1) or (W % 2 == 1) | |||
if pad_input: | |||
x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) | |||
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C | |||
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C | |||
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C | |||
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C | |||
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C | |||
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C | |||
x = self.norm(x) | |||
x = self.reduction(x) | |||
return x | |||
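# Illustrative sketch (added here, not part of the original file):
# PatchMerging halves the spatial resolution and doubles the channels by
# concatenating each 2x2 neighbourhood and reducing 4C -> 2C linearly.
def _demo_patch_merging():
    merge = PatchMerging(dim=96)
    x = torch.randn(2, 14 * 14, 96)        # (B, H*W, C)
    out = merge(x, 14, 14)
    assert out.shape == (2, 7 * 7, 192)    # (B, H/2*W/2, 2C)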
class BasicLayer(nn.Module): | |||
""" A basic Swin Transformer layer for one stage. | |||
Args: | |||
dim (int): Number of feature channels | |||
depth (int): Depths of this stage. | |||
num_heads (int): Number of attention head. | |||
window_size (int): Local window size. Default: 7. | |||
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. | |||
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. | |||
drop (float, optional): Dropout rate. Default: 0.0 | |||
attn_drop (float, optional): Attention dropout rate. Default: 0.0 | |||
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 | |||
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None | |||
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. | |||
""" | |||
def __init__(self, | |||
dim, | |||
depth, | |||
num_heads, | |||
window_size=7, | |||
mlp_ratio=4., | |||
qkv_bias=True, | |||
qk_scale=None, | |||
drop=0., | |||
attn_drop=0., | |||
drop_path=0., | |||
norm_layer=nn.LayerNorm, | |||
downsample=None, | |||
use_checkpoint=False): | |||
super().__init__() | |||
self.window_size = window_size | |||
self.shift_size = window_size // 2 | |||
self.depth = depth | |||
self.use_checkpoint = use_checkpoint | |||
# build blocks | |||
self.blocks = nn.ModuleList([ | |||
SwinTransformerBlock( | |||
dim=dim, | |||
num_heads=num_heads, | |||
window_size=window_size, | |||
shift_size=0 if (i % 2 == 0) else window_size // 2, | |||
mlp_ratio=mlp_ratio, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
drop=drop, | |||
attn_drop=attn_drop, | |||
drop_path=drop_path[i] | |||
if isinstance(drop_path, list) else drop_path, | |||
norm_layer=norm_layer) for i in range(depth) | |||
]) | |||
# patch merging layer | |||
if downsample is not None: | |||
self.downsample = downsample(dim=dim, norm_layer=norm_layer) | |||
else: | |||
self.downsample = None | |||
def forward(self, x, H, W): | |||
""" Forward function. | |||
Args: | |||
x: Input feature, tensor size (B, H*W, C). | |||
H, W: Spatial resolution of the input feature. | |||
""" | |||
# calculate attention mask for SW-MSA | |||
Hp = int(np.ceil(H / self.window_size)) * self.window_size | |||
Wp = int(np.ceil(W / self.window_size)) * self.window_size | |||
img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 | |||
h_slices = (slice(0, -self.window_size), | |||
slice(-self.window_size, | |||
-self.shift_size), slice(-self.shift_size, None)) | |||
w_slices = (slice(0, -self.window_size), | |||
slice(-self.window_size, | |||
-self.shift_size), slice(-self.shift_size, None)) | |||
cnt = 0 | |||
for h in h_slices: | |||
for w in w_slices: | |||
img_mask[:, h, w, :] = cnt | |||
cnt += 1 | |||
mask_windows = window_partition( | |||
img_mask, self.window_size) # nW, window_size, window_size, 1 | |||
mask_windows = mask_windows.view(-1, | |||
self.window_size * self.window_size) | |||
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) | |||
attn_mask = attn_mask.masked_fill(attn_mask != 0, | |||
float(-100.0)).masked_fill( | |||
attn_mask == 0, float(0.0)) | |||
for blk in self.blocks: | |||
blk.H, blk.W = H, W | |||
if self.use_checkpoint: | |||
x = checkpoint.checkpoint(blk, x, attn_mask) | |||
else: | |||
x = blk(x, attn_mask) | |||
if self.downsample is not None: | |||
x_down = self.downsample(x, H, W) | |||
Wh, Ww = (H + 1) // 2, (W + 1) // 2 | |||
return x, H, W, x_down, Wh, Ww | |||
else: | |||
return x, H, W, x, H, W | |||
class PatchEmbed(nn.Module): | |||
""" Image to Patch Embedding | |||
Args: | |||
patch_size (int): Patch token size. Default: 4. | |||
in_chans (int): Number of input image channels. Default: 3. | |||
embed_dim (int): Number of linear projection output channels. Default: 96. | |||
norm_layer (nn.Module, optional): Normalization layer. Default: None | |||
""" | |||
def __init__(self, | |||
patch_size=4, | |||
in_chans=3, | |||
embed_dim=96, | |||
norm_layer=None): | |||
super().__init__() | |||
patch_size = to_2tuple(patch_size) | |||
self.patch_size = patch_size | |||
self.in_chans = in_chans | |||
self.embed_dim = embed_dim | |||
self.proj = nn.Conv2d( | |||
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) | |||
if norm_layer is not None: | |||
self.norm = norm_layer(embed_dim) | |||
else: | |||
self.norm = None | |||
def forward(self, x): | |||
"""Forward function.""" | |||
# padding | |||
_, _, H, W = x.size() | |||
if W % self.patch_size[1] != 0: | |||
x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) | |||
if H % self.patch_size[0] != 0: | |||
x = F.pad(x, | |||
(0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) | |||
x = self.proj(x) # B C Wh Ww | |||
if self.norm is not None: | |||
Wh, Ww = x.size(2), x.size(3) | |||
x = x.flatten(2).transpose(1, 2) | |||
x = self.norm(x) | |||
x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) | |||
return x | |||
class SwinTransformer(nn.Module): | |||
""" Swin Transformer backbone. | |||
    A PyTorch impl of: `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
https://arxiv.org/pdf/2103.14030 | |||
Args: | |||
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute position embedding. Default 224.
patch_size (int | tuple(int)): Patch size. Default: 4. | |||
in_chans (int): Number of input image channels. Default: 3. | |||
embed_dim (int): Number of linear projection output channels. Default: 96. | |||
depths (tuple[int]): Depths of each Swin Transformer stage. | |||
num_heads (tuple[int]): Number of attention head of each stage. | |||
window_size (int): Window size. Default: 7. | |||
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. | |||
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. | |||
drop_rate (float): Dropout rate. | |||
attn_drop_rate (float): Attention dropout rate. Default: 0. | |||
drop_path_rate (float): Stochastic depth rate. Default: 0.2. | |||
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. | |||
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. | |||
patch_norm (bool): If True, add normalization after patch embedding. Default: True. | |||
out_indices (Sequence[int]): Output from which stages. | |||
frozen_stages (int): Stages to be frozen (stop grad and set eval mode). | |||
-1 means not freezing any parameters. | |||
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. | |||
""" | |||
def __init__(self, | |||
pretrain_img_size=224, | |||
patch_size=4, | |||
in_chans=3, | |||
embed_dim=96, | |||
depths=[2, 2, 6, 2], | |||
num_heads=[3, 6, 12, 24], | |||
window_size=7, | |||
mlp_ratio=4., | |||
qkv_bias=True, | |||
qk_scale=None, | |||
drop_rate=0., | |||
attn_drop_rate=0., | |||
drop_path_rate=0.2, | |||
norm_layer=nn.LayerNorm, | |||
ape=False, | |||
patch_norm=True, | |||
out_indices=(0, 1, 2, 3), | |||
frozen_stages=-1, | |||
use_checkpoint=False): | |||
super().__init__() | |||
self.pretrain_img_size = pretrain_img_size | |||
self.num_layers = len(depths) | |||
self.embed_dim = embed_dim | |||
self.ape = ape | |||
self.patch_norm = patch_norm | |||
self.out_indices = out_indices | |||
self.frozen_stages = frozen_stages | |||
# split image into non-overlapping patches | |||
self.patch_embed = PatchEmbed( | |||
patch_size=patch_size, | |||
in_chans=in_chans, | |||
embed_dim=embed_dim, | |||
norm_layer=norm_layer if self.patch_norm else None) | |||
# absolute position embedding | |||
if self.ape: | |||
pretrain_img_size = to_2tuple(pretrain_img_size) | |||
patch_size = to_2tuple(patch_size) | |||
patches_resolution = [ | |||
pretrain_img_size[0] // patch_size[0], | |||
pretrain_img_size[1] // patch_size[1] | |||
] | |||
self.absolute_pos_embed = nn.Parameter( | |||
torch.zeros(1, embed_dim, patches_resolution[0], | |||
patches_resolution[1])) | |||
trunc_normal_(self.absolute_pos_embed, std=.02) | |||
self.pos_drop = nn.Dropout(p=drop_rate) | |||
# stochastic depth | |||
dpr = [ | |||
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) | |||
] # stochastic depth decay rule | |||
# build layers | |||
self.layers = nn.ModuleList() | |||
for i_layer in range(self.num_layers): | |||
layer = BasicLayer( | |||
dim=int(embed_dim * 2**i_layer), | |||
depth=depths[i_layer], | |||
num_heads=num_heads[i_layer], | |||
window_size=window_size, | |||
mlp_ratio=mlp_ratio, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
drop=drop_rate, | |||
attn_drop=attn_drop_rate, | |||
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], | |||
norm_layer=norm_layer, | |||
downsample=PatchMerging if | |||
(i_layer < self.num_layers - 1) else None, | |||
use_checkpoint=use_checkpoint) | |||
self.layers.append(layer) | |||
num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] | |||
self.num_features = num_features | |||
# add a norm layer for each output | |||
for i_layer in out_indices: | |||
layer = norm_layer(num_features[i_layer]) | |||
layer_name = f'norm{i_layer}' | |||
self.add_module(layer_name, layer) | |||
self._freeze_stages() | |||
def _freeze_stages(self): | |||
if self.frozen_stages >= 0: | |||
self.patch_embed.eval() | |||
for param in self.patch_embed.parameters(): | |||
param.requires_grad = False | |||
if self.frozen_stages >= 1 and self.ape: | |||
self.absolute_pos_embed.requires_grad = False | |||
if self.frozen_stages >= 2: | |||
self.pos_drop.eval() | |||
for i in range(0, self.frozen_stages - 1): | |||
m = self.layers[i] | |||
m.eval() | |||
for param in m.parameters(): | |||
param.requires_grad = False | |||
def init_weights(self, pretrained=None): | |||
"""Initialize the weights in backbone. | |||
Args: | |||
pretrained (str, optional): Path to pre-trained weights. | |||
Defaults to None. | |||
""" | |||
def _init_weights(m): | |||
if isinstance(m, nn.Linear): | |||
trunc_normal_(m.weight, std=.02) | |||
if isinstance(m, nn.Linear) and m.bias is not None: | |||
nn.init.constant_(m.bias, 0) | |||
elif isinstance(m, nn.LayerNorm): | |||
nn.init.constant_(m.bias, 0) | |||
nn.init.constant_(m.weight, 1.0) | |||
if isinstance(pretrained, str): | |||
self.apply(_init_weights) | |||
# logger = get_root_logger() | |||
load_checkpoint(self, pretrained, strict=False) | |||
elif pretrained is None: | |||
self.apply(_init_weights) | |||
else: | |||
raise TypeError('pretrained must be a str or None') | |||
def forward(self, x): | |||
"""Forward function.""" | |||
x = self.patch_embed(x) | |||
Wh, Ww = x.size(2), x.size(3) | |||
if self.ape: | |||
# interpolate the position embedding to the corresponding size | |||
absolute_pos_embed = F.interpolate( | |||
self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic') | |||
x = (x + absolute_pos_embed).flatten(2).transpose(1, | |||
2) # B Wh*Ww C | |||
else: | |||
x = x.flatten(2).transpose(1, 2) | |||
x = self.pos_drop(x) | |||
outs = [] | |||
for i in range(self.num_layers): | |||
layer = self.layers[i] | |||
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) | |||
if i in self.out_indices: | |||
norm_layer = getattr(self, f'norm{i}') | |||
x_out = norm_layer(x_out) | |||
out = x_out.view(-1, H, W, | |||
self.num_features[i]).permute(0, 3, 1, | |||
2).contiguous() | |||
outs.append(out) | |||
return tuple(outs) | |||
def train(self, mode=True): | |||
"""Convert the model into training mode while keep layers freezed.""" | |||
super(SwinTransformer, self).train(mode) | |||
self._freeze_stages() |
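# A minimal usage sketch (added for illustration, not in the original file):
# the backbone returns a feature pyramid with strides 4/8/16/32 and channel
# counts embed_dim * 2**i for the requested stages.
def _demo_swin_backbone():
    model = SwinTransformer(embed_dim=96, depths=[2, 2, 6, 2],
                            num_heads=[3, 6, 12, 24], out_indices=(0, 1, 2, 3))
    model.init_weights()                  # random init when `pretrained` is None
    outs = model(torch.randn(1, 3, 224, 224))
    assert [o.shape[1] for o in outs] == [96, 192, 384, 768]
    assert outs[0].shape[-2:] == (56, 56)  # stride-4 feature map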
@@ -0,0 +1,365 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from mmcv.cnn import ConvModule | |||
from .newcrf_utils import normal_init, resize | |||
class PPM(nn.ModuleList): | |||
"""Pooling Pyramid Module used in PSPNet. | |||
Args: | |||
pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid | |||
Module. | |||
in_channels (int): Input channels. | |||
channels (int): Channels after modules, before conv_seg. | |||
conv_cfg (dict|None): Config of conv layers. | |||
norm_cfg (dict|None): Config of norm layers. | |||
act_cfg (dict): Config of activation layers. | |||
align_corners (bool): align_corners argument of F.interpolate. | |||
""" | |||
def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_cfg, | |||
act_cfg, align_corners): | |||
super(PPM, self).__init__() | |||
self.pool_scales = pool_scales | |||
self.align_corners = align_corners | |||
self.in_channels = in_channels | |||
self.channels = channels | |||
self.conv_cfg = conv_cfg | |||
self.norm_cfg = norm_cfg | |||
self.act_cfg = act_cfg | |||
for pool_scale in pool_scales: | |||
            # if batch size = 1, BN is not supported, so change to GN
            # (note: once triggered, GN is kept for the remaining pool scales)
if pool_scale == 1: | |||
norm_cfg = dict(type='GN', requires_grad=True, num_groups=256) | |||
self.append( | |||
nn.Sequential( | |||
nn.AdaptiveAvgPool2d(pool_scale), | |||
ConvModule( | |||
self.in_channels, | |||
self.channels, | |||
1, | |||
conv_cfg=self.conv_cfg, | |||
norm_cfg=norm_cfg, | |||
act_cfg=self.act_cfg))) | |||
def forward(self, x): | |||
"""Forward function.""" | |||
ppm_outs = [] | |||
for ppm in self: | |||
ppm_out = ppm(x) | |||
upsampled_ppm_out = resize( | |||
ppm_out, | |||
size=x.size()[2:], | |||
mode='bilinear', | |||
align_corners=self.align_corners) | |||
ppm_outs.append(upsampled_ppm_out) | |||
return ppm_outs | |||
class BaseDecodeHead(nn.Module): | |||
"""Base class for BaseDecodeHead. | |||
Args: | |||
in_channels (int|Sequence[int]): Input channels. | |||
channels (int): Channels after modules, before conv_seg. | |||
num_classes (int): Number of classes. | |||
dropout_ratio (float): Ratio of dropout layer. Default: 0.1. | |||
conv_cfg (dict|None): Config of conv layers. Default: None. | |||
norm_cfg (dict|None): Config of norm layers. Default: None. | |||
act_cfg (dict): Config of activation layers. | |||
Default: dict(type='ReLU') | |||
in_index (int|Sequence[int]): Input feature index. Default: -1 | |||
input_transform (str|None): Transformation type of input features. | |||
Options: 'resize_concat', 'multiple_select', None. | |||
            'resize_concat': Multiple feature maps will be resized to the
                same size as the first one and then concatenated together.
                Usually used in FCN head of HRNet.
            'multiple_select': Multiple feature maps will be bundled into
                a list and passed into decode head.
            None: Only one select feature map is allowed.
Default: None. | |||
loss_decode (dict): Config of decode loss. | |||
Default: dict(type='CrossEntropyLoss'). | |||
ignore_index (int | None): The label index to be ignored. When using | |||
masked BCE loss, ignore_index should be set to None. Default: 255 | |||
sampler (dict|None): The config of segmentation map sampler. | |||
Default: None. | |||
align_corners (bool): align_corners argument of F.interpolate. | |||
Default: False. | |||
""" | |||
def __init__(self, | |||
in_channels, | |||
channels, | |||
*, | |||
num_classes, | |||
dropout_ratio=0.1, | |||
conv_cfg=None, | |||
norm_cfg=None, | |||
act_cfg=dict(type='ReLU'), | |||
in_index=-1, | |||
input_transform=None, | |||
loss_decode=dict( | |||
type='CrossEntropyLoss', | |||
use_sigmoid=False, | |||
loss_weight=1.0), | |||
ignore_index=255, | |||
sampler=None, | |||
align_corners=False): | |||
super(BaseDecodeHead, self).__init__() | |||
self._init_inputs(in_channels, in_index, input_transform) | |||
self.channels = channels | |||
self.num_classes = num_classes | |||
self.dropout_ratio = dropout_ratio | |||
self.conv_cfg = conv_cfg | |||
self.norm_cfg = norm_cfg | |||
self.act_cfg = act_cfg | |||
self.in_index = in_index | |||
# self.loss_decode = build_loss(loss_decode) | |||
self.ignore_index = ignore_index | |||
self.align_corners = align_corners | |||
# if sampler is not None: | |||
# self.sampler = build_pixel_sampler(sampler, context=self) | |||
# else: | |||
# self.sampler = None | |||
# self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) | |||
# self.conv1 = nn.Conv2d(channels, num_classes, 3, padding=1) | |||
if dropout_ratio > 0: | |||
self.dropout = nn.Dropout2d(dropout_ratio) | |||
else: | |||
self.dropout = None | |||
self.fp16_enabled = False | |||
def extra_repr(self): | |||
"""Extra repr.""" | |||
s = f'input_transform={self.input_transform}, ' \ | |||
f'ignore_index={self.ignore_index}, ' \ | |||
f'align_corners={self.align_corners}' | |||
return s | |||
def _init_inputs(self, in_channels, in_index, input_transform): | |||
"""Check and initialize input transforms. | |||
The in_channels, in_index and input_transform must match. | |||
        Specifically, when input_transform is None, only single feature map
        will be selected. So in_channels and in_index must be of type int.
        When input_transform is not None, in_channels and in_index must be
        list or tuple, with the same length.
Args: | |||
in_channels (int|Sequence[int]): Input channels. | |||
in_index (int|Sequence[int]): Input feature index. | |||
input_transform (str|None): Transformation type of input features. | |||
Options: 'resize_concat', 'multiple_select', None. | |||
                'resize_concat': Multiple feature maps will be resized to the
                    same size as the first one and then concatenated together.
                    Usually used in FCN head of HRNet.
                'multiple_select': Multiple feature maps will be bundled into
                    a list and passed into decode head.
                None: Only one select feature map is allowed.
""" | |||
if input_transform is not None: | |||
assert input_transform in ['resize_concat', 'multiple_select'] | |||
self.input_transform = input_transform | |||
self.in_index = in_index | |||
if input_transform is not None: | |||
assert isinstance(in_channels, (list, tuple)) | |||
assert isinstance(in_index, (list, tuple)) | |||
assert len(in_channels) == len(in_index) | |||
if input_transform == 'resize_concat': | |||
self.in_channels = sum(in_channels) | |||
else: | |||
self.in_channels = in_channels | |||
else: | |||
assert isinstance(in_channels, int) | |||
assert isinstance(in_index, int) | |||
self.in_channels = in_channels | |||
def init_weights(self): | |||
"""Initialize weights of classification layer.""" | |||
# normal_init(self.conv_seg, mean=0, std=0.01) | |||
# normal_init(self.conv1, mean=0, std=0.01) | |||
def _transform_inputs(self, inputs): | |||
"""Transform inputs for decoder. | |||
Args: | |||
inputs (list[Tensor]): List of multi-level img features. | |||
Returns: | |||
Tensor: The transformed inputs | |||
""" | |||
if self.input_transform == 'resize_concat': | |||
inputs = [inputs[i] for i in self.in_index] | |||
upsampled_inputs = [ | |||
resize( | |||
input=x, | |||
size=inputs[0].shape[2:], | |||
mode='bilinear', | |||
align_corners=self.align_corners) for x in inputs | |||
] | |||
inputs = torch.cat(upsampled_inputs, dim=1) | |||
elif self.input_transform == 'multiple_select': | |||
inputs = [inputs[i] for i in self.in_index] | |||
else: | |||
inputs = inputs[self.in_index] | |||
return inputs | |||
def forward(self, inputs): | |||
"""Placeholder of forward function.""" | |||
pass | |||
def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg): | |||
"""Forward function for training. | |||
Args: | |||
inputs (list[Tensor]): List of multi-level img features. | |||
img_metas (list[dict]): List of image info dict where each dict | |||
has: 'img_shape', 'scale_factor', 'flip', and may also contain | |||
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
For details on the values of these keys see | |||
`mmseg/datasets/pipelines/formatting.py:Collect`. | |||
gt_semantic_seg (Tensor): Semantic segmentation masks | |||
used if the architecture supports semantic segmentation task. | |||
train_cfg (dict): The training config. | |||
Returns: | |||
dict[str, Tensor]: a dictionary of loss components | |||
""" | |||
seg_logits = self.forward(inputs) | |||
losses = self.losses(seg_logits, gt_semantic_seg) | |||
return losses | |||
def forward_test(self, inputs, img_metas, test_cfg): | |||
"""Forward function for testing. | |||
Args: | |||
inputs (list[Tensor]): List of multi-level img features. | |||
img_metas (list[dict]): List of image info dict where each dict | |||
has: 'img_shape', 'scale_factor', 'flip', and may also contain | |||
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
For details on the values of these keys see | |||
`mmseg/datasets/pipelines/formatting.py:Collect`. | |||
test_cfg (dict): The testing config. | |||
Returns: | |||
Tensor: Output segmentation map. | |||
""" | |||
return self.forward(inputs) | |||
class UPerHead(BaseDecodeHead): | |||
def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): | |||
super(UPerHead, self).__init__( | |||
input_transform='multiple_select', **kwargs) | |||
# FPN Module | |||
self.lateral_convs = nn.ModuleList() | |||
self.fpn_convs = nn.ModuleList() | |||
        for in_channels in self.in_channels:  # build one lateral/FPN conv per input level
l_conv = ConvModule( | |||
in_channels, | |||
self.channels, | |||
1, | |||
conv_cfg=self.conv_cfg, | |||
norm_cfg=self.norm_cfg, | |||
act_cfg=self.act_cfg, | |||
inplace=True) | |||
fpn_conv = ConvModule( | |||
self.channels, | |||
self.channels, | |||
3, | |||
padding=1, | |||
conv_cfg=self.conv_cfg, | |||
norm_cfg=self.norm_cfg, | |||
act_cfg=self.act_cfg, | |||
inplace=True) | |||
self.lateral_convs.append(l_conv) | |||
self.fpn_convs.append(fpn_conv) | |||
def forward(self, inputs): | |||
"""Forward function.""" | |||
inputs = self._transform_inputs(inputs) | |||
# build laterals | |||
laterals = [ | |||
lateral_conv(inputs[i]) | |||
for i, lateral_conv in enumerate(self.lateral_convs) | |||
] | |||
# laterals.append(self.psp_forward(inputs)) | |||
# build top-down path | |||
used_backbone_levels = len(laterals) | |||
for i in range(used_backbone_levels - 1, 0, -1): | |||
prev_shape = laterals[i - 1].shape[2:] | |||
laterals[i - 1] += resize( | |||
laterals[i], | |||
size=prev_shape, | |||
mode='bilinear', | |||
align_corners=self.align_corners) | |||
# build outputs | |||
fpn_outs = [ | |||
self.fpn_convs[i](laterals[i]) | |||
for i in range(used_backbone_levels - 1) | |||
] | |||
        # append the top-level lateral feature
fpn_outs.append(laterals[-1]) | |||
return fpn_outs[0] | |||
class PSP(BaseDecodeHead): | |||
"""Unified Perceptual Parsing for Scene Understanding. | |||
This head is the implementation of `UPerNet | |||
<https://arxiv.org/abs/1807.10221>`_. | |||
Args: | |||
pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid | |||
Module applied on the last feature. Default: (1, 2, 3, 6). | |||
""" | |||
def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): | |||
super(PSP, self).__init__(input_transform='multiple_select', **kwargs) | |||
# PSP Module | |||
self.psp_modules = PPM( | |||
pool_scales, | |||
self.in_channels[-1], | |||
self.channels, | |||
conv_cfg=self.conv_cfg, | |||
norm_cfg=self.norm_cfg, | |||
act_cfg=self.act_cfg, | |||
align_corners=self.align_corners) | |||
self.bottleneck = ConvModule( | |||
self.in_channels[-1] + len(pool_scales) * self.channels, | |||
self.channels, | |||
3, | |||
padding=1, | |||
conv_cfg=self.conv_cfg, | |||
norm_cfg=self.norm_cfg, | |||
act_cfg=self.act_cfg) | |||
def psp_forward(self, inputs): | |||
"""Forward function of PSP module.""" | |||
x = inputs[-1] | |||
psp_outs = [x] | |||
psp_outs.extend(self.psp_modules(x)) | |||
psp_outs = torch.cat(psp_outs, dim=1) | |||
output = self.bottleneck(psp_outs) | |||
return output | |||
def forward(self, inputs): | |||
"""Forward function.""" | |||
inputs = self._transform_inputs(inputs) | |||
return self.psp_forward(inputs) |
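# A hedged usage sketch (added for illustration, not in the original file):
# PSP pools the last feature map at several scales, upsamples each pooled map
# back, concatenates everything with the input and fuses it with a 3x3
# bottleneck conv. The channel sizes and norm_cfg below are assumptions.
def _demo_psp_head():
    head = PSP(in_channels=[96, 192, 384, 768], in_index=[0, 1, 2, 3],
               channels=512, num_classes=1,
               norm_cfg=dict(type='BN', requires_grad=True))
    feats = [torch.randn(2, c, 56 // 2**i, 56 // 2**i)
             for i, c in enumerate([96, 192, 384, 768])]
    out = head(feats)
    assert out.shape == (2, 512, 7, 7)  # fused map at the coarsest scale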
@@ -0,0 +1,53 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os.path as osp | |||
import numpy as np | |||
import torch | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base.base_torch_model import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.models.cv.image_depth_estimation.networks.newcrf_depth import \ | |||
NewCRFDepth | |||
from modelscope.outputs import OutputKeys | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
@MODELS.register_module( | |||
Tasks.image_depth_estimation, module_name=Models.newcrfs_depth_estimation) | |||
class DepthEstimation(TorchModel): | |||
    def __init__(self, model_dir: str, **kwargs):
        """Build the NeWCRFs depth estimation model and load its weights.
        Args:
            model_dir (str): Root directory of the model files.
        """
        super().__init__(model_dir, **kwargs)
# build model | |||
self.model = NewCRFDepth( | |||
version='large07', inv_depth=False, max_depth=10) | |||
# load model | |||
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
checkpoint = torch.load(model_path) | |||
state_dict = {} | |||
for k in checkpoint['model'].keys(): | |||
if k.startswith('module.'): | |||
state_dict[k[7:]] = checkpoint['model'][k] | |||
else: | |||
state_dict[k] = checkpoint['model'][k] | |||
self.model.load_state_dict(state_dict) | |||
self.model.eval() | |||
    def forward(self, inputs):
        return self.model(inputs['imgs'])
    def postprocess(self, inputs):
        results = {OutputKeys.DEPTHS: inputs}
        return results
def inference(self, data): | |||
results = self.forward(data) | |||
return results |
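# A hedged usage sketch (added for illustration, not part of the original
# file). The model is normally built through the modelscope pipeline; calling
# it directly assumes a local `model_dir` containing ModelFile.TORCH_MODEL_FILE
# (the path below is a placeholder):
#
#   model = DepthEstimation('/path/to/model_dir')
#   out = model.postprocess(model.inference({'imgs': torch.randn(1, 3, 480, 640)}))
#   print(out[OutputKeys.DEPTHS].shape)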
@@ -25,7 +25,14 @@ def seg_resize(input, | |||
                        'the output would be more aligned if '
f'input size {(input_h, input_w)} is `x+1` and ' | |||
f'out size {(output_h, output_w)} is `nx+1`') | |||
return F.interpolate(input, size, scale_factor, mode, align_corners) | |||
    try:
        return F.interpolate(input, size, scale_factor, mode, align_corners)
    except ValueError:
        # some callers pass a 3-element size (e.g. including a channel dim);
        # drop the extra entry and retry, otherwise re-raise
        if isinstance(size, tuple) and len(size) == 3:
            size = size[:2]
            return F.interpolate(input, size, scale_factor, mode, align_corners)
        raise
def add_prefix(inputs, prefix): | |||
@@ -1,3 +1,4 @@ | |||
# The implementation is adopted from U-2-Net, made publicly available under the Apache 2.0 License | |||
# source code available via https://github.com/xuebinqin/U-2-Net
from .senet import SENet | |||
from .u2net import U2NET |
@@ -0,0 +1,187 @@ | |||
# Implementation in this file is modified based on Res2Net-PretrainedModels | |||
# Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License | |||
# publicly available at https://github.com/Res2Net/Res2Net-PretrainedModels/blob/master/res2net_v1b.py
import math | |||
import torch | |||
import torch.nn as nn | |||
__all__ = ['Res2Net', 'res2net50_v1b_26w_4s'] | |||
class Bottle2neck(nn.Module): | |||
expansion = 4 | |||
def __init__(self, | |||
inplanes, | |||
planes, | |||
stride=1, | |||
downsample=None, | |||
baseWidth=26, | |||
scale=4, | |||
stype='normal'): | |||
""" Constructor | |||
Args: | |||
inplanes: input channel dimensionality | |||
planes: output channel dimensionality | |||
stride: conv stride. Replaces pooling layer. | |||
downsample: None when stride = 1 | |||
baseWidth: basic width of conv3x3 | |||
            scale: number of scales.
            stype: 'normal': normal set. 'stage': first block of a new stage.
""" | |||
super(Bottle2neck, self).__init__() | |||
width = int(math.floor(planes * (baseWidth / 64.0))) | |||
self.conv1 = nn.Conv2d( | |||
inplanes, width * scale, kernel_size=1, bias=False) | |||
self.bn1 = nn.BatchNorm2d(width * scale) | |||
if scale == 1: | |||
self.nums = 1 | |||
else: | |||
self.nums = scale - 1 | |||
if stype == 'stage': | |||
self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1) | |||
convs = [] | |||
bns = [] | |||
for i in range(self.nums): | |||
convs.append( | |||
nn.Conv2d( | |||
width, | |||
width, | |||
kernel_size=3, | |||
stride=stride, | |||
padding=1, | |||
bias=False)) | |||
bns.append(nn.BatchNorm2d(width)) | |||
self.convs = nn.ModuleList(convs) | |||
self.bns = nn.ModuleList(bns) | |||
self.conv3 = nn.Conv2d( | |||
width * scale, planes * self.expansion, kernel_size=1, bias=False) | |||
self.bn3 = nn.BatchNorm2d(planes * self.expansion) | |||
self.relu = nn.ReLU(inplace=True) | |||
self.downsample = downsample | |||
self.stype = stype | |||
self.scale = scale | |||
self.width = width | |||
def forward(self, x): | |||
residual = x | |||
out = self.conv1(x) | |||
out = self.bn1(out) | |||
out = self.relu(out) | |||
spx = torch.split(out, self.width, 1) | |||
for i in range(self.nums): | |||
if i == 0 or self.stype == 'stage': | |||
sp = spx[i] | |||
else: | |||
sp = sp + spx[i] | |||
sp = self.convs[i](sp) | |||
sp = self.relu(self.bns[i](sp)) | |||
if i == 0: | |||
out = sp | |||
else: | |||
out = torch.cat((out, sp), 1) | |||
if self.scale != 1 and self.stype == 'normal': | |||
out = torch.cat((out, spx[self.nums]), 1) | |||
elif self.scale != 1 and self.stype == 'stage': | |||
out = torch.cat((out, self.pool(spx[self.nums])), 1) | |||
out = self.conv3(out) | |||
out = self.bn3(out) | |||
if self.downsample is not None: | |||
residual = self.downsample(x) | |||
out += residual | |||
out = self.relu(out) | |||
return out | |||
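For concreteness, the channel bookkeeping of `Bottle2neck` with the defaults used below (`baseWidth=26`, `scale=4`) and a typical first-stage block:

```python
import math

inplanes, planes, baseWidth, scale = 256, 64, 26, 4
width = int(math.floor(planes * (baseWidth / 64.0)))
print(width)          # 26: channels per split branch
print(width * scale)  # 104: conv1 output, split into 4 chunks of 26
print(planes * 4)     # 256: conv3 restores planes * expansion
```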
class Res2Net(nn.Module): | |||
def __init__(self, block, layers, baseWidth=26, scale=4, num_classes=1000): | |||
self.inplanes = 64 | |||
super(Res2Net, self).__init__() | |||
self.baseWidth = baseWidth | |||
self.scale = scale | |||
self.conv1 = nn.Sequential( | |||
nn.Conv2d(3, 32, 3, 2, 1, bias=False), nn.BatchNorm2d(32), | |||
nn.ReLU(inplace=True), nn.Conv2d(32, 32, 3, 1, 1, bias=False), | |||
nn.BatchNorm2d(32), nn.ReLU(inplace=True), | |||
nn.Conv2d(32, 64, 3, 1, 1, bias=False)) | |||
self.bn1 = nn.BatchNorm2d(64) | |||
self.relu = nn.ReLU() | |||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) | |||
self.layer1 = self._make_layer(block, 64, layers[0]) | |||
self.layer2 = self._make_layer(block, 128, layers[1], stride=2) | |||
self.layer3 = self._make_layer(block, 256, layers[2], stride=2) | |||
self.layer4 = self._make_layer(block, 512, layers[3], stride=2) | |||
self.avgpool = nn.AdaptiveAvgPool2d(1) | |||
self.fc = nn.Linear(512 * block.expansion, num_classes) | |||
for m in self.modules(): | |||
if isinstance(m, nn.Conv2d): | |||
nn.init.kaiming_normal_( | |||
m.weight, mode='fan_out', nonlinearity='relu') | |||
elif isinstance(m, nn.BatchNorm2d): | |||
nn.init.constant_(m.weight, 1) | |||
nn.init.constant_(m.bias, 0) | |||
def _make_layer(self, block, planes, blocks, stride=1): | |||
downsample = None | |||
if stride != 1 or self.inplanes != planes * block.expansion: | |||
downsample = nn.Sequential( | |||
nn.AvgPool2d( | |||
kernel_size=stride, | |||
stride=stride, | |||
ceil_mode=True, | |||
count_include_pad=False), | |||
nn.Conv2d( | |||
self.inplanes, | |||
planes * block.expansion, | |||
kernel_size=1, | |||
stride=1, | |||
bias=False), | |||
nn.BatchNorm2d(planes * block.expansion), | |||
) | |||
layers = [] | |||
layers.append( | |||
block( | |||
self.inplanes, | |||
planes, | |||
stride, | |||
downsample=downsample, | |||
stype='stage', | |||
baseWidth=self.baseWidth, | |||
scale=self.scale)) | |||
self.inplanes = planes * block.expansion | |||
for i in range(1, blocks): | |||
layers.append( | |||
block( | |||
self.inplanes, | |||
planes, | |||
baseWidth=self.baseWidth, | |||
scale=self.scale)) | |||
return nn.Sequential(*layers) | |||
def forward(self, x): | |||
x = self.conv1(x) | |||
x = self.bn1(x) | |||
x = self.relu(x) | |||
x = self.maxpool(x) | |||
x = self.layer1(x) | |||
x = self.layer2(x) | |||
x = self.layer3(x) | |||
x = self.layer4(x) | |||
x = self.avgpool(x) | |||
x = x.view(x.size(0), -1) | |||
x = self.fc(x) | |||
return x | |||
def res2net50_v1b_26w_4s(backbone_path, pretrained=False, **kwargs): | |||
"""Constructs a Res2Net-50_v1b_26w_4s lib. | |||
Args: | |||
pretrained (bool): If True, returns a lib pre-trained on ImageNet | |||
""" | |||
model = Res2Net(Bottle2neck, [3, 4, 6, 3], baseWidth=26, scale=4, **kwargs) | |||
if pretrained: | |||
model_state = torch.load(backbone_path) | |||
model.load_state_dict(model_state) | |||
return model |
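A random-weight smoke test of the builder above; with `pretrained=False` the checkpoint load is skipped, so no `backbone_path` is needed:

```python
import torch

model = res2net50_v1b_26w_4s(backbone_path=None, pretrained=False)
model.eval()
with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # torch.Size([1, 1000])
```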
@@ -0,0 +1,6 @@ | |||
# Implementation in this file is modified based on Res2Net-PretrainedModels | |||
# Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License | |||
# publicly available at https://github.com/Res2Net/Res2Net-PretrainedModels/blob/master/res2net_v1b.py | |||
from .Res2Net_v1b import res2net50_v1b_26w_4s | |||
__all__ = ['res2net50_v1b_26w_4s'] |
@@ -0,0 +1,178 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from .utils import ConvBNReLU | |||
class AreaLayer(nn.Module): | |||
def __init__(self, in_channel, out_channel): | |||
super(AreaLayer, self).__init__() | |||
self.lbody = nn.Sequential( | |||
nn.Conv2d(out_channel, out_channel, 1), | |||
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True)) | |||
self.hbody = nn.Sequential( | |||
nn.Conv2d(in_channel, out_channel, 1), nn.BatchNorm2d(out_channel), | |||
nn.ReLU(inplace=True)) | |||
self.body = nn.Sequential( | |||
nn.Conv2d(2 * out_channel, out_channel, 3, 1, 1), | |||
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True), | |||
nn.Conv2d(out_channel, out_channel, 3, 1, 1), | |||
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True), | |||
nn.Conv2d(out_channel, 1, 1)) | |||
def forward(self, xl, xh): | |||
xl1 = self.lbody(xl) | |||
xl1 = F.interpolate( | |||
xl1, size=xh.size()[2:], mode='bilinear', align_corners=True) | |||
xh1 = self.hbody(xh) | |||
x = torch.cat((xl1, xh1), dim=1) | |||
x_out = self.body(x) | |||
return x_out | |||
class EdgeLayer(nn.Module): | |||
def __init__(self, in_channel, out_channel): | |||
super(EdgeLayer, self).__init__() | |||
self.lbody = nn.Sequential( | |||
nn.Conv2d(out_channel, out_channel, 1), | |||
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True)) | |||
self.hbody = nn.Sequential( | |||
nn.Conv2d(in_channel, out_channel, 1), nn.BatchNorm2d(out_channel), | |||
nn.ReLU(inplace=True)) | |||
self.bodye = nn.Sequential( | |||
nn.Conv2d(2 * out_channel, out_channel, 3, 1, 1), | |||
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True), | |||
nn.Conv2d(out_channel, out_channel, 3, 1, 1), | |||
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True), | |||
nn.Conv2d(out_channel, 1, 1)) | |||
def forward(self, xl, xh): | |||
xl1 = self.lbody(xl) | |||
xh1 = self.hbody(xh) | |||
xh1 = F.interpolate( | |||
xh1, size=xl.size()[2:], mode='bilinear', align_corners=True) | |||
x = torch.cat((xl1, xh1), dim=1) | |||
x_out = self.bodye(x) | |||
return x_out | |||
class EBlock(nn.Module): | |||
def __init__(self, inchs, outchs): | |||
super(EBlock, self).__init__() | |||
self.elayer = nn.Sequential( | |||
ConvBNReLU(inchs + 1, outchs, kernel_size=3, padding=1, stride=1), | |||
ConvBNReLU(outchs, outchs, 1)) | |||
self.salayer = nn.Sequential( | |||
nn.Conv2d(2, 1, 3, 1, 1, bias=False), | |||
nn.BatchNorm2d(1, momentum=0.01), nn.Sigmoid()) | |||
def forward(self, x, edgeAtten): | |||
x = torch.cat((x, edgeAtten), dim=1) | |||
ex = self.elayer(x) | |||
ex_max = torch.max(ex, 1, keepdim=True)[0] | |||
ex_mean = torch.mean(ex, dim=1, keepdim=True) | |||
xei_compress = torch.cat((ex_max, ex_mean), dim=1) | |||
scale = self.salayer(xei_compress) | |||
x_out = ex * scale | |||
return x_out | |||
class StructureE(nn.Module): | |||
def __init__(self, inchs, outchs, EM): | |||
super(StructureE, self).__init__() | |||
self.ne_modules = int(inchs / EM) | |||
NM = int(outchs / self.ne_modules) | |||
elayes = [] | |||
for i in range(self.ne_modules): | |||
emblock = EBlock(EM, NM) | |||
elayes.append(emblock) | |||
self.emlayes = nn.ModuleList(elayes) | |||
self.body = nn.Sequential( | |||
ConvBNReLU(outchs, outchs, 3, 1, 1), ConvBNReLU(outchs, outchs, 1)) | |||
def forward(self, x, edgeAtten): | |||
if edgeAtten.size() != x.size(): | |||
edgeAtten = F.interpolate( | |||
edgeAtten, x.size()[2:], mode='bilinear', align_corners=False) | |||
xx = torch.chunk(x, self.ne_modules, dim=1) | |||
efeas = [] | |||
for i in range(self.ne_modules): | |||
xei = self.emlayes[i](xx[i], edgeAtten) | |||
efeas.append(xei) | |||
efeas = torch.cat(efeas, dim=1) | |||
x_out = self.body(efeas) | |||
return x_out | |||
class ABlock(nn.Module): | |||
def __init__(self, inchs, outchs, k): | |||
super(ABlock, self).__init__() | |||
self.alayer = nn.Sequential( | |||
ConvBNReLU(inchs, outchs, k, 1, k // 2), | |||
ConvBNReLU(outchs, outchs, 1)) | |||
self.arlayer = nn.Sequential( | |||
ConvBNReLU(inchs, outchs, k, 1, k // 2), | |||
ConvBNReLU(outchs, outchs, 1)) | |||
self.fusion = ConvBNReLU(2 * outchs, outchs, 1) | |||
def forward(self, x, areaAtten): | |||
xa = x * areaAtten | |||
xra = x * (1 - areaAtten) | |||
xout = self.fusion(torch.cat((xa, xra), dim=1)) | |||
return xout | |||
class AMFusion(nn.Module): | |||
def __init__(self, inchs, outchs, AM): | |||
super(AMFusion, self).__init__() | |||
self.k = [3, 3, 5, 5] | |||
self.conv_up = ConvBNReLU(inchs, outchs, 3, 1, 1) | |||
self.up = nn.Upsample( | |||
scale_factor=2, mode='bilinear', align_corners=True) | |||
self.na_modules = int(outchs / AM) | |||
alayers = [] | |||
for i in range(self.na_modules): | |||
layer = ABlock(AM, AM, self.k[i]) | |||
alayers.append(layer) | |||
self.alayers = nn.ModuleList(alayers) | |||
self.fusion_0 = ConvBNReLU(outchs, outchs, 3, 1, 1) | |||
self.fusion_e = nn.Sequential( | |||
nn.Conv2d( | |||
outchs, outchs, kernel_size=(3, 1), padding=(1, 0), | |||
bias=False), nn.BatchNorm2d(outchs), nn.ReLU(inplace=True), | |||
nn.Conv2d( | |||
outchs, outchs, kernel_size=(1, 3), padding=(0, 1), | |||
bias=False), nn.BatchNorm2d(outchs), nn.ReLU(inplace=True)) | |||
self.fusion_e1 = nn.Sequential( | |||
nn.Conv2d( | |||
outchs, outchs, kernel_size=(5, 1), padding=(2, 0), | |||
bias=False), nn.BatchNorm2d(outchs), nn.ReLU(inplace=True), | |||
nn.Conv2d( | |||
outchs, outchs, kernel_size=(1, 5), padding=(0, 2), | |||
bias=False), nn.BatchNorm2d(outchs), nn.ReLU(inplace=True)) | |||
self.fusion = ConvBNReLU(3 * outchs, outchs, 1) | |||
def forward(self, xl, xh, xhm): | |||
xh1 = self.up(self.conv_up(xh)) | |||
x = xh1 + xl | |||
xm = self.up(torch.sigmoid(xhm)) | |||
xx = torch.chunk(x, self.na_modules, dim=1) | |||
xxmids = [] | |||
for i in range(self.na_modules): | |||
xi = self.alayers[i](xx[i], xm) | |||
xxmids.append(xi) | |||
xfea = torch.cat(xxmids, dim=1) | |||
x0 = self.fusion_0(xfea) | |||
x1 = self.fusion_e(xfea) | |||
x2 = self.fusion_e1(xfea) | |||
x_out = self.fusion(torch.cat((x0, x1, x2), dim=1)) | |||
return x_out |
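The grouped design of `StructureE` is easiest to see with the configuration SENet uses below: `StructureE(256, 128, 128)` chunks its 256-channel input into 256/128 = 2 groups, each `EBlock` maps 128 (+1 edge-attention) channels down to 128/2 = 64, and the concatenation restores 128. A shape check, assuming the classes above are in scope:

```python
import torch

se = StructureE(inchs=256, outchs=128, EM=128)
x = torch.randn(1, 256, 80, 80)
edge_atten = torch.randn(1, 1, 80, 80)  # single-channel edge attention map
print(se(x, edge_atten).shape)          # torch.Size([1, 128, 80, 80])
```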
@@ -0,0 +1,74 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from .backbone import res2net50_v1b_26w_4s as res2net | |||
from .modules import AMFusion, AreaLayer, EdgeLayer, StructureE | |||
from .utils import ASPP, CBAM, ConvBNReLU | |||
class SENet(nn.Module): | |||
def __init__(self, backbone_path=None, pretrained=False): | |||
super(SENet, self).__init__() | |||
resnet50 = res2net(backbone_path, pretrained) | |||
self.layer0_1 = nn.Sequential(resnet50.conv1, resnet50.bn1, | |||
resnet50.relu) | |||
self.maxpool = resnet50.maxpool | |||
self.layer1 = resnet50.layer1 | |||
self.layer2 = resnet50.layer2 | |||
self.layer3 = resnet50.layer3 | |||
self.layer4 = resnet50.layer4 | |||
self.aspp3 = ASPP(1024, 256) | |||
self.aspp4 = ASPP(2048, 256) | |||
self.cbblock3 = CBAM(inchs=256, kernel_size=5) | |||
self.cbblock4 = CBAM(inchs=256, kernel_size=5) | |||
self.up = nn.Upsample( | |||
mode='bilinear', scale_factor=2, align_corners=False) | |||
self.conv_up = ConvBNReLU(512, 512, 1) | |||
self.aux_edge = EdgeLayer(512, 256) | |||
self.aux_area = AreaLayer(512, 256) | |||
self.layer1_enhance = StructureE(256, 128, 128) | |||
self.layer2_enhance = StructureE(512, 256, 128) | |||
self.layer3_decoder = AMFusion(512, 256, 128) | |||
self.layer2_decoder = AMFusion(256, 128, 128) | |||
self.out_conv_8 = nn.Conv2d(256, 1, 1) | |||
self.out_conv_4 = nn.Conv2d(128, 1, 1) | |||
def forward(self, x): | |||
layer0 = self.layer0_1(x) | |||
layer0s = self.maxpool(layer0) | |||
layer1 = self.layer1(layer0s) | |||
layer2 = self.layer2(layer1) | |||
layer3 = self.layer3(layer2) | |||
layer4 = self.layer4(layer3) | |||
layer3_eh = self.cbblock3(self.aspp3(layer3)) | |||
layer4_eh = self.cbblock4(self.aspp4(layer4)) | |||
layer34 = self.conv_up( | |||
torch.cat((self.up(layer4_eh), layer3_eh), dim=1)) | |||
edge_atten = self.aux_edge(layer1, layer34) | |||
area_atten = self.aux_area(layer1, layer34) | |||
edge_atten_ = torch.sigmoid(edge_atten) | |||
layer1_eh = self.layer1_enhance(layer1, edge_atten_) | |||
layer2_eh = self.layer2_enhance(layer2, edge_atten_) | |||
layer2_fu = self.layer3_decoder(layer2_eh, layer34, area_atten) | |||
out_8 = self.out_conv_8(layer2_fu) | |||
layer1_fu = self.layer2_decoder(layer1_eh, layer2_fu, out_8) | |||
out_4 = self.out_conv_4(layer1_fu) | |||
out_16 = F.interpolate( | |||
area_atten, | |||
size=x.size()[2:], | |||
mode='bilinear', | |||
align_corners=False) | |||
out_8 = F.interpolate( | |||
out_8, size=x.size()[2:], mode='bilinear', align_corners=False) | |||
out_4 = F.interpolate( | |||
out_4, size=x.size()[2:], mode='bilinear', align_corners=False) | |||
edge_out = F.interpolate( | |||
edge_atten_, | |||
size=x.size()[2:], | |||
mode='bilinear', | |||
align_corners=False) | |||
return out_4.sigmoid(), out_8.sigmoid(), out_16.sigmoid(), edge_out |
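A forward-shape check of the `SENet` above with random weights; all four outputs are upsampled to the input resolution and sigmoid-activated (the input side should be divisible by 32 for the feature pyramid):

```python
import torch

net = SENet(backbone_path=None, pretrained=False).eval()
with torch.no_grad():
    out_4, out_8, out_16, edge = net(torch.randn(1, 3, 320, 320))
print(out_4.shape, out_8.shape, out_16.shape, edge.shape)
# each: torch.Size([1, 1, 320, 320])
```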
@@ -0,0 +1,105 @@ | |||
# Implementation in this file is modified based on deeplabv3 | |||
# Originally MIT-licensed, publicly available at https://github.com/fregu856/deeplabv3/blob/master/model/aspp.py | |||
# Implementation in this file is modified based on attention-module | |||
# Originally MIT-licensed, publicly available at https://github.com/Jongchan/attention-module/blob/master/MODELS/cbam.py | |||
import torch | |||
import torch.nn as nn | |||
class ConvBNReLU(nn.Module): | |||
def __init__(self, | |||
inplanes, | |||
planes, | |||
kernel_size=3, | |||
stride=1, | |||
padding=0, | |||
dilation=1, | |||
bias=False): | |||
super(ConvBNReLU, self).__init__() | |||
self.block = nn.Sequential( | |||
nn.Conv2d( | |||
inplanes, | |||
planes, | |||
kernel_size, | |||
stride=stride, | |||
padding=padding, | |||
dilation=dilation, | |||
bias=bias), nn.BatchNorm2d(planes), nn.ReLU(inplace=True)) | |||
def forward(self, x): | |||
return self.block(x) | |||
class ASPP(nn.Module): | |||
def __init__(self, in_dim, out_dim): | |||
super(ASPP, self).__init__() | |||
mid_dim = 128 | |||
self.conv1 = ConvBNReLU(in_dim, mid_dim, kernel_size=1) | |||
self.conv2 = ConvBNReLU( | |||
in_dim, mid_dim, kernel_size=3, padding=2, dilation=2) | |||
self.conv3 = ConvBNReLU( | |||
in_dim, mid_dim, kernel_size=3, padding=5, dilation=5) | |||
self.conv4 = ConvBNReLU( | |||
in_dim, mid_dim, kernel_size=3, padding=7, dilation=7) | |||
self.conv5 = ConvBNReLU(in_dim, mid_dim, kernel_size=1, padding=0) | |||
self.fuse = ConvBNReLU(5 * mid_dim, out_dim, 3, 1, 1) | |||
self.global_pooling = nn.AdaptiveAvgPool2d(1) | |||
def forward(self, x): | |||
conv1 = self.conv1(x) | |||
conv2 = self.conv2(x) | |||
conv3 = self.conv3(x) | |||
conv4 = self.conv4(x) | |||
xg = self.conv5(self.global_pooling(x)) | |||
conv5 = nn.Upsample((x.shape[2], x.shape[3]), mode='nearest')(xg) | |||
return self.fuse(torch.cat((conv1, conv2, conv3, conv4, conv5), 1)) | |||
class ChannelAttention(nn.Module): | |||
def __init__(self, inchs, ratio=16): | |||
super(ChannelAttention, self).__init__() | |||
self.avg_pool = nn.AdaptiveAvgPool2d(1) | |||
self.max_pool = nn.AdaptiveMaxPool2d(1) | |||
self.fc = nn.Sequential( | |||
    nn.Conv2d(inchs, inchs // ratio, 1, bias=False), nn.ReLU(), | |||
    nn.Conv2d(inchs // ratio, inchs, 1, bias=False)) | |||
self.sigmoid = nn.Sigmoid() | |||
def forward(self, x): | |||
avg_out = self.fc(self.avg_pool(x)) | |||
max_out = self.fc(self.max_pool(x)) | |||
out = avg_out + max_out | |||
return self.sigmoid(out) | |||
class SpatialAttention(nn.Module): | |||
def __init__(self, kernel_size=7): | |||
super(SpatialAttention, self).__init__() | |||
self.conv1 = nn.Conv2d( | |||
2, 1, kernel_size, padding=kernel_size // 2, bias=False) | |||
self.sigmoid = nn.Sigmoid() | |||
def forward(self, x): | |||
avg_out = torch.mean(x, dim=1, keepdim=True) | |||
max_out, _ = torch.max(x, dim=1, keepdim=True) | |||
x = torch.cat([avg_out, max_out], dim=1) | |||
x = self.conv1(x) | |||
return self.sigmoid(x) | |||
class CBAM(nn.Module): | |||
def __init__(self, inchs, kernel_size=7): | |||
super().__init__() | |||
self.calayer = ChannelAttention(inchs=inchs) | |||
self.saLayer = SpatialAttention(kernel_size=kernel_size) | |||
def forward(self, x): | |||
xca = self.calayer(x) * x | |||
xsa = self.saLayer(xca) * xca | |||
return xsa |
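ASPP maps its input to `out_dim` channels at the same resolution, and CBAM preserves its input shape entirely, which is what lets `SENet` drop them in after the backbone stages. A minimal check, assuming the classes above are in scope:

```python
import torch

aspp = ASPP(in_dim=1024, out_dim=256)
cbam = CBAM(inchs=256, kernel_size=5)
print(aspp(torch.randn(2, 1024, 20, 20)).shape)  # torch.Size([2, 256, 20, 20])
print(cbam(torch.randn(2, 256, 20, 20)).shape)   # torch.Size([2, 256, 20, 20])
```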
@@ -2,7 +2,6 @@ | |||
import os.path as osp | |||
import cv2 | |||
import numpy as np | |||
import torch | |||
from PIL import Image | |||
from torchvision import transforms | |||
@@ -10,8 +9,9 @@ from torchvision import transforms | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base.base_torch_model import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from .models import U2NET | |||
from .models import U2NET, SENet | |||
@MODELS.register_module( | |||
@@ -22,13 +22,25 @@ class SalientDetection(TorchModel): | |||
"""str -- model file root.""" | |||
super().__init__(model_dir, *args, **kwargs) | |||
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
self.model = U2NET(3, 1) | |||
self.norm_mean = [0.485, 0.456, 0.406] | |||
self.norm_std = [0.229, 0.224, 0.225] | |||
self.norm_size = (320, 320) | |||
config_path = osp.join(model_dir, 'config.py') | |||
if not osp.exists(config_path): | |||
self.model = U2NET(3, 1) | |||
else: | |||
self.model = SENet(backbone_path=None, pretrained=False) | |||
config = Config.from_file(config_path) | |||
self.norm_mean = config.norm_mean | |||
self.norm_std = config.norm_std | |||
self.norm_size = config.norm_size | |||
checkpoint = torch.load(model_path, map_location='cpu') | |||
self.transform_input = transforms.Compose([ | |||
transforms.Resize((320, 320)), | |||
transforms.Resize(self.norm_size), | |||
transforms.ToTensor(), | |||
transforms.Normalize( | |||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |||
transforms.Normalize(mean=self.norm_mean, std=self.norm_std) | |||
]) | |||
self.model.load_state_dict(checkpoint) | |||
self.model.eval() | |||
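The fallback keeps the previously hard-coded U2NET defaults, which are the standard ImageNet normalization statistics; models that ship a `config.py` override them. A standalone version of the resulting transform for reference (the image path is hypothetical):

```python
from PIL import Image
from torchvision import transforms

transform_input = transforms.Compose([
    transforms.Resize((320, 320)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
# batch = transform_input(Image.open('demo.jpg').convert('RGB')).unsqueeze(0)
```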
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
from typing import TYPE_CHECKING | |||
@@ -1,10 +1,11 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import copy | |||
from .darknet import CSPDarknet | |||
from .tinynas import load_tinynas_net | |||
from .tinynas_csp import load_tinynas_net as load_tinynas_net_csp | |||
from .tinynas_res import load_tinynas_net as load_tinynas_net_res | |||
def build_backbone(cfg): | |||
@@ -12,5 +13,7 @@ def build_backbone(cfg): | |||
name = backbone_cfg.pop('name') | |||
if name == 'CSPDarknet': | |||
return CSPDarknet(**backbone_cfg) | |||
elif name == 'TinyNAS': | |||
return load_tinynas_net(backbone_cfg) | |||
elif name == 'TinyNAS_csp': | |||
return load_tinynas_net_csp(backbone_cfg) | |||
elif name == 'TinyNAS_res': | |||
return load_tinynas_net_res(backbone_cfg) |
@@ -1,12 +1,11 @@ | |||
# Copyright (c) Megvii Inc. All rights reserved. | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
import torch | |||
from torch import nn | |||
from ..core.base_ops import (BaseConv, CSPLayer, DWConv, Focus, ResLayer, | |||
SPPBottleneck) | |||
from modelscope.models.cv.tinynas_detection.core.base_ops import ( | |||
BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck) | |||
class CSPDarknet(nn.Module): | |||
@@ -1,359 +0,0 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
import torch | |||
import torch.nn as nn | |||
from modelscope.utils.file_utils import read_file | |||
from ..core.base_ops import Focus, SPPBottleneck, get_activation | |||
from ..core.repvgg_block import RepVggBlock | |||
class ConvKXBN(nn.Module): | |||
def __init__(self, in_c, out_c, kernel_size, stride): | |||
super(ConvKXBN, self).__init__() | |||
self.conv1 = nn.Conv2d( | |||
in_c, | |||
out_c, | |||
kernel_size, | |||
stride, (kernel_size - 1) // 2, | |||
groups=1, | |||
bias=False) | |||
self.bn1 = nn.BatchNorm2d(out_c) | |||
def forward(self, x): | |||
return self.bn1(self.conv1(x)) | |||
class ConvKXBNRELU(nn.Module): | |||
def __init__(self, in_c, out_c, kernel_size, stride, act='silu'): | |||
super(ConvKXBNRELU, self).__init__() | |||
self.conv = ConvKXBN(in_c, out_c, kernel_size, stride) | |||
if act is None: | |||
self.activation_function = torch.relu | |||
else: | |||
self.activation_function = get_activation(act) | |||
def forward(self, x): | |||
output = self.conv(x) | |||
return self.activation_function(output) | |||
class ResConvK1KX(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
force_resproj=False, | |||
act='silu', | |||
reparam=False): | |||
super(ResConvK1KX, self).__init__() | |||
self.stride = stride | |||
self.conv1 = ConvKXBN(in_c, btn_c, 1, 1) | |||
if not reparam: | |||
self.conv2 = ConvKXBN(btn_c, out_c, 3, stride) | |||
else: | |||
self.conv2 = RepVggBlock( | |||
btn_c, out_c, kernel_size, stride, act='identity') | |||
if act is None: | |||
self.activation_function = torch.relu | |||
else: | |||
self.activation_function = get_activation(act) | |||
if stride == 2: | |||
self.residual_downsample = nn.AvgPool2d(kernel_size=2, stride=2) | |||
else: | |||
self.residual_downsample = nn.Identity() | |||
if in_c != out_c or force_resproj: | |||
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1) | |||
else: | |||
self.residual_proj = nn.Identity() | |||
def forward(self, x): | |||
if self.stride != 2: | |||
reslink = self.residual_downsample(x) | |||
reslink = self.residual_proj(reslink) | |||
output = x | |||
output = self.conv1(output) | |||
output = self.activation_function(output) | |||
output = self.conv2(output) | |||
if self.stride != 2: | |||
output = output + reslink | |||
output = self.activation_function(output) | |||
return output | |||
class SuperResConvK1KX(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
num_blocks, | |||
with_spp=False, | |||
act='silu', | |||
reparam=False): | |||
super(SuperResConvK1KX, self).__init__() | |||
if act is None: | |||
self.act = torch.relu | |||
else: | |||
self.act = get_activation(act) | |||
self.block_list = nn.ModuleList() | |||
for block_id in range(num_blocks): | |||
if block_id == 0: | |||
in_channels = in_c | |||
out_channels = out_c | |||
this_stride = stride | |||
force_resproj = False  # not needed when used as part of a CSPLayer | |||
this_kernel_size = kernel_size | |||
else: | |||
in_channels = out_c | |||
out_channels = out_c | |||
this_stride = 1 | |||
force_resproj = False | |||
this_kernel_size = kernel_size | |||
the_block = ResConvK1KX( | |||
in_channels, | |||
out_channels, | |||
btn_c, | |||
this_kernel_size, | |||
this_stride, | |||
force_resproj, | |||
act=act, | |||
reparam=reparam) | |||
self.block_list.append(the_block) | |||
if block_id == 0 and with_spp: | |||
self.block_list.append( | |||
SPPBottleneck(out_channels, out_channels)) | |||
def forward(self, x): | |||
output = x | |||
for block in self.block_list: | |||
output = block(output) | |||
return output | |||
class ResConvKXKX(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
force_resproj=False, | |||
act='silu'): | |||
super(ResConvKXKX, self).__init__() | |||
self.stride = stride | |||
if self.stride == 2: | |||
self.downsampler = ConvKXBNRELU(in_c, out_c, 3, 2, act=act) | |||
else: | |||
self.conv1 = ConvKXBN(in_c, btn_c, kernel_size, 1) | |||
self.conv2 = RepVggBlock( | |||
btn_c, out_c, kernel_size, stride, act='identity') | |||
if act is None: | |||
self.activation_function = torch.relu | |||
else: | |||
self.activation_function = get_activation(act) | |||
if stride == 2: | |||
self.residual_downsample = nn.AvgPool2d( | |||
kernel_size=2, stride=2) | |||
else: | |||
self.residual_downsample = nn.Identity() | |||
if in_c != out_c or force_resproj: | |||
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1) | |||
else: | |||
self.residual_proj = nn.Identity() | |||
def forward(self, x): | |||
if self.stride == 2: | |||
return self.downsampler(x) | |||
reslink = self.residual_downsample(x) | |||
reslink = self.residual_proj(reslink) | |||
output = x | |||
output = self.conv1(output) | |||
output = self.activation_function(output) | |||
output = self.conv2(output) | |||
output = output + reslink | |||
output = self.activation_function(output) | |||
return output | |||
class SuperResConvKXKX(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
num_blocks, | |||
with_spp=False, | |||
act='silu'): | |||
super(SuperResConvKXKX, self).__init__() | |||
if act is None: | |||
self.act = torch.relu | |||
else: | |||
self.act = get_activation(act) | |||
self.block_list = nn.ModuleList() | |||
for block_id in range(num_blocks): | |||
if block_id == 0: | |||
in_channels = in_c | |||
out_channels = out_c | |||
this_stride = stride | |||
force_resproj = False  # not needed when used as part of a CSPLayer | |||
this_kernel_size = kernel_size | |||
else: | |||
in_channels = out_c | |||
out_channels = out_c | |||
this_stride = 1 | |||
force_resproj = False | |||
this_kernel_size = kernel_size | |||
the_block = ResConvKXKX( | |||
in_channels, | |||
out_channels, | |||
btn_c, | |||
this_kernel_size, | |||
this_stride, | |||
force_resproj, | |||
act=act) | |||
self.block_list.append(the_block) | |||
if block_id == 0 and with_spp: | |||
self.block_list.append( | |||
SPPBottleneck(out_channels, out_channels)) | |||
def forward(self, x): | |||
output = x | |||
for block in self.block_list: | |||
output = block(output) | |||
return output | |||
class TinyNAS(nn.Module): | |||
def __init__(self, | |||
structure_info=None, | |||
out_indices=[0, 1, 2, 4, 5], | |||
out_channels=[None, None, 128, 256, 512], | |||
with_spp=False, | |||
use_focus=False, | |||
need_conv1=True, | |||
act='silu', | |||
reparam=False): | |||
super(TinyNAS, self).__init__() | |||
assert len(out_indices) == len(out_channels) | |||
self.out_indices = out_indices | |||
self.need_conv1 = need_conv1 | |||
self.block_list = nn.ModuleList() | |||
if need_conv1: | |||
self.conv1_list = nn.ModuleList() | |||
for idx, block_info in enumerate(structure_info): | |||
the_block_class = block_info['class'] | |||
if the_block_class == 'ConvKXBNRELU': | |||
if use_focus: | |||
the_block = Focus( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['k'], | |||
act=act) | |||
else: | |||
the_block = ConvKXBNRELU( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['k'], | |||
block_info['s'], | |||
act=act) | |||
self.block_list.append(the_block) | |||
elif the_block_class == 'SuperResConvK1KX': | |||
spp = with_spp if idx == len(structure_info) - 1 else False | |||
the_block = SuperResConvK1KX( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['btn'], | |||
block_info['k'], | |||
block_info['s'], | |||
block_info['L'], | |||
spp, | |||
act=act, | |||
reparam=reparam) | |||
self.block_list.append(the_block) | |||
elif the_block_class == 'SuperResConvKXKX': | |||
spp = with_spp if idx == len(structure_info) - 1 else False | |||
the_block = SuperResConvKXKX( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['btn'], | |||
block_info['k'], | |||
block_info['s'], | |||
block_info['L'], | |||
spp, | |||
act=act) | |||
self.block_list.append(the_block) | |||
if need_conv1: | |||
if idx in self.out_indices and out_channels[ | |||
self.out_indices.index(idx)] is not None: | |||
self.conv1_list.append( | |||
nn.Conv2d(block_info['out'], | |||
out_channels[self.out_indices.index(idx)], | |||
1)) | |||
else: | |||
self.conv1_list.append(None) | |||
def init_weights(self, pretrain=None): | |||
pass | |||
def forward(self, x): | |||
output = x | |||
stage_feature_list = [] | |||
for idx, block in enumerate(self.block_list): | |||
output = block(output) | |||
if idx in self.out_indices: | |||
if self.need_conv1 and self.conv1_list[idx] is not None: | |||
true_out = self.conv1_list[idx](output) | |||
stage_feature_list.append(true_out) | |||
else: | |||
stage_feature_list.append(output) | |||
return stage_feature_list | |||
def load_tinynas_net(backbone_cfg): | |||
# parse the master-net structure file and build the TinyNAS backbone | |||
import ast | |||
net_structure_str = read_file(backbone_cfg.structure_file) | |||
struct_str = ''.join([x.strip() for x in net_structure_str]) | |||
struct_info = ast.literal_eval(struct_str) | |||
for layer in struct_info: | |||
if 'nbitsA' in layer: | |||
del layer['nbitsA'] | |||
if 'nbitsW' in layer: | |||
del layer['nbitsW'] | |||
model = TinyNAS( | |||
structure_info=struct_info, | |||
out_indices=backbone_cfg.out_indices, | |||
out_channels=backbone_cfg.out_channels, | |||
with_spp=backbone_cfg.with_spp, | |||
use_focus=backbone_cfg.use_focus, | |||
act=backbone_cfg.act, | |||
need_conv1=backbone_cfg.need_conv1, | |||
reparam=backbone_cfg.reparam) | |||
return model |
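`load_tinynas_net` expects the structure file to hold a Python literal: a list of block dicts with the keys consumed by `TinyNAS` above (`class`, `in`, `out`, `k`, `s`, plus `btn` and `L` for the super-res blocks); the quantization fields `nbitsA`/`nbitsW` are discarded if present. A sketch of parsing such a file (the concrete block values are made up):

```python
import ast

net_structure_str = """[
    {'class': 'ConvKXBNRELU', 'in': 3, 'out': 32, 'k': 3, 's': 2},
    {'class': 'SuperResConvK1KX', 'in': 32, 'out': 64, 'btn': 32,
     'k': 3, 's': 2, 'L': 1, 'nbitsA': 8, 'nbitsW': 8},
]"""
struct_info = ast.literal_eval(net_structure_str)
for layer in struct_info:
    layer.pop('nbitsA', None)  # quantization metadata, unused at inference
    layer.pop('nbitsW', None)
print([layer['class'] for layer in struct_info])
# ['ConvKXBNRELU', 'SuperResConvK1KX']
```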
@@ -0,0 +1,295 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors, and available | |||
# at https://github.com/tinyvision/damo-yolo. | |||
import torch | |||
import torch.nn as nn | |||
from modelscope.models.cv.tinynas_detection.core.ops import (Focus, RepConv, | |||
SPPBottleneck, | |||
get_activation) | |||
from modelscope.utils.file_utils import read_file | |||
class ConvKXBN(nn.Module): | |||
def __init__(self, in_c, out_c, kernel_size, stride): | |||
super(ConvKXBN, self).__init__() | |||
self.conv1 = nn.Conv2d( | |||
in_c, | |||
out_c, | |||
kernel_size, | |||
stride, (kernel_size - 1) // 2, | |||
groups=1, | |||
bias=False) | |||
self.bn1 = nn.BatchNorm2d(out_c) | |||
def forward(self, x): | |||
return self.bn1(self.conv1(x)) | |||
class ConvKXBNRELU(nn.Module): | |||
def __init__(self, in_c, out_c, kernel_size, stride, act='silu'): | |||
super(ConvKXBNRELU, self).__init__() | |||
self.conv = ConvKXBN(in_c, out_c, kernel_size, stride) | |||
if act is None: | |||
self.activation_function = torch.relu | |||
else: | |||
self.activation_function = get_activation(act) | |||
def forward(self, x): | |||
output = self.conv(x) | |||
return self.activation_function(output) | |||
class ResConvBlock(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
act='silu', | |||
reparam=False, | |||
block_type='k1kx'): | |||
super(ResConvBlock, self).__init__() | |||
self.stride = stride | |||
if block_type == 'k1kx': | |||
self.conv1 = ConvKXBN(in_c, btn_c, kernel_size=1, stride=1) | |||
else: | |||
self.conv1 = ConvKXBN( | |||
in_c, btn_c, kernel_size=kernel_size, stride=1) | |||
if not reparam: | |||
self.conv2 = ConvKXBN(btn_c, out_c, kernel_size, stride) | |||
else: | |||
self.conv2 = RepConv( | |||
btn_c, out_c, kernel_size, stride, act='identity') | |||
self.activation_function = get_activation(act) | |||
if in_c != out_c and stride != 2: | |||
self.residual_proj = ConvKXBN(in_c, out_c, kernel_size=1, stride=1) | |||
else: | |||
self.residual_proj = None | |||
def forward(self, x): | |||
if self.residual_proj is not None: | |||
reslink = self.residual_proj(x) | |||
else: | |||
reslink = x | |||
x = self.conv1(x) | |||
x = self.activation_function(x) | |||
x = self.conv2(x) | |||
if self.stride != 2: | |||
x = x + reslink | |||
x = self.activation_function(x) | |||
return x | |||
class CSPStem(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
stride, | |||
kernel_size, | |||
num_blocks, | |||
act='silu', | |||
reparam=False, | |||
block_type='k1kx'): | |||
super(CSPStem, self).__init__() | |||
self.in_channels = in_c | |||
self.out_channels = out_c | |||
self.stride = stride | |||
if self.stride == 2: | |||
self.num_blocks = num_blocks - 1 | |||
else: | |||
self.num_blocks = num_blocks | |||
self.kernel_size = kernel_size | |||
self.block_type = block_type | |||
out_c = out_c // 2 | |||
if act is None: | |||
self.act = torch.relu | |||
else: | |||
self.act = get_activation(act) | |||
self.block_list = nn.ModuleList() | |||
for block_id in range(self.num_blocks): | |||
if self.stride == 1 and block_id == 0: | |||
in_c = in_c // 2 | |||
else: | |||
in_c = out_c | |||
the_block = ResConvBlock( | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride=1, | |||
act=act, | |||
reparam=reparam, | |||
block_type=block_type) | |||
self.block_list.append(the_block) | |||
def forward(self, x): | |||
output = x | |||
for block in self.block_list: | |||
output = block(output) | |||
return output | |||
class TinyNAS(nn.Module): | |||
def __init__(self, | |||
structure_info=None, | |||
out_indices=[2, 3, 4], | |||
with_spp=False, | |||
use_focus=False, | |||
act='silu', | |||
reparam=False): | |||
super(TinyNAS, self).__init__() | |||
self.out_indices = out_indices | |||
self.block_list = nn.ModuleList() | |||
self.stride_list = [] | |||
for idx, block_info in enumerate(structure_info): | |||
the_block_class = block_info['class'] | |||
if the_block_class == 'ConvKXBNRELU': | |||
if use_focus and idx == 0: | |||
the_block = Focus( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['k'], | |||
act=act) | |||
else: | |||
the_block = ConvKXBNRELU( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['k'], | |||
block_info['s'], | |||
act=act) | |||
elif the_block_class == 'SuperResConvK1KX': | |||
the_block = CSPStem( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['btn'], | |||
block_info['s'], | |||
block_info['k'], | |||
block_info['L'], | |||
act=act, | |||
reparam=reparam, | |||
block_type='k1kx') | |||
elif the_block_class == 'SuperResConvKXKX': | |||
the_block = CSPStem( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['btn'], | |||
block_info['s'], | |||
block_info['k'], | |||
block_info['L'], | |||
act=act, | |||
reparam=reparam, | |||
block_type='kxkx') | |||
else: | |||
raise NotImplementedError | |||
self.block_list.append(the_block) | |||
self.csp_stage = nn.ModuleList() | |||
self.csp_stage.append(self.block_list[0]) | |||
self.csp_stage.append(CSPWrapper(self.block_list[1])) | |||
self.csp_stage.append(CSPWrapper(self.block_list[2])) | |||
self.csp_stage.append( | |||
CSPWrapper((self.block_list[3], self.block_list[4]))) | |||
self.csp_stage.append( | |||
CSPWrapper(self.block_list[5], with_spp=with_spp)) | |||
del self.block_list | |||
def init_weights(self, pretrain=None): | |||
pass | |||
def forward(self, x): | |||
output = x | |||
stage_feature_list = [] | |||
for idx, block in enumerate(self.csp_stage): | |||
output = block(output) | |||
if idx in self.out_indices: | |||
stage_feature_list.append(output) | |||
return stage_feature_list | |||
class CSPWrapper(nn.Module): | |||
def __init__(self, convstem, act='relu', reparam=False, with_spp=False): | |||
super(CSPWrapper, self).__init__() | |||
self.with_spp = with_spp | |||
if isinstance(convstem, tuple): | |||
in_c = convstem[0].in_channels | |||
out_c = convstem[-1].out_channels | |||
hidden_dim = convstem[0].out_channels // 2 | |||
_convstem = nn.ModuleList() | |||
for modulelist in convstem: | |||
for layer in modulelist.block_list: | |||
_convstem.append(layer) | |||
else: | |||
in_c = convstem.in_channels | |||
out_c = convstem.out_channels | |||
hidden_dim = out_c // 2 | |||
_convstem = convstem.block_list | |||
self.convstem = nn.ModuleList() | |||
for layer in _convstem: | |||
self.convstem.append(layer) | |||
self.act = get_activation(act) | |||
self.downsampler = ConvKXBNRELU( | |||
in_c, hidden_dim * 2, 3, 2, act=self.act) | |||
if self.with_spp: | |||
self.spp = SPPBottleneck(hidden_dim * 2, hidden_dim * 2) | |||
if len(self.convstem) > 0: | |||
self.conv_start = ConvKXBNRELU( | |||
hidden_dim * 2, hidden_dim, 1, 1, act=self.act) | |||
self.conv_shortcut = ConvKXBNRELU( | |||
hidden_dim * 2, out_c // 2, 1, 1, act=self.act) | |||
self.conv_fuse = ConvKXBNRELU(out_c, out_c, 1, 1, act=self.act) | |||
def forward(self, x): | |||
x = self.downsampler(x) | |||
if self.with_spp: | |||
x = self.spp(x) | |||
if len(self.convstem) > 0: | |||
shortcut = self.conv_shortcut(x) | |||
x = self.conv_start(x) | |||
for block in self.convstem: | |||
x = block(x) | |||
x = torch.cat((x, shortcut), dim=1) | |||
x = self.conv_fuse(x) | |||
return x | |||
def load_tinynas_net(backbone_cfg): | |||
# parse the master-net structure file and build the TinyNAS backbone | |||
import ast | |||
net_structure_str = read_file(backbone_cfg.structure_file) | |||
struct_str = ''.join([x.strip() for x in net_structure_str]) | |||
struct_info = ast.literal_eval(struct_str) | |||
for layer in struct_info: | |||
if 'nbitsA' in layer: | |||
del layer['nbitsA'] | |||
if 'nbitsW' in layer: | |||
del layer['nbitsW'] | |||
model = TinyNAS( | |||
structure_info=struct_info, | |||
out_indices=backbone_cfg.out_indices, | |||
with_spp=backbone_cfg.with_spp, | |||
use_focus=backbone_cfg.use_focus, | |||
act=backbone_cfg.act, | |||
reparam=backbone_cfg.reparam) | |||
return model |
@@ -0,0 +1,238 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors, and available | |||
# at https://github.com/tinyvision/damo-yolo. | |||
import torch | |||
import torch.nn as nn | |||
from modelscope.models.cv.tinynas_detection.core.ops import (Focus, RepConv, | |||
SPPBottleneck, | |||
get_activation) | |||
from modelscope.utils.file_utils import read_file | |||
class ConvKXBN(nn.Module): | |||
def __init__(self, in_c, out_c, kernel_size, stride): | |||
super(ConvKXBN, self).__init__() | |||
self.conv1 = nn.Conv2d( | |||
in_c, | |||
out_c, | |||
kernel_size, | |||
stride, (kernel_size - 1) // 2, | |||
groups=1, | |||
bias=False) | |||
self.bn1 = nn.BatchNorm2d(out_c) | |||
def forward(self, x): | |||
return self.bn1(self.conv1(x)) | |||
class ConvKXBNRELU(nn.Module): | |||
def __init__(self, in_c, out_c, kernel_size, stride, act='silu'): | |||
super(ConvKXBNRELU, self).__init__() | |||
self.conv = ConvKXBN(in_c, out_c, kernel_size, stride) | |||
if act is None: | |||
self.activation_function = torch.relu | |||
else: | |||
self.activation_function = get_activation(act) | |||
def forward(self, x): | |||
output = self.conv(x) | |||
return self.activation_function(output) | |||
class ResConvBlock(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
act='silu', | |||
reparam=False, | |||
block_type='k1kx'): | |||
super(ResConvBlock, self).__init__() | |||
self.stride = stride | |||
if block_type == 'k1kx': | |||
self.conv1 = ConvKXBN(in_c, btn_c, kernel_size=1, stride=1) | |||
else: | |||
self.conv1 = ConvKXBN( | |||
in_c, btn_c, kernel_size=kernel_size, stride=1) | |||
if not reparam: | |||
self.conv2 = ConvKXBN(btn_c, out_c, kernel_size, stride) | |||
else: | |||
self.conv2 = RepConv( | |||
btn_c, out_c, kernel_size, stride, act='identity') | |||
self.activation_function = get_activation(act) | |||
if in_c != out_c and stride != 2: | |||
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1) | |||
else: | |||
self.residual_proj = None | |||
def forward(self, x): | |||
if self.residual_proj is not None: | |||
reslink = self.residual_proj(x) | |||
else: | |||
reslink = x | |||
x = self.conv1(x) | |||
x = self.activation_function(x) | |||
x = self.conv2(x) | |||
if self.stride != 2: | |||
x = x + reslink | |||
x = self.activation_function(x) | |||
return x | |||
class SuperResStem(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
num_blocks, | |||
with_spp=False, | |||
act='silu', | |||
reparam=False, | |||
block_type='k1kx'): | |||
super(SuperResStem, self).__init__() | |||
if act is None: | |||
self.act = torch.relu | |||
else: | |||
self.act = get_activation(act) | |||
self.block_list = nn.ModuleList() | |||
for block_id in range(num_blocks): | |||
if block_id == 0: | |||
in_channels = in_c | |||
out_channels = out_c | |||
this_stride = stride | |||
this_kernel_size = kernel_size | |||
else: | |||
in_channels = out_c | |||
out_channels = out_c | |||
this_stride = 1 | |||
this_kernel_size = kernel_size | |||
the_block = ResConvBlock( | |||
in_channels, | |||
out_channels, | |||
btn_c, | |||
this_kernel_size, | |||
this_stride, | |||
act=act, | |||
reparam=reparam, | |||
block_type=block_type) | |||
self.block_list.append(the_block) | |||
if block_id == 0 and with_spp: | |||
self.block_list.append( | |||
SPPBottleneck(out_channels, out_channels)) | |||
def forward(self, x): | |||
output = x | |||
for block in self.block_list: | |||
output = block(output) | |||
return output | |||
class TinyNAS(nn.Module): | |||
def __init__(self, | |||
structure_info=None, | |||
out_indices=[2, 4, 5], | |||
with_spp=False, | |||
use_focus=False, | |||
act='silu', | |||
reparam=False): | |||
super(TinyNAS, self).__init__() | |||
self.out_indices = out_indices | |||
self.block_list = nn.ModuleList() | |||
for idx, block_info in enumerate(structure_info): | |||
the_block_class = block_info['class'] | |||
if the_block_class == 'ConvKXBNRELU': | |||
if use_focus: | |||
the_block = Focus( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['k'], | |||
act=act) | |||
else: | |||
the_block = ConvKXBNRELU( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['k'], | |||
block_info['s'], | |||
act=act) | |||
self.block_list.append(the_block) | |||
elif the_block_class == 'SuperResConvK1KX': | |||
spp = with_spp if idx == len(structure_info) - 1 else False | |||
the_block = SuperResStem( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['btn'], | |||
block_info['k'], | |||
block_info['s'], | |||
block_info['L'], | |||
spp, | |||
act=act, | |||
reparam=reparam, | |||
block_type='k1kx') | |||
self.block_list.append(the_block) | |||
elif the_block_class == 'SuperResConvKXKX': | |||
spp = with_spp if idx == len(structure_info) - 1 else False | |||
the_block = SuperResStem( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['btn'], | |||
block_info['k'], | |||
block_info['s'], | |||
block_info['L'], | |||
spp, | |||
act=act, | |||
reparam=reparam, | |||
block_type='kxkx') | |||
self.block_list.append(the_block) | |||
else: | |||
raise NotImplementedError | |||
def init_weights(self, pretrain=None): | |||
pass | |||
def forward(self, x): | |||
output = x | |||
stage_feature_list = [] | |||
for idx, block in enumerate(self.block_list): | |||
output = block(output) | |||
if idx in self.out_indices: | |||
stage_feature_list.append(output) | |||
return stage_feature_list | |||
def load_tinynas_net(backbone_cfg): | |||
# parse the master-net structure file and build the TinyNAS backbone | |||
import ast | |||
net_structure_str = read_file(backbone_cfg.structure_file) | |||
struct_str = ''.join([x.strip() for x in net_structure_str]) | |||
struct_info = ast.literal_eval(struct_str) | |||
for layer in struct_info: | |||
if 'nbitsA' in layer: | |||
del layer['nbitsA'] | |||
if 'nbitsW' in layer: | |||
del layer['nbitsW'] | |||
model = TinyNAS( | |||
structure_info=struct_info, | |||
out_indices=backbone_cfg.out_indices, | |||
with_spp=backbone_cfg.with_spp, | |||
use_focus=backbone_cfg.use_focus, | |||
act=backbone_cfg.act, | |||
reparam=backbone_cfg.reparam) | |||
return model |
@@ -1,2 +1,2 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. |
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import math | |||
import torch | |||
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import numpy as np | |||
import torch | |||
@@ -0,0 +1,435 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
class SiLU(nn.Module): | |||
"""export-friendly version of nn.SiLU()""" | |||
@staticmethod | |||
def forward(x): | |||
return x * torch.sigmoid(x) | |||
class Swish(nn.Module): | |||
def __init__(self, inplace=True): | |||
super(Swish, self).__init__() | |||
self.inplace = inplace | |||
def forward(self, x): | |||
    if self.inplace: | |||
        x.mul_(torch.sigmoid(x)) | |||
        return x | |||
    else: | |||
        return x * torch.sigmoid(x) | |||
def get_activation(name='silu', inplace=True): | |||
if name is None: | |||
return nn.Identity() | |||
if isinstance(name, str): | |||
if name == 'silu': | |||
module = nn.SiLU(inplace=inplace) | |||
elif name == 'relu': | |||
module = nn.ReLU(inplace=inplace) | |||
elif name == 'lrelu': | |||
module = nn.LeakyReLU(0.1, inplace=inplace) | |||
elif name == 'swish': | |||
module = Swish(inplace=inplace) | |||
elif name == 'hardsigmoid': | |||
module = nn.Hardsigmoid(inplace=inplace) | |||
elif name == 'identity': | |||
module = nn.Identity() | |||
else: | |||
raise AttributeError('Unsupported act type: {}'.format(name)) | |||
return module | |||
elif isinstance(name, nn.Module): | |||
return name | |||
else: | |||
raise AttributeError('Unsupported act type: {}'.format(name)) | |||
def get_norm(name, out_channels, inplace=True): | |||
if name == 'bn': | |||
module = nn.BatchNorm2d(out_channels) | |||
else: | |||
raise NotImplementedError | |||
return module | |||
class ConvBNAct(nn.Module): | |||
"""A Conv2d -> Batchnorm -> silu/leaky relu block""" | |||
def __init__( | |||
self, | |||
in_channels, | |||
out_channels, | |||
ksize, | |||
stride=1, | |||
groups=1, | |||
bias=False, | |||
act='silu', | |||
norm='bn', | |||
reparam=False, | |||
): | |||
super().__init__() | |||
# same padding | |||
pad = (ksize - 1) // 2 | |||
self.conv = nn.Conv2d( | |||
in_channels, | |||
out_channels, | |||
kernel_size=ksize, | |||
stride=stride, | |||
padding=pad, | |||
groups=groups, | |||
bias=bias, | |||
) | |||
if norm is not None: | |||
self.bn = get_norm(norm, out_channels, inplace=True) | |||
if act is not None: | |||
self.act = get_activation(act, inplace=True) | |||
self.with_norm = norm is not None | |||
self.with_act = act is not None | |||
def forward(self, x): | |||
x = self.conv(x) | |||
if self.with_norm: | |||
x = self.bn(x) | |||
if self.with_act: | |||
x = self.act(x) | |||
return x | |||
def fuseforward(self, x): | |||
return self.act(self.conv(x)) | |||
class SPPBottleneck(nn.Module): | |||
"""Spatial pyramid pooling layer used in YOLOv3-SPP""" | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
kernel_sizes=(5, 9, 13), | |||
activation='silu'): | |||
super().__init__() | |||
hidden_channels = in_channels // 2 | |||
self.conv1 = ConvBNAct( | |||
in_channels, hidden_channels, 1, stride=1, act=activation) | |||
self.m = nn.ModuleList([ | |||
nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) | |||
for ks in kernel_sizes | |||
]) | |||
conv2_channels = hidden_channels * (len(kernel_sizes) + 1) | |||
self.conv2 = ConvBNAct( | |||
conv2_channels, out_channels, 1, stride=1, act=activation) | |||
def forward(self, x): | |||
x = self.conv1(x) | |||
x = torch.cat([x] + [m(x) for m in self.m], dim=1) | |||
x = self.conv2(x) | |||
return x | |||
class Focus(nn.Module): | |||
"""Focus width and height information into channel space.""" | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
ksize=1, | |||
stride=1, | |||
act='silu'): | |||
super().__init__() | |||
self.conv = ConvBNAct( | |||
in_channels * 4, out_channels, ksize, stride, act=act) | |||
def forward(self, x): | |||
# shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) | |||
patch_top_left = x[..., ::2, ::2] | |||
patch_top_right = x[..., ::2, 1::2] | |||
patch_bot_left = x[..., 1::2, ::2] | |||
patch_bot_right = x[..., 1::2, 1::2] | |||
x = torch.cat( | |||
( | |||
patch_top_left, | |||
patch_bot_left, | |||
patch_top_right, | |||
patch_bot_right, | |||
), | |||
dim=1, | |||
) | |||
return self.conv(x) | |||
class BasicBlock_3x3_Reverse(nn.Module): | |||
def __init__(self, | |||
ch_in, | |||
ch_hidden_ratio, | |||
ch_out, | |||
act='relu', | |||
shortcut=True): | |||
super(BasicBlock_3x3_Reverse, self).__init__() | |||
assert ch_in == ch_out | |||
ch_hidden = int(ch_in * ch_hidden_ratio) | |||
self.conv1 = ConvBNAct(ch_hidden, ch_out, 3, stride=1, act=act) | |||
self.conv2 = RepConv(ch_in, ch_hidden, 3, stride=1, act=act) | |||
self.shortcut = shortcut | |||
def forward(self, x): | |||
y = self.conv2(x) | |||
y = self.conv1(y) | |||
if self.shortcut: | |||
return x + y | |||
else: | |||
return y | |||
class SPP(nn.Module): | |||
def __init__( | |||
self, | |||
ch_in, | |||
ch_out, | |||
k, | |||
pool_size, | |||
act='swish', | |||
): | |||
super(SPP, self).__init__() | |||
self.pool = [] | |||
for i, size in enumerate(pool_size): | |||
pool = nn.MaxPool2d( | |||
kernel_size=size, stride=1, padding=size // 2, ceil_mode=False) | |||
self.add_module('pool{}'.format(i), pool) | |||
self.pool.append(pool) | |||
self.conv = ConvBNAct(ch_in, ch_out, k, act=act) | |||
def forward(self, x): | |||
outs = [x] | |||
for pool in self.pool: | |||
outs.append(pool(x)) | |||
y = torch.cat(outs, dim=1) | |||
y = self.conv(y) | |||
return y | |||
class CSPStage(nn.Module): | |||
def __init__(self, | |||
block_fn, | |||
ch_in, | |||
ch_hidden_ratio, | |||
ch_out, | |||
n, | |||
act='swish', | |||
spp=False): | |||
super(CSPStage, self).__init__() | |||
split_ratio = 2 | |||
ch_first = int(ch_out // split_ratio) | |||
ch_mid = int(ch_out - ch_first) | |||
self.conv1 = ConvBNAct(ch_in, ch_first, 1, act=act) | |||
self.conv2 = ConvBNAct(ch_in, ch_mid, 1, act=act) | |||
self.convs = nn.Sequential() | |||
next_ch_in = ch_mid | |||
for i in range(n): | |||
if block_fn == 'BasicBlock_3x3_Reverse': | |||
self.convs.add_module( | |||
str(i), | |||
BasicBlock_3x3_Reverse( | |||
next_ch_in, | |||
ch_hidden_ratio, | |||
ch_mid, | |||
act=act, | |||
shortcut=True)) | |||
else: | |||
raise NotImplementedError | |||
if i == (n - 1) // 2 and spp: | |||
self.convs.add_module( | |||
'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act)) | |||
next_ch_in = ch_mid | |||
self.conv3 = ConvBNAct(ch_mid * n + ch_first, ch_out, 1, act=act) | |||
def forward(self, x): | |||
y1 = self.conv1(x) | |||
y2 = self.conv2(x) | |||
mid_out = [y1] | |||
for conv in self.convs: | |||
y2 = conv(y2) | |||
mid_out.append(y2) | |||
y = torch.cat(mid_out, dim=1) | |||
y = self.conv3(y) | |||
return y | |||
def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1): | |||
'''Basic cell for rep-style block, including conv and bn''' | |||
result = nn.Sequential() | |||
result.add_module( | |||
'conv', | |||
nn.Conv2d( | |||
in_channels=in_channels, | |||
out_channels=out_channels, | |||
kernel_size=kernel_size, | |||
stride=stride, | |||
padding=padding, | |||
groups=groups, | |||
bias=False)) | |||
result.add_module('bn', nn.BatchNorm2d(num_features=out_channels)) | |||
return result | |||
class RepConv(nn.Module): | |||
'''RepConv is a basic rep-style block with separate training-time and deploy-time structures. | |||
Code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py | |||
''' | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
kernel_size=3, | |||
stride=1, | |||
padding=1, | |||
dilation=1, | |||
groups=1, | |||
padding_mode='zeros', | |||
deploy=False, | |||
act='relu', | |||
norm=None): | |||
super(RepConv, self).__init__() | |||
self.deploy = deploy | |||
self.groups = groups | |||
self.in_channels = in_channels | |||
self.out_channels = out_channels | |||
assert kernel_size == 3 | |||
assert padding == 1 | |||
padding_11 = padding - kernel_size // 2 | |||
if isinstance(act, str): | |||
self.nonlinearity = get_activation(act) | |||
else: | |||
self.nonlinearity = act | |||
if deploy: | |||
self.rbr_reparam = nn.Conv2d( | |||
in_channels=in_channels, | |||
out_channels=out_channels, | |||
kernel_size=kernel_size, | |||
stride=stride, | |||
padding=padding, | |||
dilation=dilation, | |||
groups=groups, | |||
bias=True, | |||
padding_mode=padding_mode) | |||
else: | |||
self.rbr_identity = None | |||
self.rbr_dense = conv_bn( | |||
in_channels=in_channels, | |||
out_channels=out_channels, | |||
kernel_size=kernel_size, | |||
stride=stride, | |||
padding=padding, | |||
groups=groups) | |||
self.rbr_1x1 = conv_bn( | |||
in_channels=in_channels, | |||
out_channels=out_channels, | |||
kernel_size=1, | |||
stride=stride, | |||
padding=padding_11, | |||
groups=groups) | |||
def forward(self, inputs): | |||
'''Forward process''' | |||
if hasattr(self, 'rbr_reparam'): | |||
return self.nonlinearity(self.rbr_reparam(inputs)) | |||
if self.rbr_identity is None: | |||
id_out = 0 | |||
else: | |||
id_out = self.rbr_identity(inputs) | |||
return self.nonlinearity( | |||
self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out) | |||
def get_equivalent_kernel_bias(self): | |||
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) | |||
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) | |||
kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) | |||
return kernel3x3 + self._pad_1x1_to_3x3_tensor( | |||
kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid | |||
def _pad_1x1_to_3x3_tensor(self, kernel1x1): | |||
if kernel1x1 is None: | |||
return 0 | |||
else: | |||
return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) | |||
def _fuse_bn_tensor(self, branch): | |||
if branch is None: | |||
return 0, 0 | |||
if isinstance(branch, nn.Sequential): | |||
kernel = branch.conv.weight | |||
running_mean = branch.bn.running_mean | |||
running_var = branch.bn.running_var | |||
gamma = branch.bn.weight | |||
beta = branch.bn.bias | |||
eps = branch.bn.eps | |||
else: | |||
assert isinstance(branch, nn.BatchNorm2d) | |||
if not hasattr(self, 'id_tensor'): | |||
input_dim = self.in_channels // self.groups | |||
kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), | |||
dtype=np.float32) | |||
for i in range(self.in_channels): | |||
kernel_value[i, i % input_dim, 1, 1] = 1 | |||
self.id_tensor = torch.from_numpy(kernel_value).to( | |||
branch.weight.device) | |||
kernel = self.id_tensor | |||
running_mean = branch.running_mean | |||
running_var = branch.running_var | |||
gamma = branch.weight | |||
beta = branch.bias | |||
eps = branch.eps | |||
std = (running_var + eps).sqrt() | |||
t = (gamma / std).reshape(-1, 1, 1, 1) | |||
return kernel * t, beta - running_mean * gamma / std | |||
def switch_to_deploy(self): | |||
if hasattr(self, 'rbr_reparam'): | |||
return | |||
kernel, bias = self.get_equivalent_kernel_bias() | |||
self.rbr_reparam = nn.Conv2d( | |||
in_channels=self.rbr_dense.conv.in_channels, | |||
out_channels=self.rbr_dense.conv.out_channels, | |||
kernel_size=self.rbr_dense.conv.kernel_size, | |||
stride=self.rbr_dense.conv.stride, | |||
padding=self.rbr_dense.conv.padding, | |||
dilation=self.rbr_dense.conv.dilation, | |||
groups=self.rbr_dense.conv.groups, | |||
bias=True) | |||
self.rbr_reparam.weight.data = kernel | |||
self.rbr_reparam.bias.data = bias | |||
for para in self.parameters(): | |||
para.detach_() | |||
self.__delattr__('rbr_dense') | |||
self.__delattr__('rbr_1x1') | |||
if hasattr(self, 'rbr_identity'): | |||
self.__delattr__('rbr_identity') | |||
if hasattr(self, 'id_tensor'): | |||
self.__delattr__('id_tensor') | |||
self.deploy = True |
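A quick equivalence check for the structural re-parameterization above — a minimal sketch, assuming the enclosing block class is exported as `RepVGGBlock` (the class name and constructor arguments here are illustrative, not confirmed by this diff):

import torch

# Hypothetical instantiation; the actual class/constructor may differ.
block = RepVGGBlock(in_channels=32, out_channels=32, kernel_size=3,
                    stride=1, padding=1, act='relu')
block.eval()
x = torch.randn(1, 32, 56, 56)
with torch.no_grad():
    y_multi = block(x)            # 3x3 + 1x1 (+ identity) branches
    block.switch_to_deploy()      # fuse BN and branches into one conv
    y_fused = block(x)            # single re-parameterized 3x3 conv
assert torch.allclose(y_multi, y_fused, atol=1e-5)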
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import numpy as np | |||
import torch | |||
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import numpy as np | |||
import torch | |||
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import os.path as osp | |||
import pickle | |||
@@ -42,7 +42,7 @@ class SingleStageDetector(TorchModel): | |||
self.conf_thre = config.model.head.nms_conf_thre | |||
self.nms_thre = config.model.head.nms_iou_thre | |||
if self.cfg.model.backbone.name == 'TinyNAS': | |||
if 'TinyNAS' in self.cfg.model.backbone.name: | |||
self.cfg.model.backbone.structure_file = osp.join( | |||
model_dir, self.cfg.model.backbone.structure_file) | |||
self.backbone = build_backbone(self.cfg.model.backbone) | |||
@@ -1,9 +1,10 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import copy | |||
from .gfocal_v2_tiny import GFocalHead_Tiny | |||
from .zero_head import ZeroHead | |||
def build_head(cfg): | |||
@@ -12,5 +13,7 @@ def build_head(cfg): | |||
name = head_cfg.pop('name') | |||
if name == 'GFocalV2': | |||
return GFocalHead_Tiny(**head_cfg) | |||
elif name == 'ZeroHead': | |||
return ZeroHead(**head_cfg) | |||
else: | |||
raise NotImplementedError |
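For reference, a hedged sketch of how the new `ZeroHead` branch is reached (field values are illustrative; `easydict` and an installed `modelscope` are assumed):

from easydict import EasyDict

head_cfg = EasyDict(
    name='ZeroHead',        # dispatch key popped by build_head
    num_classes=80,
    in_channels=[64, 128, 256],
    stacked_convs=0,        # 0 -> feat_channels falls back to in_channels
)
head = build_head(head_cfg)  # returns a ZeroHead instance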
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import functools | |||
from functools import partial | |||
@@ -9,7 +9,8 @@ import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from ..core.base_ops import BaseConv, DWConv | |||
from modelscope.models.cv.tinynas_detection.core.base_ops import (BaseConv, | |||
DWConv) | |||
class Scale(nn.Module): | |||
@@ -0,0 +1,288 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors, and available | |||
# at https://github.com/tinyvision/damo-yolo. | |||
from functools import partial | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from modelscope.models.cv.tinynas_detection.core.ops import ConvBNAct | |||
class Scale(nn.Module): | |||
def __init__(self, scale=1.0): | |||
super(Scale, self).__init__() | |||
self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) | |||
def forward(self, x): | |||
return x * self.scale | |||
def multi_apply(func, *args, **kwargs): | |||
pfunc = partial(func, **kwargs) if kwargs else func | |||
map_results = map(pfunc, *args) | |||
return tuple(map(list, zip(*map_results))) | |||
def distance2bbox(points, distance, max_shape=None): | |||
"""Decode distance prediction to bounding box. | |||
""" | |||
x1 = points[..., 0] - distance[..., 0] | |||
y1 = points[..., 1] - distance[..., 1] | |||
x2 = points[..., 0] + distance[..., 2] | |||
y2 = points[..., 1] + distance[..., 3] | |||
if max_shape is not None: | |||
x1 = x1.clamp(min=0, max=max_shape[1]) | |||
y1 = y1.clamp(min=0, max=max_shape[0]) | |||
x2 = x2.clamp(min=0, max=max_shape[1]) | |||
y2 = y2.clamp(min=0, max=max_shape[0]) | |||
return torch.stack([x1, y1, x2, y2], -1) | |||
def bbox2distance(points, bbox, max_dis=None, eps=0.1): | |||
"""Decode bounding box based on distances. | |||
""" | |||
left = points[:, 0] - bbox[:, 0] | |||
top = points[:, 1] - bbox[:, 1] | |||
right = bbox[:, 2] - points[:, 0] | |||
bottom = bbox[:, 3] - points[:, 1] | |||
if max_dis is not None: | |||
left = left.clamp(min=0, max=max_dis - eps) | |||
top = top.clamp(min=0, max=max_dis - eps) | |||
right = right.clamp(min=0, max=max_dis - eps) | |||
bottom = bottom.clamp(min=0, max=max_dis - eps) | |||
return torch.stack([left, top, right, bottom], -1) | |||
class Integral(nn.Module): | |||
"""A fixed layer for calculating integral result from distribution. | |||
""" | |||
def __init__(self, reg_max=16): | |||
super(Integral, self).__init__() | |||
self.reg_max = reg_max | |||
self.register_buffer('project', | |||
torch.linspace(0, self.reg_max, self.reg_max + 1)) | |||
def forward(self, x): | |||
"""Forward feature from the regression head to get integral result of | |||
bounding box location. | |||
""" | |||
b, hw, _, _ = x.size() | |||
x = x.reshape(b * hw * 4, self.reg_max + 1) | |||
y = self.project.type_as(x).unsqueeze(1) | |||
x = torch.matmul(x, y).reshape(b, hw, 4) | |||
return x | |||
class ZeroHead(nn.Module): | |||
"""Ref to Generalized Focal Loss V2: Learning Reliable Localization Quality | |||
Estimation for Dense Object Detection. | |||
""" | |||
def __init__( | |||
self, | |||
num_classes, | |||
in_channels, | |||
stacked_convs=4, | |||
feat_channels=256, | |||
reg_max=12, | |||
strides=[8, 16, 32], | |||
norm='gn', | |||
act='relu', | |||
nms_conf_thre=0.05, | |||
nms_iou_thre=0.7, | |||
nms=True, | |||
**kwargs): | |||
self.in_channels = in_channels | |||
self.num_classes = num_classes | |||
self.stacked_convs = stacked_convs | |||
self.act = act | |||
self.strides = strides | |||
if stacked_convs == 0: | |||
feat_channels = in_channels | |||
if isinstance(feat_channels, list): | |||
self.feat_channels = feat_channels | |||
else: | |||
self.feat_channels = [feat_channels] * len(self.strides) | |||
# add 1 to keep consistency with former models | |||
self.cls_out_channels = num_classes + 1 | |||
self.reg_max = reg_max | |||
self.nms = nms | |||
self.nms_conf_thre = nms_conf_thre | |||
self.nms_iou_thre = nms_iou_thre | |||
self.feat_size = [torch.zeros(4) for _ in strides] | |||
super(ZeroHead, self).__init__() | |||
self.integral = Integral(self.reg_max) | |||
self._init_layers() | |||
def _build_not_shared_convs(self, in_channel, feat_channels): | |||
cls_convs = nn.ModuleList() | |||
reg_convs = nn.ModuleList() | |||
for i in range(self.stacked_convs): | |||
chn = feat_channels if i > 0 else in_channel | |||
kernel_size = 3 if i > 0 else 1 | |||
cls_convs.append( | |||
ConvBNAct( | |||
chn, | |||
feat_channels, | |||
kernel_size, | |||
stride=1, | |||
groups=1, | |||
norm='bn', | |||
act=self.act)) | |||
reg_convs.append( | |||
ConvBNAct( | |||
chn, | |||
feat_channels, | |||
kernel_size, | |||
stride=1, | |||
groups=1, | |||
norm='bn', | |||
act=self.act)) | |||
return cls_convs, reg_convs | |||
def _init_layers(self): | |||
"""Initialize layers of the head.""" | |||
self.cls_convs = nn.ModuleList() | |||
self.reg_convs = nn.ModuleList() | |||
for i in range(len(self.strides)): | |||
cls_convs, reg_convs = self._build_not_shared_convs( | |||
self.in_channels[i], self.feat_channels[i]) | |||
self.cls_convs.append(cls_convs) | |||
self.reg_convs.append(reg_convs) | |||
self.gfl_cls = nn.ModuleList([ | |||
nn.Conv2d( | |||
self.feat_channels[i], self.cls_out_channels, 3, padding=1) | |||
for i in range(len(self.strides)) | |||
]) | |||
self.gfl_reg = nn.ModuleList([ | |||
nn.Conv2d( | |||
self.feat_channels[i], 4 * (self.reg_max + 1), 3, padding=1) | |||
for i in range(len(self.strides)) | |||
]) | |||
self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) | |||
def forward(self, xin, labels=None, imgs=None, aux_targets=None): | |||
if self.training: | |||
raise NotImplementedError | |||
else: | |||
return self.forward_eval(xin=xin, labels=labels, imgs=imgs) | |||
def forward_eval(self, xin, labels=None, imgs=None): | |||
# prepare priors for label assignment and bbox decode | |||
if self.feat_size[0] != xin[0].shape: | |||
mlvl_priors_list = [ | |||
self.get_single_level_center_priors( | |||
xin[i].shape[0], | |||
xin[i].shape[-2:], | |||
stride, | |||
dtype=torch.float32, | |||
device=xin[0].device) | |||
for i, stride in enumerate(self.strides) | |||
] | |||
self.mlvl_priors = torch.cat(mlvl_priors_list, dim=1) | |||
self.feat_size[0] = xin[0].shape | |||
# forward for bboxes and classification prediction | |||
cls_scores, bbox_preds = multi_apply( | |||
self.forward_single, | |||
xin, | |||
self.cls_convs, | |||
self.reg_convs, | |||
self.gfl_cls, | |||
self.gfl_reg, | |||
self.scales, | |||
) | |||
cls_scores = torch.cat(cls_scores, dim=1)[:, :, :self.num_classes] | |||
bbox_preds = torch.cat(bbox_preds, dim=1) | |||
# batch bbox decode | |||
bbox_preds = self.integral(bbox_preds) * self.mlvl_priors[..., 2, None] | |||
bbox_preds = distance2bbox(self.mlvl_priors[..., :2], bbox_preds) | |||
res = torch.cat([bbox_preds, cls_scores[..., 0:self.num_classes]], | |||
dim=-1) | |||
return res | |||
def forward_single(self, x, cls_convs, reg_convs, gfl_cls, gfl_reg, scale): | |||
"""Forward feature of a single scale level. | |||
""" | |||
cls_feat = x | |||
reg_feat = x | |||
for cls_conv, reg_conv in zip(cls_convs, reg_convs): | |||
cls_feat = cls_conv(cls_feat) | |||
reg_feat = reg_conv(reg_feat) | |||
bbox_pred = scale(gfl_reg(reg_feat)).float() | |||
N, C, H, W = bbox_pred.size() | |||
if self.training: | |||
bbox_before_softmax = bbox_pred.reshape(N, 4, self.reg_max + 1, H, | |||
W) | |||
bbox_before_softmax = bbox_before_softmax.flatten( | |||
start_dim=3).permute(0, 3, 1, 2) | |||
bbox_pred = F.softmax( | |||
bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2) | |||
cls_score = gfl_cls(cls_feat).sigmoid() | |||
cls_score = cls_score.flatten(start_dim=2).permute( | |||
0, 2, 1) # N, h*w, self.num_classes+1 | |||
bbox_pred = bbox_pred.flatten(start_dim=3).permute( | |||
0, 3, 1, 2) # N, h*w, 4, self.reg_max+1 | |||
if self.training: | |||
return cls_score, bbox_pred, bbox_before_softmax | |||
else: | |||
return cls_score, bbox_pred | |||
def get_single_level_center_priors(self, batch_size, featmap_size, stride, | |||
dtype, device): | |||
h, w = featmap_size | |||
x_range = (torch.arange(0, int(w), dtype=dtype, | |||
device=device)) * stride | |||
y_range = (torch.arange(0, int(h), dtype=dtype, | |||
device=device)) * stride | |||
x = x_range.repeat(h, 1) | |||
y = y_range.unsqueeze(-1).repeat(1, w) | |||
y = y.flatten() | |||
x = x.flatten() | |||
strides = x.new_full((x.shape[0], ), stride) | |||
priors = torch.stack([x, y, strides, strides], dim=-1) | |||
return priors.unsqueeze(0).repeat(batch_size, 1, 1) | |||
def sample(self, assign_result, gt_bboxes): | |||
pos_inds = torch.nonzero( | |||
assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() | |||
neg_inds = torch.nonzero( | |||
assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() | |||
pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 | |||
if gt_bboxes.numel() == 0: | |||
# hack for index error case | |||
assert pos_assigned_gt_inds.numel() == 0 | |||
pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4) | |||
else: | |||
if len(gt_bboxes.shape) < 2: | |||
gt_bboxes = gt_bboxes.view(-1, 4) | |||
pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] | |||
return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds |
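A small self-contained check of the decode helpers defined above (only `torch` is assumed): distances round-trip through `distance2bbox` and `bbox2distance`.

import torch

points = torch.tensor([[16.0, 16.0]])          # prior center (x, y)
dist = torch.tensor([[4.0, 4.0, 8.0, 8.0]])    # left, top, right, bottom
box = distance2bbox(points, dist)              # -> [[12., 12., 24., 24.]]
assert torch.allclose(bbox2distance(points, box), dist)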
@@ -1,10 +1,10 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import copy | |||
from .giraffe_fpn import GiraffeNeck | |||
from .giraffe_fpn_v2 import GiraffeNeckV2 | |||
from .giraffe_fpn_btn import GiraffeNeckV2 | |||
def build_neck(cfg): | |||
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import collections | |||
import itertools | |||
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import logging | |||
import math | |||
@@ -15,7 +15,8 @@ from timm import create_model | |||
from timm.models.layers import (Swish, create_conv2d, create_pool2d, | |||
get_act_layer) | |||
from ..core.base_ops import CSPLayer, ShuffleBlock, ShuffleCSPLayer | |||
from modelscope.models.cv.tinynas_detection.core.base_ops import ( | |||
CSPLayer, ShuffleBlock, ShuffleCSPLayer) | |||
from .giraffe_config import get_graph_config | |||
_ACT_LAYER = Swish | |||
@@ -0,0 +1,132 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import torch | |||
import torch.nn as nn | |||
from modelscope.models.cv.tinynas_detection.core.ops import ConvBNAct, CSPStage | |||
class GiraffeNeckV2(nn.Module): | |||
def __init__( | |||
self, | |||
depth=1.0, | |||
hidden_ratio=1.0, | |||
in_features=[2, 3, 4], | |||
in_channels=[256, 512, 1024], | |||
out_channels=[256, 512, 1024], | |||
act='silu', | |||
spp=False, | |||
block_name='BasicBlock', | |||
): | |||
super().__init__() | |||
self.in_features = in_features | |||
self.in_channels = in_channels | |||
self.out_channels = out_channels | |||
Conv = ConvBNAct | |||
self.upsample = nn.Upsample(scale_factor=2, mode='nearest') | |||
# node x3: input x0, x1 | |||
self.bu_conv13 = Conv(in_channels[1], in_channels[1], 3, 2, act=act) | |||
self.merge_3 = CSPStage( | |||
block_name, | |||
in_channels[1] + in_channels[2], | |||
hidden_ratio, | |||
in_channels[2], | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
# node x4: input x1, x2, x3 | |||
self.bu_conv24 = Conv(in_channels[0], in_channels[0], 3, 2, act=act) | |||
self.merge_4 = CSPStage( | |||
block_name, | |||
in_channels[0] + in_channels[1] + in_channels[2], | |||
hidden_ratio, | |||
in_channels[1], | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
# node x5: input x2, x4 | |||
self.merge_5 = CSPStage( | |||
block_name, | |||
in_channels[1] + in_channels[0], | |||
hidden_ratio, | |||
out_channels[0], | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
# node x7: input x4, x5 | |||
self.bu_conv57 = Conv(out_channels[0], out_channels[0], 3, 2, act=act) | |||
self.merge_7 = CSPStage( | |||
block_name, | |||
out_channels[0] + in_channels[1], | |||
hidden_ratio, | |||
out_channels[1], | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
# node x6: input x3, x4, x7 | |||
self.bu_conv46 = Conv(in_channels[1], in_channels[1], 3, 2, act=act) | |||
self.bu_conv76 = Conv(out_channels[1], out_channels[1], 3, 2, act=act) | |||
self.merge_6 = CSPStage( | |||
block_name, | |||
in_channels[1] + out_channels[1] + in_channels[2], | |||
hidden_ratio, | |||
out_channels[2], | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
def init_weights(self): | |||
pass | |||
def forward(self, out_features): | |||
""" | |||
Args: | |||
out_features: multi-level backbone features. | |||
Returns: | |||
Tuple[Tensor]: FPN features. | |||
""" | |||
# backbone | |||
[x2, x1, x0] = out_features | |||
# node x3 | |||
x13 = self.bu_conv13(x1) | |||
x3 = torch.cat([x0, x13], 1) | |||
x3 = self.merge_3(x3) | |||
# node x4 | |||
x34 = self.upsample(x3) | |||
x24 = self.bu_conv24(x2) | |||
x4 = torch.cat([x1, x24, x34], 1) | |||
x4 = self.merge_4(x4) | |||
# node x5 | |||
x45 = self.upsample(x4) | |||
x5 = torch.cat([x2, x45], 1) | |||
x5 = self.merge_5(x5) | |||
# node x8 | |||
# x8 = x5 | |||
# node x7 | |||
x57 = self.bu_conv57(x5) | |||
x7 = torch.cat([x4, x57], 1) | |||
x7 = self.merge_7(x7) | |||
# node x6 | |||
x46 = self.bu_conv46(x4) | |||
x76 = self.bu_conv76(x7) | |||
x6 = torch.cat([x3, x46, x76], 1) | |||
x6 = self.merge_6(x6) | |||
outputs = (x5, x7, x6) | |||
return outputs |
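A hedged shape check for `GiraffeNeckV2` above (an installed `modelscope` providing `ConvBNAct`/`CSPStage` is assumed; sizes are illustrative). The three inputs mimic backbone outputs at strides 8/16/32:

import torch

neck = GiraffeNeckV2(in_channels=[128, 256, 512],
                     out_channels=[128, 256, 512])
feats = [torch.randn(1, 128, 80, 80),   # x2, stride 8
         torch.randn(1, 256, 40, 40),   # x1, stride 16
         torch.randn(1, 512, 20, 20)]   # x0, stride 32
x5, x7, x6 = neck(feats)
# x5: (1, 128, 80, 80), x7: (1, 256, 40, 40), x6: (1, 512, 20, 20)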
@@ -1,200 +0,0 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
import torch | |||
import torch.nn as nn | |||
from ..core.base_ops import BaseConv, CSPLayer, DWConv | |||
from ..core.neck_ops import CSPStage | |||
class GiraffeNeckV2(nn.Module): | |||
def __init__( | |||
self, | |||
depth=1.0, | |||
width=1.0, | |||
in_channels=[256, 512, 1024], | |||
out_channels=[256, 512, 1024], | |||
depthwise=False, | |||
act='silu', | |||
spp=True, | |||
reparam_mode=True, | |||
block_name='BasicBlock', | |||
): | |||
super().__init__() | |||
self.in_channels = in_channels | |||
Conv = DWConv if depthwise else BaseConv | |||
reparam_mode = reparam_mode | |||
self.upsample = nn.Upsample(scale_factor=2, mode='nearest') | |||
# node x3: input x0, x1 | |||
self.bu_conv13 = Conv( | |||
int(in_channels[1] * width), | |||
int(in_channels[1] * width), | |||
3, | |||
2, | |||
act=act) | |||
if reparam_mode: | |||
self.merge_3 = CSPStage( | |||
block_name, | |||
int((in_channels[1] + in_channels[2]) * width), | |||
int(in_channels[2] * width), | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
else: | |||
self.merge_3 = CSPLayer( | |||
int((in_channels[1] + in_channels[2]) * width), | |||
int(in_channels[2] * width), | |||
round(3 * depth), | |||
False, | |||
depthwise=depthwise, | |||
act=act) | |||
# node x4: input x1, x2, x3 | |||
self.bu_conv24 = Conv( | |||
int(in_channels[0] * width), | |||
int(in_channels[0] * width), | |||
3, | |||
2, | |||
act=act) | |||
if reparam_mode: | |||
self.merge_4 = CSPStage( | |||
block_name, | |||
int((in_channels[0] + in_channels[1] + in_channels[2]) | |||
* width), | |||
int(in_channels[1] * width), | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
else: | |||
self.merge_4 = CSPLayer( | |||
int((in_channels[0] + in_channels[1] + in_channels[2]) | |||
* width), | |||
int(in_channels[1] * width), | |||
round(3 * depth), | |||
False, | |||
depthwise=depthwise, | |||
act=act) | |||
# node x5: input x2, x4 | |||
if reparam_mode: | |||
self.merge_5 = CSPStage( | |||
block_name, | |||
int((in_channels[1] + in_channels[0]) * width), | |||
int(out_channels[0] * width), | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
else: | |||
self.merge_5 = CSPLayer( | |||
int((in_channels[1] + in_channels[0]) * width), | |||
int(out_channels[0] * width), | |||
round(3 * depth), | |||
False, | |||
depthwise=depthwise, | |||
act=act) | |||
# node x7: input x4, x5 | |||
self.bu_conv57 = Conv( | |||
int(out_channels[0] * width), | |||
int(out_channels[0] * width), | |||
3, | |||
2, | |||
act=act) | |||
if reparam_mode: | |||
self.merge_7 = CSPStage( | |||
block_name, | |||
int((out_channels[0] + in_channels[1]) * width), | |||
int(out_channels[1] * width), | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
else: | |||
self.merge_7 = CSPLayer( | |||
int((out_channels[0] + in_channels[1]) * width), | |||
int(out_channels[1] * width), | |||
round(3 * depth), | |||
False, | |||
depthwise=depthwise, | |||
act=act) | |||
# node x6: input x3, x4, x7 | |||
self.bu_conv46 = Conv( | |||
int(in_channels[1] * width), | |||
int(in_channels[1] * width), | |||
3, | |||
2, | |||
act=act) | |||
self.bu_conv76 = Conv( | |||
int(out_channels[1] * width), | |||
int(out_channels[1] * width), | |||
3, | |||
2, | |||
act=act) | |||
if reparam_mode: | |||
self.merge_6 = CSPStage( | |||
block_name, | |||
int((in_channels[1] + out_channels[1] + in_channels[2]) | |||
* width), | |||
int(out_channels[2] * width), | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
else: | |||
self.merge_6 = CSPLayer( | |||
int((in_channels[1] + out_channels[1] + in_channels[2]) | |||
* width), | |||
int(out_channels[2] * width), | |||
round(3 * depth), | |||
False, | |||
depthwise=depthwise, | |||
act=act) | |||
def init_weights(self): | |||
pass | |||
def forward(self, out_features): | |||
""" | |||
Args: | |||
inputs: input images. | |||
Returns: | |||
Tuple[Tensor]: FPN feature. | |||
""" | |||
# backbone | |||
[x2, x1, x0] = out_features | |||
# node x3 | |||
x13 = self.bu_conv13(x1) | |||
x3 = torch.cat([x0, x13], 1) | |||
x3 = self.merge_3(x3) | |||
# node x4 | |||
x34 = self.upsample(x3) | |||
x24 = self.bu_conv24(x2) | |||
x4 = torch.cat([x1, x24, x34], 1) | |||
x4 = self.merge_4(x4) | |||
# node x5 | |||
x45 = self.upsample(x4) | |||
x5 = torch.cat([x2, x45], 1) | |||
x5 = self.merge_5(x5) | |||
# node x7 | |||
x57 = self.bu_conv57(x5) | |||
x7 = torch.cat([x4, x57], 1) | |||
x7 = self.merge_7(x7) | |||
# node x6 | |||
x46 = self.bu_conv46(x4) | |||
x76 = self.bu_conv76(x7) | |||
x6 = torch.cat([x3, x46, x76], 1) | |||
x6 = self.merge_6(x6) | |||
outputs = (x5, x7, x6) | |||
return outputs |
@@ -11,5 +11,5 @@ from .detector import SingleStageDetector | |||
class DamoYolo(SingleStageDetector): | |||
def __init__(self, model_dir, *args, **kwargs): | |||
self.config_name = 'damoyolo_s.py' | |||
self.config_name = 'damoyolo.py' | |||
super(DamoYolo, self).__init__(model_dir, *args, **kwargs) |
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
from modelscope.metainfo import Models | |||
from modelscope.models.builder import MODELS | |||
@@ -1,30 +1,33 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors, and available | |||
# at https://github.com/tinyvision/damo-yolo. | |||
import importlib | |||
import os | |||
import shutil | |||
import sys | |||
import tempfile | |||
from os.path import dirname, join | |||
from easydict import EasyDict | |||
def get_config_by_file(config_file): | |||
try: | |||
sys.path.append(os.path.dirname(config_file)) | |||
current_config = importlib.import_module( | |||
os.path.basename(config_file).split('.')[0]) | |||
exp = current_config.Config() | |||
except Exception: | |||
raise ImportError( | |||
"{} doesn't contain a class named 'Config'".format(config_file)) | |||
return exp | |||
def parse_config(filename): | |||
filename = str(filename) | |||
if filename.endswith('.py'): | |||
with tempfile.TemporaryDirectory() as temp_config_dir: | |||
shutil.copyfile(filename, join(temp_config_dir, '_tempconfig.py')) | |||
sys.path.insert(0, temp_config_dir) | |||
mod = importlib.import_module('_tempconfig') | |||
sys.path.pop(0) | |||
cfg_dict = EasyDict({ | |||
name: value | |||
for name, value in mod.__dict__.items() | |||
if not name.startswith('__') | |||
}) | |||
# delete imported module | |||
del sys.modules['_tempconfig'] | |||
else: | |||
raise IOError('Only .py config files are supported now!') | |||
def parse_config(config_file): | |||
""" | |||
Get a config object from a file. | |||
Args: | |||
config_file (str): file path of config. | |||
""" | |||
assert (config_file is not None), 'please provide a config file' | |||
if config_file is not None: | |||
return get_config_by_file(config_file) | |||
return cfg_dict |
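A minimal usage sketch for the new `parse_config` (assuming `easydict` is installed; the config field name is illustrative): top-level names in the `.py` file become keys of the returned `EasyDict`.

import os
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.py', delete=False) as f:
    f.write("miscs = dict(eval_interval_epochs=10)\n")
cfg = parse_config(f.name)
print(cfg.miscs.eval_interval_epochs)  # -> 10
os.remove(f.name)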
@@ -0,0 +1,21 @@ | |||
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .model import VideoMattingNetwork | |||
from .model import preprocess | |||
else: | |||
_import_structure = {'model': ['VideoMattingNetwork', 'preprocess']} | |||
import sys | |||
sys.modules[__name__] = LazyImportModule( | |||
__name__, | |||
globals()['__file__'], | |||
_import_structure, | |||
module_spec=__spec__, | |||
extra_objects={}, | |||
) |
@@ -0,0 +1,38 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os.path as osp | |||
from typing import Optional | |||
import numpy as np | |||
import torch | |||
import torchvision | |||
from torch.nn import functional as F | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base import Tensor, TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.models.cv.video_human_matting.models import MattingNetwork | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from modelscope.utils.logger import get_logger | |||
@MODELS.register_module( | |||
Tasks.video_human_matting, module_name=Models.video_human_matting) | |||
class VideoMattingNetwork(TorchModel): | |||
def __init__(self, model_dir: str, *args, **kwargs): | |||
super().__init__(model_dir, *args, **kwargs) | |||
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
params = torch.load(model_path, map_location='cpu') | |||
self.model = MattingNetwork() | |||
if 'model_state_dict' in params.keys(): | |||
params = params['model_state_dict'] | |||
self.model.load_state_dict(params, strict=True) | |||
self.model.eval() | |||
def preprocess(image): | |||
frame_np = np.float32(image) / 255.0 | |||
frame_np = frame_np.transpose(2, 0, 1) | |||
frame_tensor = torch.from_numpy(frame_np) | |||
image_tensor = frame_tensor[None, :, :, :] | |||
return image_tensor |
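A quick sanity check of `preprocess` (a dummy HxWx3 uint8 RGB frame is assumed):

import numpy as np

frame = (np.random.rand(720, 1280, 3) * 255).astype(np.uint8)
tensor = preprocess(frame)
print(tensor.shape, tensor.dtype)  # torch.Size([1, 3, 720, 1280]) torch.float32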
@@ -0,0 +1 @@ | |||
from .matting import MattingNetwork |
@@ -0,0 +1,330 @@ | |||
""" | |||
Part of the implementation is borrowed from the RVM paper, | |||
publicly available at <https://arxiv.org/abs/2108.11515/> | |||
""" | |||
from typing import Optional | |||
import torch | |||
from torch import Tensor, nn | |||
class hswish(nn.Module): | |||
def forward(self, x): | |||
return torch.nn.functional.hardswish(x, inplace=True)  # same op without constructing a module per call | |||
class scSEblock(nn.Module): | |||
def __init__(self, out): | |||
super().__init__() | |||
self.conv1 = nn.Sequential( | |||
nn.Conv2d(out, int(out / 2), 3, 1, 1), | |||
nn.GroupNorm(out // 8, int(out / 2)), hswish()) | |||
self.conv2 = nn.Sequential( | |||
nn.Conv2d(int(out / 2), out, 1, 1, 0), | |||
nn.GroupNorm(out // 4, out), | |||
) | |||
self.avgpool = nn.AdaptiveAvgPool2d(1) | |||
def forward_single(self, x): | |||
b, c, _, _ = x.size() | |||
x2 = self.avgpool(x).view(b, c, 1, 1) | |||
x2 = self.conv1(x2) | |||
x2 = self.conv2(x2) | |||
x2 = torch.sigmoid(x2) | |||
out = x2 * x | |||
return out | |||
def forward_time(self, x): | |||
B, T, _, H, W = x.shape | |||
x = x.flatten(0, 1) | |||
out = self.forward_single(x) | |||
out = out.unflatten(0, (B, T)) | |||
return out | |||
def forward(self, x): | |||
if x.ndim == 5: | |||
return self.forward_time(x) | |||
else: | |||
return self.forward_single(x) | |||
class RecurrentDecoder(nn.Module): | |||
def __init__(self, feature_channels, decoder_channels): | |||
super().__init__() | |||
self.avgpool = AvgPool() | |||
self.decode4 = BottleneckBlock(feature_channels[3]) | |||
self.decode3 = UpsamplingBlock(feature_channels[3], | |||
feature_channels[2], 3, | |||
decoder_channels[0]) | |||
self.sc3 = scSEblock(decoder_channels[0]) | |||
self.decode2 = UpsamplingBlock(decoder_channels[0], | |||
feature_channels[1], 3, | |||
decoder_channels[1]) | |||
self.sc2 = scSEblock(decoder_channels[1]) | |||
self.decode1 = UpsamplingBlock(decoder_channels[1], | |||
feature_channels[0], 3, | |||
decoder_channels[2]) | |||
self.sc1 = scSEblock(decoder_channels[2]) | |||
self.out0 = OutputBlock(decoder_channels[2], 3, decoder_channels[3]) | |||
self.crosslevel1 = crossfeature(feature_channels[3], | |||
feature_channels[1]) | |||
self.crosslevel2 = crossfeature(feature_channels[2], | |||
feature_channels[0]) | |||
def forward(self, s0: Tensor, f1: Tensor, f2: Tensor, f3: Tensor, | |||
f4: Tensor, r1: Optional[Tensor], r2: Optional[Tensor], | |||
r3: Optional[Tensor], r4: Optional[Tensor]): | |||
s2, s3, s4 = self.avgpool(s0) | |||
x4, r4 = self.decode4(f4, r4) | |||
x3, r3 = self.decode3(x4, f3, s4, r3) | |||
x3 = self.sc3(x3) | |||
f2 = self.crosslevel1(f4, f2) | |||
x2, r2 = self.decode2(x3, f2, s3, r2) | |||
x2 = self.sc2(x2) | |||
f1 = self.crosslevel2(f3, f1) | |||
x1, r1 = self.decode1(x2, f1, s2, r1) | |||
x1 = self.sc1(x1) | |||
out = self.out0(x1, s0) | |||
return out, r1, r2, r3, r4 | |||
class AvgPool(nn.Module): | |||
def __init__(self): | |||
super().__init__() | |||
self.avgpool = nn.AvgPool2d( | |||
2, 2, count_include_pad=False, ceil_mode=True) | |||
def forward_single_frame(self, s0): | |||
s1 = self.avgpool(s0) | |||
s2 = self.avgpool(s1) | |||
s3 = self.avgpool(s2) | |||
return s1, s2, s3 | |||
def forward_time_series(self, s0): | |||
B, T = s0.shape[:2] | |||
s0 = s0.flatten(0, 1) | |||
s1, s2, s3 = self.forward_single_frame(s0) | |||
s1 = s1.unflatten(0, (B, T)) | |||
s2 = s2.unflatten(0, (B, T)) | |||
s3 = s3.unflatten(0, (B, T)) | |||
return s1, s2, s3 | |||
def forward(self, s0): | |||
if s0.ndim == 5: | |||
return self.forward_time_series(s0) | |||
else: | |||
return self.forward_single_frame(s0) | |||
class crossfeature(nn.Module): | |||
def __init__(self, in_channels, out_channels): | |||
super().__init__() | |||
self.avg = nn.AdaptiveAvgPool2d(1) | |||
self.conv = nn.Conv2d(in_channels, out_channels, 1, 1, 0, bias=False) | |||
def forward_single_frame(self, x1, x2): | |||
b, c, _, _ = x1.size() | |||
x1 = self.avg(x1).view(b, c, 1, 1) | |||
x1 = self.conv(x1) | |||
x1 = torch.sigmoid(x1) | |||
x2 = x1 * x2 | |||
return x2 | |||
def forward_time_series(self, x1, x2): | |||
b, t = x1.shape[:2] | |||
x1 = x1.flatten(0, 1) | |||
x2 = x2.flatten(0, 1) | |||
x2 = self.forward_single_frame(x1, x2) | |||
return x2.unflatten(0, (b, t)) | |||
def forward(self, x1, x2): | |||
if x1.ndim == 5: | |||
return self.forward_time_series(x1, x2) | |||
else: | |||
return self.forward_single_frame(x1, x2) | |||
class BottleneckBlock(nn.Module): | |||
def __init__(self, channels): | |||
super().__init__() | |||
self.channels = channels | |||
self.gru = GRU(channels // 2) | |||
def forward(self, x, r): | |||
a, b = x.split(self.channels // 2, dim=-3) | |||
b, r = self.gru(b, r) | |||
x = torch.cat([a, b], dim=-3) | |||
return x, r | |||
class UpsamplingBlock(nn.Module): | |||
def __init__(self, in_channels, skip_channels, src_channels, out_channels): | |||
super().__init__() | |||
self.out_channels = out_channels | |||
self.upsample = nn.Upsample( | |||
scale_factor=2, mode='bilinear', align_corners=False) | |||
self.shortcut = nn.Sequential( | |||
nn.Conv2d(skip_channels, in_channels, 3, 1, 1, bias=False), | |||
nn.GroupNorm(in_channels // 4, in_channels), hswish()) | |||
self.att_skip = nn.Sequential( | |||
nn.Conv2d(in_channels, in_channels, 1, 1, 0, bias=False), | |||
nn.Sigmoid()) | |||
self.conv = nn.Sequential( | |||
nn.Conv2d( | |||
in_channels + in_channels + src_channels, | |||
out_channels, | |||
3, | |||
1, | |||
1, | |||
bias=False), | |||
nn.GroupNorm(out_channels // 4, out_channels), | |||
hswish(), | |||
) | |||
self.gru = GRU(out_channels // 2) | |||
def forward_single_frame(self, x, f, s, r: Optional[Tensor]): | |||
x = self.upsample(x) | |||
x = x[:, :, :s.size(2), :s.size(3)] | |||
att = self.att_skip(x) | |||
f = self.shortcut(f) | |||
f = att * f | |||
x = torch.cat([x, f, s], dim=1) | |||
x = self.conv(x) | |||
a, b = x.split(self.out_channels // 2, dim=1) | |||
b, r = self.gru(b, r) | |||
x = torch.cat([a, b], dim=1) | |||
return x, r | |||
def forward_time_series(self, x, f, s, r: Optional[Tensor]): | |||
B, T, _, H, W = s.shape | |||
x = x.flatten(0, 1) | |||
f = f.flatten(0, 1) | |||
s = s.flatten(0, 1) | |||
x = self.upsample(x) | |||
x = x[:, :, :H, :W] | |||
f = self.shortcut(f) | |||
att = self.att_skip(x) | |||
f = att * f | |||
x = torch.cat([x, f, s], dim=1) | |||
x = self.conv(x) | |||
x = x.unflatten(0, (B, T)) | |||
a, b = x.split(self.out_channels // 2, dim=2) | |||
b, r = self.gru(b, r) | |||
x = torch.cat([a, b], dim=2) | |||
return x, r | |||
def forward(self, x, f, s, r: Optional[Tensor]): | |||
if x.ndim == 5: | |||
return self.forward_time_series(x, f, s, r) | |||
else: | |||
return self.forward_single_frame(x, f, s, r) | |||
class OutputBlock(nn.Module): | |||
def __init__(self, in_channels, src_channels, out_channels): | |||
super().__init__() | |||
self.upsample = nn.Upsample( | |||
scale_factor=2, mode='bilinear', align_corners=False) | |||
self.conv = nn.Sequential( | |||
nn.Conv2d( | |||
in_channels + src_channels, out_channels, 3, 1, 1, bias=False), | |||
nn.GroupNorm(out_channels // 2, out_channels), | |||
hswish(), | |||
nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False), | |||
nn.GroupNorm(out_channels // 2, out_channels), | |||
hswish(), | |||
) | |||
def forward_single_frame(self, x, s): | |||
x = self.upsample(x) | |||
x = x[:, :, :s.size(2), :s.size(3)] | |||
x = torch.cat([x, s], dim=1) | |||
x = self.conv(x) | |||
return x | |||
def forward_time_series(self, x, s): | |||
B, T, _, H, W = s.shape | |||
x = x.flatten(0, 1) | |||
s = s.flatten(0, 1) | |||
x = self.upsample(x) | |||
x = x[:, :, :H, :W] | |||
x = torch.cat([x, s], dim=1) | |||
x = self.conv(x) | |||
x = x.unflatten(0, (B, T)) | |||
return x | |||
def forward(self, x, s): | |||
if x.ndim == 5: | |||
return self.forward_time_series(x, s) | |||
else: | |||
return self.forward_single_frame(x, s) | |||
class Projection(nn.Module): | |||
def __init__(self, in_channels, out_channels): | |||
super().__init__() | |||
self.conv = nn.Conv2d(in_channels, out_channels, 1) | |||
def forward_single_frame(self, x): | |||
return self.conv(x) | |||
def forward_time_series(self, x): | |||
B, T = x.shape[:2] | |||
return self.conv(x.flatten(0, 1)).unflatten(0, (B, T)) | |||
def forward(self, x): | |||
if x.ndim == 5: | |||
return self.forward_time_series(x) | |||
else: | |||
return self.forward_single_frame(x) | |||
class GRU(nn.Module): | |||
def __init__(self, channels, kernel_size=3, padding=1): | |||
super().__init__() | |||
self.channels = channels | |||
self.ih = nn.Conv2d( | |||
channels * 2, channels * 2, kernel_size, padding=padding) | |||
self.act_ih = nn.Sigmoid() | |||
self.hh = nn.Conv2d( | |||
channels * 2, channels, kernel_size, padding=padding) | |||
self.act_hh = nn.Tanh() | |||
def forward_single_frame(self, x, pre_fea): | |||
fea_ih = self.ih(torch.cat([x, pre_fea], dim=1)) | |||
r, z = self.act_ih(fea_ih).split(self.channels, dim=1) | |||
fea_hh = self.hh(torch.cat([x, r * pre_fea], dim=1)) | |||
c = self.act_hh(fea_hh) | |||
fea_gru = (1 - z) * pre_fea + z * c | |||
return fea_gru, fea_gru | |||
def forward_time_series(self, x, pre_fea): | |||
o = [] | |||
for xt in x.unbind(dim=1): | |||
ot, pre_fea = self.forward_single_frame(xt, pre_fea) | |||
o.append(ot) | |||
o = torch.stack(o, dim=1) | |||
return o, pre_fea | |||
def forward(self, x, pre_fea): | |||
if pre_fea is None: | |||
pre_fea = torch.zeros( | |||
(x.size(0), x.size(-3), x.size(-2), x.size(-1)), | |||
device=x.device, | |||
dtype=x.dtype) | |||
if x.ndim == 5: | |||
return self.forward_time_series(x, pre_fea) | |||
else: | |||
return self.forward_single_frame(x, pre_fea) |
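A hedged sketch exercising the ConvGRU above on dummy tensors: passing `pre_fea=None` initializes the recurrent state with zeros, and 5-D inputs are treated as (B, T, C, H, W) clips.

import torch

gru = GRU(channels=8)
x = torch.randn(2, 8, 16, 16)            # single frame: (B, C, H, W)
out, state = gru(x, None)                # state == out, shape (2, 8, 16, 16)

clip = torch.randn(2, 5, 8, 16, 16)      # clip: (B, T, C, H, W)
out_seq, last = gru(clip, state)         # out_seq: (2, 5, 8, 16, 16)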
@@ -0,0 +1,64 @@ | |||
""" | |||
Part of the implementation is borrowed and modified from DeepGuidedFilter, | |||
publicly available at <https://github.com/wuhuikai/DeepGuidedFilter/> | |||
""" | |||
import torch | |||
from torch import nn | |||
from torch.nn import functional as F | |||
class DeepGuidedFilterRefiner(nn.Module): | |||
def __init__(self, hid_channels=16): | |||
super().__init__() | |||
self.box_filter = nn.Conv2d( | |||
4, 4, kernel_size=3, padding=1, bias=False, groups=4) | |||
self.box_filter.weight.data[...] = 1 / 9 | |||
self.conv = nn.Sequential( | |||
nn.Conv2d( | |||
4 * 2 + hid_channels, hid_channels, kernel_size=1, bias=False), | |||
nn.BatchNorm2d(hid_channels), nn.ReLU(True), | |||
nn.Conv2d(hid_channels, hid_channels, kernel_size=1, bias=False), | |||
nn.BatchNorm2d(hid_channels), nn.ReLU(True), | |||
nn.Conv2d(hid_channels, 4, kernel_size=1, bias=True)) | |||
def forward_single_frame(self, fine_src, base_src, base_fgr, base_pha, | |||
base_hid): | |||
fine_x = torch.cat([fine_src, fine_src.mean(1, keepdim=True)], dim=1) | |||
base_x = torch.cat([base_src, base_src.mean(1, keepdim=True)], dim=1) | |||
base_y = torch.cat([base_fgr, base_pha], dim=1) | |||
mean_x = self.box_filter(base_x) | |||
mean_y = self.box_filter(base_y) | |||
cov_xy = self.box_filter(base_x * base_y) - mean_x * mean_y | |||
var_x = self.box_filter(base_x * base_x) - mean_x * mean_x | |||
A = self.conv(torch.cat([cov_xy, var_x, base_hid], dim=1)) | |||
b = mean_y - A * mean_x | |||
H, W = fine_src.shape[2:] | |||
A = F.interpolate(A, (H, W), mode='bilinear', align_corners=False) | |||
b = F.interpolate(b, (H, W), mode='bilinear', align_corners=False) | |||
out = A * fine_x + b | |||
fgr, pha = out.split([3, 1], dim=1) | |||
return fgr, pha | |||
def forward_time_series(self, fine_src, base_src, base_fgr, base_pha, | |||
base_hid): | |||
B, T = fine_src.shape[:2] | |||
fgr, pha = self.forward_single_frame( | |||
fine_src.flatten(0, 1), base_src.flatten(0, 1), | |||
base_fgr.flatten(0, 1), base_pha.flatten(0, 1), | |||
base_hid.flatten(0, 1)) | |||
fgr = fgr.unflatten(0, (B, T)) | |||
pha = pha.unflatten(0, (B, T)) | |||
return fgr, pha | |||
def forward(self, fine_src, base_src, base_fgr, base_pha, base_hid): | |||
if fine_src.ndim == 5: | |||
return self.forward_time_series(fine_src, base_src, base_fgr, | |||
base_pha, base_hid) | |||
else: | |||
return self.forward_single_frame(fine_src, base_src, base_fgr, | |||
base_pha, base_hid) |
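A hedged shape check for `DeepGuidedFilterRefiner` (dummy tensors only): the coarse foreground and alpha are refined to the fine source resolution.

import torch

refiner = DeepGuidedFilterRefiner(hid_channels=16)
fine_src = torch.randn(1, 3, 256, 256)   # full-resolution frame
base_src = torch.randn(1, 3, 64, 64)     # downsampled frame
base_fgr = torch.randn(1, 3, 64, 64)     # coarse foreground
base_pha = torch.randn(1, 1, 64, 64)     # coarse alpha
base_hid = torch.randn(1, 16, 64, 64)    # decoder hidden features
fgr, pha = refiner(fine_src, base_src, base_fgr, base_pha, base_hid)
# fgr: (1, 3, 256, 256), pha: (1, 1, 256, 256)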
@@ -0,0 +1,177 @@ | |||
""" | |||
Part of the implementation is borrowed and modified from EfficientNetV2, | |||
publicly available at <https://arxiv.org/abs/2104.00298> | |||
""" | |||
import torch | |||
import torch.nn.functional | |||
class SiLU(torch.nn.Module): | |||
""" | |||
[https://arxiv.org/pdf/1710.05941.pdf] | |||
""" | |||
def __init__(self, inplace: bool = False): | |||
super().__init__() | |||
self.silu = torch.nn.SiLU(inplace=inplace) | |||
def forward(self, x): | |||
return self.silu(x) | |||
class Conv(torch.nn.Module): | |||
def __init__(self, in_ch, out_ch, activation, k=1, s=1, g=1): | |||
super().__init__() | |||
self.conv = torch.nn.Conv2d( | |||
in_ch, out_ch, k, s, k // 2, 1, g, bias=False) | |||
self.norm = torch.nn.BatchNorm2d(out_ch, 0.001, 0.01) | |||
self.silu = activation | |||
def forward(self, x): | |||
return self.silu(self.norm(self.conv(x))) | |||
class SE(torch.nn.Module): | |||
""" | |||
[https://arxiv.org/pdf/1709.01507.pdf] | |||
""" | |||
def __init__(self, ch, r): | |||
super().__init__() | |||
self.se = torch.nn.Sequential( | |||
torch.nn.Conv2d(ch, ch // (4 * r), 1), torch.nn.SiLU(), | |||
torch.nn.Conv2d(ch // (4 * r), ch, 1), torch.nn.Sigmoid()) | |||
def forward(self, x): | |||
return x * self.se(x.mean((2, 3), keepdim=True)) | |||
class Residual(torch.nn.Module): | |||
""" | |||
[https://arxiv.org/pdf/1801.04381.pdf] | |||
""" | |||
def __init__(self, in_ch, out_ch, s, r, fused=True): | |||
super().__init__() | |||
identity = torch.nn.Identity() | |||
if fused: | |||
if r == 1: | |||
features = [Conv(in_ch, r * in_ch, torch.nn.SiLU(), 3, s)] | |||
else: | |||
features = [ | |||
Conv(in_ch, r * in_ch, torch.nn.SiLU(), 3, s), | |||
Conv(r * in_ch, out_ch, identity) | |||
] | |||
else: | |||
if r == 1: | |||
features = [ | |||
Conv(r * in_ch, r * in_ch, torch.nn.SiLU(), 3, s, | |||
r * in_ch), | |||
SE(r * in_ch, r), | |||
Conv(r * in_ch, out_ch, identity) | |||
] | |||
else: | |||
features = [ | |||
Conv(in_ch, r * in_ch, torch.nn.SiLU()), | |||
Conv(r * in_ch, r * in_ch, torch.nn.SiLU(), 3, s, | |||
r * in_ch), | |||
SE(r * in_ch, r), | |||
Conv(r * in_ch, out_ch, identity) | |||
] | |||
self.add = s == 1 and in_ch == out_ch | |||
self.res = torch.nn.Sequential(*features) | |||
def forward(self, x): | |||
return x + self.res(x) if self.add else self.res(x) | |||
class EfficientNet(torch.nn.Module): | |||
def __init__(self, pretrained: bool = False): | |||
super().__init__() | |||
gate_fn = [True, False] | |||
filters = [24, 48, 64, 128, 160, 256] | |||
feature = [Conv(3, filters[0], torch.nn.SiLU(), 3, 2)] | |||
for i in range(2): | |||
if i == 0: | |||
feature.append( | |||
Residual(filters[0], filters[0], 1, 1, gate_fn[0])) | |||
else: | |||
feature.append( | |||
Residual(filters[0], filters[0], 1, 1, gate_fn[0])) | |||
for i in range(4): | |||
if i == 0: | |||
feature.append( | |||
Residual(filters[0], filters[1], 2, 4, gate_fn[0])) | |||
else: | |||
feature.append( | |||
Residual(filters[1], filters[1], 1, 4, gate_fn[0])) | |||
for i in range(4): | |||
if i == 0: | |||
feature.append( | |||
Residual(filters[1], filters[2], 2, 4, gate_fn[0])) | |||
else: | |||
feature.append( | |||
Residual(filters[2], filters[2], 1, 4, gate_fn[0])) | |||
for i in range(6): | |||
if i == 0: | |||
feature.append( | |||
Residual(filters[2], filters[3], 2, 4, gate_fn[1])) | |||
else: | |||
feature.append( | |||
Residual(filters[3], filters[3], 1, 4, gate_fn[1])) | |||
for i in range(9): | |||
if i == 0: | |||
feature.append( | |||
Residual(filters[3], filters[4], 1, 6, gate_fn[1])) | |||
else: | |||
feature.append( | |||
Residual(filters[4], filters[4], 1, 6, gate_fn[1])) | |||
self.feature = torch.nn.Sequential(*feature) | |||
def forward_single_frame(self, x): | |||
x = self.feature[0](x) | |||
x = self.feature[1](x) | |||
x = self.feature[2](x) | |||
f1 = x # 1/2 24 | |||
for i in range(4): | |||
x = self.feature[i + 3](x) | |||
f2 = x # 1/4 48 | |||
for i in range(4): | |||
x = self.feature[i + 7](x) | |||
f3 = x # 1/8 64 | |||
for i in range(6): | |||
x = self.feature[i + 11](x) | |||
for i in range(9): | |||
x = self.feature[i + 17](x) | |||
f5 = x # 1/16 160 | |||
return [f1, f2, f3, f5] | |||
def forward_time_series(self, x): | |||
B, T = x.shape[:2] | |||
features = self.forward_single_frame(x.flatten(0, 1)) | |||
features = [f.unflatten(0, (B, T)) for f in features] | |||
return features | |||
def forward(self, x): | |||
if x.ndim == 5: | |||
return self.forward_time_series(x) | |||
else: | |||
return self.forward_single_frame(x) | |||
def export(self): | |||
for m in self.modules(): | |||
if type(m) is Conv and hasattr(m, 'silu'): | |||
if isinstance(m.silu, torch.nn.SiLU): | |||
m.silu = SiLU() | |||
if type(m) is SE: | |||
if isinstance(m.se[1], torch.nn.SiLU): | |||
m.se[1] = SiLU() | |||
return self |
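A hedged shape check for the backbone above: it returns four pyramid levels at strides 2, 4, 8 and 16, with channel counts following `filters`.

import torch

backbone = EfficientNet().eval()
x = torch.randn(1, 3, 224, 224)
f1, f2, f3, f5 = backbone(x)
# f1: (1, 24, 112, 112), f2: (1, 48, 56, 56),
# f3: (1, 64, 28, 28),   f5: (1, 160, 14, 14)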
@@ -0,0 +1,94 @@ | |||
""" | |||
Part of the implementation is borrowed and modified from DeepLab v3, | |||
publicly available at <https://arxiv.org/abs/1706.05587v3> | |||
""" | |||
import torch | |||
from torch import nn | |||
class ASP_OC_Module(nn.Module): | |||
def __init__(self, features, out_features=96, dilations=(2, 4, 8)): | |||
super(ASP_OC_Module, self).__init__() | |||
self.conv2 = nn.Sequential( | |||
nn.Conv2d( | |||
features, | |||
out_features, | |||
kernel_size=1, | |||
padding=0, | |||
dilation=1, | |||
bias=False), nn.BatchNorm2d(out_features)) | |||
self.conv3 = nn.Sequential( | |||
nn.Conv2d( | |||
features, | |||
out_features, | |||
kernel_size=3, | |||
padding=dilations[0], | |||
dilation=dilations[0], | |||
bias=False), nn.BatchNorm2d(out_features)) | |||
self.conv4 = nn.Sequential( | |||
nn.Conv2d( | |||
features, | |||
out_features, | |||
kernel_size=3, | |||
padding=dilations[1], | |||
dilation=dilations[1], | |||
bias=False), nn.BatchNorm2d(out_features)) | |||
self.conv5 = nn.Sequential( | |||
nn.Conv2d( | |||
features, | |||
out_features, | |||
kernel_size=3, | |||
padding=dilations[2], | |||
dilation=dilations[2], | |||
bias=False), nn.BatchNorm2d(out_features)) | |||
self.conv_bn_dropout = nn.Sequential( | |||
nn.Conv2d( | |||
out_features * 4, | |||
out_features * 2, | |||
kernel_size=1, | |||
padding=0, | |||
dilation=1, | |||
bias=False), nn.InstanceNorm2d(out_features * 2), | |||
nn.Dropout2d(0.05)) | |||
def _cat_each(self, feat1, feat2, feat3, feat4, feat5): | |||
assert (len(feat1) == len(feat2)) | |||
z = [] | |||
for i in range(len(feat1)): | |||
z.append( | |||
torch.cat((feat1[i], feat2[i], feat3[i], feat4[i], feat5[i]), | |||
1)) | |||
return z | |||
def forward(self, x): | |||
_, _, h, w = x.size() | |||
feat2 = self.conv2(x) | |||
feat3 = self.conv3(x) | |||
feat4 = self.conv4(x) | |||
feat5 = self.conv5(x) | |||
out = torch.cat((feat2, feat3, feat4, feat5), 1) | |||
output = self.conv_bn_dropout(out) | |||
return output | |||
class LRASPP(nn.Module): | |||
def __init__(self, in_channels, out_channels): | |||
super().__init__() | |||
self.aspp = ASP_OC_Module(in_channels, out_channels) | |||
def forward_single_frame(self, x): | |||
return self.aspp(x) | |||
def forward_time_series(self, x): | |||
B, T = x.shape[:2] | |||
x = self.forward_single_frame(x.flatten(0, 1)).unflatten(0, (B, T)) | |||
return x | |||
def forward(self, x): | |||
if x.ndim == 5: | |||
return self.forward_time_series(x) | |||
else: | |||
return self.forward_single_frame(x) |
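A hedged shape check for `LRASPP` above. Note the naming quirk: `ASP_OC_Module` concatenates four dilated branches and fuses them down to `out_features * 2` channels, so `LRASPP(160, 64)` actually emits 128 channels (matching the 128-channel feature the matting decoder expects).

import torch

aspp = LRASPP(in_channels=160, out_channels=64)
x = torch.randn(1, 160, 14, 14)
print(aspp(x).shape)  # torch.Size([1, 128, 14, 14])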
@@ -0,0 +1,67 @@ | |||
from typing import Optional | |||
import torch | |||
from torch import Tensor | |||
from torch.nn import functional as F | |||
from .decoder import Projection, RecurrentDecoder | |||
from .deep_guided_filter import DeepGuidedFilterRefiner | |||
from .effv2 import EfficientNet | |||
from .lraspp import LRASPP | |||
class MattingNetwork(torch.nn.Module): | |||
def __init__(self, pretrained_backbone: bool = False): | |||
super().__init__() | |||
self.backbone = EfficientNet(pretrained_backbone) | |||
self.aspp = LRASPP(160, 64) | |||
self.decoder = RecurrentDecoder([24, 48, 64, 128], [64, 32, 24, 16]) | |||
self.project_mat = Projection(16, 4) | |||
self.project_seg = Projection(16, 1) | |||
self.refiner = DeepGuidedFilterRefiner() | |||
def forward(self, | |||
src: Tensor, | |||
r0: Optional[Tensor] = None, | |||
r1: Optional[Tensor] = None, | |||
r2: Optional[Tensor] = None, | |||
r3: Optional[Tensor] = None, | |||
downsample_ratio: float = 1, | |||
segmentation_pass: bool = False): | |||
if downsample_ratio != 1: | |||
src_sm = self._interpolate(src, scale_factor=downsample_ratio) | |||
else: | |||
src_sm = src | |||
f1, f2, f3, f4 = self.backbone(src_sm) | |||
f4 = self.aspp(f4) | |||
hid, *rec = self.decoder(src_sm, f1, f2, f3, f4, r0, r1, r2, r3) | |||
if not segmentation_pass: | |||
fgr_residual, pha = self.project_mat(hid).split([3, 1], dim=-3) | |||
if downsample_ratio != 1: | |||
_, pha = self.refiner(src, src_sm, fgr_residual, pha, hid) | |||
pha = pha.clamp(0., 1.) | |||
return [pha, *rec] | |||
else: | |||
seg = self.project_seg(hid) | |||
return [seg, *rec] | |||
def _interpolate(self, x: Tensor, scale_factor: float): | |||
if x.ndim == 5: | |||
B, T = x.shape[:2] | |||
x = F.interpolate( | |||
x.flatten(0, 1), | |||
scale_factor=scale_factor, | |||
mode='bilinear', | |||
align_corners=False) | |||
x = x.unflatten(0, (B, T)) | |||
else: | |||
x = F.interpolate( | |||
x, | |||
scale_factor=scale_factor, | |||
mode='bilinear', | |||
align_corners=False) | |||
return x |
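A hedged end-to-end sketch (dummy frames only): stream frames through `MattingNetwork` while carrying the four recurrent states between calls, as the RVM-style interface above expects.

import torch

model = MattingNetwork().eval()
rec = [None] * 4                      # r0..r3 recurrent states
with torch.no_grad():
    for _ in range(3):                # pretend consecutive video frames
        src = torch.rand(1, 3, 288, 512)
        pha, *rec = model(src, *rec, downsample_ratio=0.5)
# pha: (1, 1, 288, 512) alpha matte clamped to [0, 1]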
@@ -509,8 +509,8 @@ def convert_weights(model: nn.Module): | |||
@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip) | |||
class CLIPForMultiModalEmbedding(TorchModel): | |||
def __init__(self, model_dir, device_id=-1): | |||
super().__init__(model_dir=model_dir, device_id=device_id) | |||
def __init__(self, model_dir, *args, **kwargs): | |||
super().__init__(model_dir=model_dir, *args, **kwargs) | |||
# Initialize the model. | |||
vision_model_config_file = '{}/vision_model_config.json'.format( | |||