
Merge remote-tracking branch 'origin/master' into ofa/finetune

# Conflicts:
#	modelscope/preprocessors/multi_modal.py
master · 行嗔 · 3 years ago · commit c546f2a8b9
100 changed files with 6490 additions and 82 deletions
  1. +169 -0  .dev_scripts/build_image.sh
  2. +10 -1  .dev_scripts/ci_container_test.sh
  3. +5 -2  .dev_scripts/dockerci.sh
  4. +11 -0  .dockerignore
  5. +2 -0  .gitattributes
  6. +1 -1  configs/cv/configuration.json
  7. +3 -0  data/test/images/image-text-retrieval.jpg
  8. +3 -0  data/test/images/image_panoptic_segmentation.jpg
  9. +3 -0  data/test/images/image_reid_person.jpg
  10. +3 -0  data/test/images/image_segmentation.jpg
  11. +3 -0  data/test/images/image_semantic_segmentation.jpg
  12. +3 -0  data/test/regression/fill_mask_bert_zh.bin
  13. +3 -0  data/test/regression/fill_mask_sbert_en.bin
  14. +3 -0  data/test/regression/fill_mask_sbert_zh.bin
  15. +3 -0  data/test/regression/fill_mask_veco_en.bin
  16. +3 -0  data/test/regression/fill_mask_veco_zh.bin
  17. +3 -0  data/test/regression/sbert_nli.bin
  18. +3 -0  data/test/regression/sbert_sen_sim.bin
  19. +3 -0  data/test/regression/sbert_ws_en.bin
  20. +3 -0  data/test/regression/sbert_ws_zh.bin
  21. +3 -0  data/test/regression/sbert_zero_shot.bin
  22. +3 -0  data/test/videos/Walking.54138969.mp4
  23. +3 -0  data/test/videos/movie_scene_segmentation_test_video.mp4
  24. +84 -0  docker/Dockerfile.ubuntu
  25. +15 -0  docker/rcfiles/conda.tuna
  26. +13 -0  docker/rcfiles/ubuntu20.04_sources.tuna
  27. +1 -1  docs/source/quick_start.md
  28. +1 -1  modelscope/fileio/__init__.py
  29. +1 -1  modelscope/fileio/file.py
  30. +6 -5  modelscope/fileio/format/json.py
  31. +29 -5  modelscope/hub/api.py
  32. +3 -1  modelscope/hub/constants.py
  33. +2 -2  modelscope/hub/errors.py
  34. +111 -9  modelscope/hub/repository.py
  35. +13 -4  modelscope/hub/utils/utils.py
  36. +56 -0  modelscope/metainfo.py
  37. +6 -0  modelscope/metrics/__init__.py
  38. +38 -0  modelscope/metrics/audio_noise_metric.py
  39. +14 -4  modelscope/metrics/builder.py
  40. +52 -0  modelscope/metrics/movie_scene_segmentation_metric.py
  41. +78 -0  modelscope/metrics/video_summarization_metric.py
  42. +32 -21  modelscope/models/audio/ans/frcrn.py
  43. +1 -0  modelscope/models/audio/kws/farfield/model.py
  44. +40 -9  modelscope/models/base/base_model.py
  45. +15 -7  modelscope/models/cv/__init__.py
  46. +43 -2  modelscope/models/cv/action_recognition/models.py
  47. +301 -0  modelscope/models/cv/action_recognition/s3dg.py
  48. +23 -0  modelscope/models/cv/body_3d_keypoints/__init__.py
  49. +246 -0  modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
  50. +233 -0  modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
  51. +2 -2  modelscope/models/cv/crowd_counting/cc_model.py
  52. +25 -0  modelscope/models/cv/easycv_base.py
  53. +1 -1  modelscope/models/cv/image_classification/mmcls_model.py
  54. +22 -0  modelscope/models/cv/image_panoptic_segmentation/__init__.py
  55. +54 -0  modelscope/models/cv/image_panoptic_segmentation/panseg_model.py
  56. +22 -0  modelscope/models/cv/image_reid_person/__init__.py
  57. +136 -0  modelscope/models/cv/image_reid_person/pass_model.py
  58. +418 -0  modelscope/models/cv/image_reid_person/transreid_model.py
  59. +24 -0  modelscope/models/cv/image_semantic_segmentation/__init__.py
  60. +1 -0  modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py
  61. +47 -0  modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py
  62. +57 -0  modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py
  63. +16 -0  modelscope/models/cv/image_semantic_segmentation/segformer.py
  64. +76 -0  modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py
  65. +3 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py
  66. +3 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py
  67. +4 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py
  68. +523 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py
  69. +3 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py
  70. +476 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py
  71. +169 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py
  72. +3 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py
  73. +267 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py
  74. +581 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py
  75. +3 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py
  76. +314 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py
  77. +303 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py
  78. +7 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py
  79. +11 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py
  80. +60 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py
  81. +48 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py
  82. +25 -0  modelscope/models/cv/movie_scene_segmentation/__init__.py
  83. +45 -0  modelscope/models/cv/movie_scene_segmentation/get_model.py
  84. +192 -0  modelscope/models/cv/movie_scene_segmentation/model.py
  85. +3 -0  modelscope/models/cv/movie_scene_segmentation/utils/__init__.py
  86. +29 -0  modelscope/models/cv/movie_scene_segmentation/utils/head.py
  87. +118 -0  modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
  88. +331 -0  modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py
  89. +132 -0  modelscope/models/cv/movie_scene_segmentation/utils/trn.py
  90. +2 -0  modelscope/models/cv/object_detection/__init__.py
  91. +16 -0  modelscope/models/cv/object_detection/yolox_pai.py
  92. +2 -3  modelscope/models/cv/product_retrieval_embedding/item_model.py
  93. +21 -0  modelscope/models/cv/realtime_object_detection/__init__.py
  94. +85 -0  modelscope/models/cv/realtime_object_detection/realtime_detector.py
  95. +0 -0  modelscope/models/cv/realtime_object_detection/yolox/__init__.py
  96. +0 -0  modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py
  97. +69 -0  modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py
  98. +5 -0  modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py
  99. +12 -0  modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py
  100. +18 -0  modelscope/models/cv/realtime_object_detection/yolox/exp/build.py

.dev_scripts/build_image.sh  (+169, -0)

@@ -0,0 +1,169 @@
#!/bin/bash
# default values.
BASE_CPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04
BASE_GPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel
MODELSCOPE_REPO_ADDRESS=reg.docker.alibaba-inc.com/modelscope/modelscope
python_version=3.7.13
torch_version=1.11.0
cudatoolkit_version=11.3
tensorflow_version=1.15.5
modelscope_version=None
is_ci_test=False
is_dsw=False
is_cpu=False
run_ci_test=False
function usage(){
echo "usage: build.sh "
echo " --python=python_version set python version, default: $python_version"
echo " --torch=torch_version set pytorch version, default: $torch_version"
echo " --cudatoolkit=cudatoolkit_version set cudatoolkit version used for pytorch, default: $cudatoolkit_version"
echo " --tensorflow=tensorflow_version set tensorflow version, default: $tensorflow_version"
echo " --modelscope=modelscope_version set modelscope version, default: $modelscope_version"
echo " --test option for run test before push image, only push on ci test pass"
echo " --cpu option for build cpu version"
echo " --dsw option for build dsw version"
echo " --ci option for build ci version"
echo " --push option for push image to remote repo"
}
for i in "$@"; do
case $i in
--python=*)
python_version="${i#*=}"
shift
;;
--torch=*)
torch_version="${i#*=}"
shift # pytorch version
;;
--tensorflow=*)
tensorflow_version="${i#*=}"
shift # tensorflow version
;;
--cudatoolkit=*)
cudatoolkit_version="${i#*=}"
shift # cudatoolkit for pytorch
;;
--modelscope=*)
modelscope_version="${i#*=}"
shift # modelscope version
;;
--test)
run_ci_test=True
shift # will run ci test
;;
--cpu)
is_cpu=True
shift # is cpu image
;;
--ci)
is_ci_test=True
shift # is ci, will not install modelscope
;;
--dsw)
is_dsw=True
shift # is dsw, will set dsw cache location
;;
--push)
is_push=True
shift # will push image to remote repo
;;
--help)
usage
exit 0
;;
-*|--*)
echo "Unknown option $i"
usage
exit 1
;;
*)
;;
esac
done

if [ "$modelscope_version" == "None" ]; then
echo "ModelScope version must be specified!"
exit 1
fi
if [ "$is_cpu" == "True" ]; then
export BASE_IMAGE=$BASE_CPU_IMAGE
base_tag=ubuntu20.04
export USE_GPU=False
else
export BASE_IMAGE=$BASE_GPU_IMAGE
base_tag=ubuntu20.04-cuda11.3.0
export USE_GPU=True
fi
if [[ $python_version == 3.7* ]]; then
base_tag=$base_tag-py37
elif [[ $python_version == 3.8* ]]; then
base_tag=$base_tag-py38
elif [[ $python_version == 3.9* ]]; then
base_tag=$base_tag-py39
else
echo "Unsupported python version: $python_version"
exit 1
fi

target_image_tag=$base_tag-torch$torch_version-tf$tensorflow_version
if [ "$is_ci_test" == "True" ]; then
target_image_tag=$target_image_tag-$modelscope_version-ci
else
target_image_tag=$target_image_tag-$modelscope_version-test
fi
export IMAGE_TO_BUILD=$MODELSCOPE_REPO_ADDRESS:$target_image_tag
export PYTHON_VERSION=$python_version
export TORCH_VERSION=$torch_version
export CUDATOOLKIT_VERSION=$cudatoolkit_version
export TENSORFLOW_VERSION=$tensorflow_version
echo -e "Building image with:\npython$python_version\npytorch$torch_version\ntensorflow:$tensorflow_version\ncudatoolkit:$cudatoolkit_version\ncpu:$is_cpu\nis_ci:$is_ci_test\nis_dsw:$is_dsw\n"
docker_file_content=`cat docker/Dockerfile.ubuntu`
if [ "$is_ci_test" != "True" ]; then
echo "Building ModelScope lib, will install ModelScope lib to image"
docker_file_content="${docker_file_content} \nRUN pip install --no-cache-dir modelscope==$modelscope_version -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html"
fi
echo "$is_dsw"
if [ "$is_dsw" == "False" ]; then
echo "Not DSW image"
else
echo "Building dsw image, will need to set the ModelScope lib cache location."
docker_file_content="${docker_file_content} \nENV MODELSCOPE_CACHE=/mnt/workspace/.cache/modelscope"
fi
printf "$docker_file_content" > Dockerfile
docker build -t $IMAGE_TO_BUILD \
--build-arg USE_GPU \
--build-arg BASE_IMAGE \
--build-arg PYTHON_VERSION \
--build-arg TORCH_VERSION \
--build-arg CUDATOOLKIT_VERSION \
--build-arg TENSORFLOW_VERSION \
-f Dockerfile .

if [ $? -ne 0 ]; then
echo "Running docker build command failed, please check the log!"
exit -1
fi
if [ "$run_ci_test" == "True" ]; then
echo "Running ci case."
export MODELSCOPE_CACHE=/home/mulin.lyh/model_scope_cache
export MODELSCOPE_HOME_CACHE=/home/mulin.lyh/ci_case_home # for credential
export IMAGE_NAME=$MODELSCOPE_REPO_ADDRESS
export IMAGE_VERSION=$target_image_tag
export MODELSCOPE_DOMAIN=www.modelscope.cn
export HUB_DATASET_ENDPOINT=http://www.modelscope.cn
export CI_TEST=True
export TEST_LEVEL=1
if [ "$is_ci_test" != "True" ]; then
echo "Testing for dsw image or MaaS-lib image"
export CI_COMMAND="python tests/run.py"
fi
bash .dev_scripts/dockerci.sh
if [ $? -ne 0 ]; then
echo "Running unittest failed, please check the log!"
exit -1
fi
fi
if [ "$is_push" == "True" ]; then
echo "Pushing image: $IMAGE_TO_BUILD"
docker push $IMAGE_TO_BUILD
fi

.dev_scripts/ci_container_test.sh  (+10, -1)

@@ -16,5 +16,14 @@ if [ $? -ne 0 ]; then
echo "linter test failed, please run 'pre-commit run --all-files' to check"
exit -1
fi
# test with install
python setup.py install

PYTHONPATH=. python tests/run.py
if [ $# -eq 0 ]; then
ci_command="python tests/run.py --subprocess"
else
ci_command="$@"
fi
echo "Running case with command: $ci_command"
$ci_command
#python tests/run.py --isolated_cases test_text_to_speech.py test_multi_modal_embedding.py test_ofa_tasks.py test_video_summarization.py

.dev_scripts/dockerci.sh  (+5, -2)

@@ -1,5 +1,4 @@
#!/bin/bash
IMAGE_NAME=reg.docker.alibaba-inc.com/dinger/modelscope
MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache
CODE_DIR=$PWD
CODE_DIR_IN_CONTAINER=/Maas-lib
@@ -8,6 +7,8 @@ gpus='7 6 5 4 3 2 1 0'
cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58'
cpu_sets_arr=($cpu_sets)
is_get_file_lock=false
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_COMMAND}
echo "ci command: $CI_COMMAND"
for gpu in $gpus
do
exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
@@ -31,10 +32,12 @@ do
-e HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT \
-e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \
-e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \
-e TEST_LEVEL=$TEST_LEVEL \
-e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
--workdir=$CODE_DIR_IN_CONTAINER \
--net host \
${IMAGE_NAME}:${IMAGE_VERSION} \
bash .dev_scripts/ci_container_test.sh
$CI_COMMAND
if [ $? -ne 0 ]; then
echo "Running test case failed, please check the log!"
exit -1


.dockerignore  (+11, -0)

@@ -0,0 +1,11 @@
.gitignore
tests
data
.dev_scripts
.dockerignore
.git
.gitattributes
.pre-commit-config.yaml
.pre-commit-config_local.yaml
.readthedocs.yaml
Dockerfile

.gitattributes  (+2, -0)

@@ -4,4 +4,6 @@
*.wav filter=lfs diff=lfs merge=lfs -text
*.JPEG filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.avi filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text

configs/cv/configuration.json  (+1, -1)

@@ -2,7 +2,6 @@
"framework": "pytorch",

"task": "image_classification",
"work_dir": "./work_dir",

"model": {
"type": "classification",
@@ -119,6 +118,7 @@
},

"train": {
"work_dir": "./work_dir",
"dataloader": {
"batch_size_per_gpu": 2,
"workers_per_gpu": 1


data/test/images/image-text-retrieval.jpg  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b012c7e966f6550874ccb85ef9602d483aa89b8623dff9ffcdb0faab8f2ca9ab
size 218143

data/test/images/image_panoptic_segmentation.jpg  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:59b1da30af12f76b691990363e0d221050a59cf53fc4a97e776bcb00228c6c2a
size 245864

data/test/images/image_reid_person.jpg  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4c9a7e42edc7065c16972ff56267aad63f5233e36aa5a699b84939f5bad73276
size 2451

data/test/images/image_segmentation.jpg  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af6fa61274e497ecc170de5adc4b8e7ac89eba2bc22a6aa119b08ec7adbe9459
size 146140

data/test/images/image_semantic_segmentation.jpg  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:59b1da30af12f76b691990363e0d221050a59cf53fc4a97e776bcb00228c6c2a
size 245864

data/test/regression/fill_mask_bert_zh.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:541183383bb06aa3ca2c44a68cd51c1be5e3e984a1dee2c58092b9552660f3ce
size 61883

data/test/regression/fill_mask_sbert_en.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8f0afcd9d2aa5ac9569114203bd9db4f1a520c903a88fd4854370cdde0e7eab7
size 119940

data/test/regression/fill_mask_sbert_zh.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4fd6fa6b23c2fdaf876606a767d9b64b1924e1acddfc06ac42db73ba86083280
size 119940

data/test/regression/fill_mask_veco_en.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4d37672a0e299a08d2daf5c7fc29bfce96bb15701fe5e5e68f068861ac2ee705
size 119619

data/test/regression/fill_mask_veco_zh.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c692e0753cfe349e520511427727a8252f141fa10e85f9a61562845e8d731f9a
size 119619

data/test/regression/sbert_nli.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:44e3925c15d86d8596baeb6bd1d153d86f57b7489798b2cf988a1248e110fd62
size 62231

data/test/regression/sbert_sen_sim.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1ff17a0272752de4c88d4254b2e881f97f8ef022f03609d03ee1de0ae964368a
size 62235

data/test/regression/sbert_ws_en.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572
size 60801

data/test/regression/sbert_ws_zh.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c
size 60801

data/test/regression/sbert_zero_shot.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9e3ecc2c30d382641d561f84849b199c12bb1a9418e8099a191153f6f5275a85
size 61589

data/test/videos/Walking.54138969.mp4  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7b8f50a0537bfe7e082c5ad91b2b7ece61a0adbeb7489988e553909276bf920c
size 44217644

data/test/videos/movie_scene_segmentation_test_video.mp4  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:59fa397b01dc4c9b67a19ca42f149287b9c4e7b2158aba5d07d2db88af87b23f
size 126815483

docker/Dockerfile.ubuntu  (+84, -0)

@@ -0,0 +1,84 @@
ARG BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel
FROM $BASE_IMAGE
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Shanghai
ENV CONDA_DIR /opt/conda
ENV PATH="${CONDA_DIR}/bin:${PATH}"
ENV arch=x86_64
SHELL ["/bin/bash", "-c"]
COPY docker/rcfiles /tmp/resources
RUN apt-get update && apt-get install -y --reinstall ca-certificates && \
cp /tmp/resources/ubuntu20.04_sources.tuna /etc/apt/sources.list && \
apt-get update && \
apt-get install -y locales wget git vim ffmpeg libsm6 tzdata language-pack-zh-hans ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build && \
wget https://packagecloud.io/github/git-lfs/packages/debian/bullseye/git-lfs_3.2.0_amd64.deb/download -O ./git-lfs_3.2.0_amd64.deb && \
dpkg -i ./git-lfs_3.2.0_amd64.deb && \
rm -f ./git-lfs_3.2.0_amd64.deb && \
locale-gen zh_CN && \
locale-gen zh_CN.utf8 && \
update-locale LANG=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 && \
ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
dpkg-reconfigure --frontend noninteractive tzdata && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ENV LANG=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8

#install and config python
ARG PYTHON_VERSION=3.7.13
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${arch}.sh -O ./miniconda.sh && \
/bin/bash miniconda.sh -b -p /opt/conda && \
rm -f miniconda.sh && \
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
cp /tmp/resources/conda.tuna ~/.condarc && \
source /root/.bashrc && \
conda install --yes python==${PYTHON_VERSION} && \
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

ARG USE_GPU=True

# install pytorch
ARG TORCH_VERSION=1.12.0
ARG CUDATOOLKIT_VERSION=11.3
RUN if [ "$USE_GPU" = "True" ] ; then \
conda install --yes pytorch==$TORCH_VERSION torchvision torchaudio cudatoolkit=$CUDATOOLKIT_VERSION -c pytorch && conda clean --yes --all; \
else \
conda install pytorch==$TORCH_VERSION torchvision torchaudio cpuonly -c pytorch; \
fi

# install tensorflow
ARG TENSORFLOW_VERSION=1.15.5
RUN if [ "$USE_GPU" = "True" ] ; then \
pip install --no-cache-dir --use-deprecated=legacy-resolver tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \
else \
pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \
fi

RUN if [ "$USE_GPU" = "True" ] ; then \
CUDA_HOME=/usr/local/cuda TORCH_CUDA_ARCH_LIST="5.0 5.2 6.0 6.1 7.0 7.5 8.0 8.6" MMCV_WITH_OPS=1 MAX_JOBS=8 FORCE_CUDA=1 pip install --no-cache-dir mmcv-full && pip cache purge; \
else \
MMCV_WITH_OPS=1 MAX_JOBS=8 pip install --no-cache-dir mmcv-full && pip cache purge; \
fi

# install modelscope
COPY requirements /var/modelscope
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /var/modelscope/runtime.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip cache purge

# default shell bash
ENV SHELL=/bin/bash

# install special package
RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 numpy==1.18.5 datasets==2.1.0

RUN if [ "$USE_GPU" = "True" ] ; then \
pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \
else \
pip install --no-cache-dir dgl dglgo -f https://data.dgl.ai/wheels/repo.html; \
fi

docker/rcfiles/conda.tuna  (+15, -0)

@@ -0,0 +1,15 @@
channels:
- defaults
show_channel_urls: true
default_channels:
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2
custom_channels:
conda-forge: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
msys2: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
bioconda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
menpo: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
pytorch: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
pytorch-lts: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
simpleitk: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud

docker/rcfiles/ubuntu20.04_sources.tuna  (+13, -0)

@@ -0,0 +1,13 @@
# Source-code mirrors are commented out by default to speed up apt update; uncomment them if needed
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse

# Pre-release sources; enabling them is not recommended
# deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse

docs/source/quick_start.md  (+1, -1)

@@ -108,7 +108,7 @@ pip install -e ".[nlp]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releas
```shell
pip install -e ".[multi-modal]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
```
###
### Installation verification

After installation succeeds, run the following command to verify that the installation is correct:


modelscope/fileio/__init__.py  (+1, -1)

@@ -1,2 +1,2 @@
from .file import File
from .file import File, LocalStorage
from .io import dump, dumps, load

modelscope/fileio/file.py  (+1, -1)

@@ -240,7 +240,7 @@ class File(object):
@staticmethod
def _get_storage(uri):
assert isinstance(uri,
str), f'uri should be str type, buf got {type(uri)}'
str), f'uri should be str type, but got {type(uri)}'

if '://' not in uri:
# local path


modelscope/fileio/format/json.py  (+6, -5)

@@ -1,5 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import json
import numpy as np

from .base import FormatHandler
@@ -22,14 +21,16 @@ def set_default(obj):


class JsonHandler(FormatHandler):
"""Use jsonplus, serialization of Python types to JSON that "just works"."""

def load(self, file):
return json.load(file)
import jsonplus
return jsonplus.loads(file.read())

def dump(self, obj, file, **kwargs):
kwargs.setdefault('default', set_default)
json.dump(obj, file, **kwargs)
file.write(self.dumps(obj, **kwargs))

def dumps(self, obj, **kwargs):
import jsonplus
kwargs.setdefault('default', set_default)
return json.dumps(obj, **kwargs)
return jsonplus.dumps(obj, **kwargs)
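
For context, a minimal round-trip sketch of the updated handler; this assumes `jsonplus` is installed and that the import path mirrors the file shown above, so treat it as an illustration rather than documented API.

```python
# Hedged sketch: basic round-trip through the jsonplus-backed JsonHandler.
import io

from modelscope.fileio.format.json import JsonHandler

handler = JsonHandler()
payload = {'name': 'demo', 'scores': [0.1, 0.2, 0.3]}

buf = io.StringIO()
handler.dump(payload, buf)    # dump() now writes dumps()'s jsonplus output
buf.seek(0)
print(handler.load(buf))      # load() now parses with jsonplus.loads
```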

modelscope/hub/api.py  (+29, -5)

@@ -1,7 +1,6 @@
import os
import pickle
import shutil
import subprocess
from collections import defaultdict
from http import HTTPStatus
from http.cookiejar import CookieJar
@@ -16,8 +15,7 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,
API_RESPONSE_FIELD_MESSAGE,
API_RESPONSE_FIELD_USERNAME,
DEFAULT_CREDENTIALS_PATH)
from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
HUB_DATASET_ENDPOINT)
from modelscope.utils.config_ds import DOWNLOADED_DATASETS_PATH
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
DEFAULT_MODEL_REVISION,
DatasetFormations, DatasetMetaFormats,
@@ -26,7 +24,8 @@ from modelscope.utils.logger import get_logger
from .errors import (InvalidParameter, NotExistError, RequestError,
datahub_raise_on_error, handle_http_response, is_ok,
raise_on_error)
from .utils.utils import get_endpoint, model_id_to_group_owner_name
from .utils.utils import (get_dataset_hub_endpoint, get_endpoint,
model_id_to_group_owner_name)

logger = get_logger()

@@ -35,7 +34,8 @@ class HubApi:

def __init__(self, endpoint=None, dataset_endpoint=None):
self.endpoint = endpoint if endpoint is not None else get_endpoint()
self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else HUB_DATASET_ENDPOINT
self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint(
)

def login(
self,
@@ -376,6 +376,27 @@ class HubApi:
f'ststoken?Revision={revision}'
return self.datahub_remote_call(datahub_url)

def get_dataset_access_config_session(
self,
cookies: CookieJar,
dataset_name: str,
namespace: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION):

datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
f'ststoken?Revision={revision}'

cookies = requests.utils.dict_from_cookiejar(cookies)
r = requests.get(url=datahub_url, cookies=cookies)
resp = r.json()
datahub_raise_on_error(datahub_url, resp)
return resp['Data']

def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
r = requests.post(url)
r.raise_for_status()

@staticmethod
def datahub_remote_call(url):
r = requests.get(url)
@@ -383,6 +404,9 @@ class HubApi:
datahub_raise_on_error(url, resp)
return resp['Data']

def check_cookies_upload_data(self, use_cookies) -> CookieJar:
return self._check_cookie(use_cookies=use_cookies)


class ModelScopeConfig:
path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)


modelscope/hub/constants.py  (+3, -1)

@@ -1,3 +1,5 @@
from pathlib import Path

MODELSCOPE_URL_SCHEME = 'http://'
DEFAULT_MODELSCOPE_DOMAIN = 'www.modelscope.cn'
DEFAULT_MODELSCOPE_DATA_ENDPOINT = MODELSCOPE_URL_SCHEME + DEFAULT_MODELSCOPE_DOMAIN
@@ -6,7 +8,7 @@ DEFAULT_MODELSCOPE_GROUP = 'damo'
MODEL_ID_SEPARATOR = '/'
FILE_HASH = 'Sha256'
LOGGER_NAME = 'ModelScopeHub'
DEFAULT_CREDENTIALS_PATH = '~/.modelscope/credentials'
DEFAULT_CREDENTIALS_PATH = Path.home().joinpath('.modelscope', 'credentials')
API_RESPONSE_FIELD_DATA = 'Data'
API_RESPONSE_FIELD_GIT_ACCESS_TOKEN = 'AccessToken'
API_RESPONSE_FIELD_USERNAME = 'Username'


modelscope/hub/errors.py  (+2, -2)

@@ -49,8 +49,8 @@ def handle_http_response(response, logger, cookies, model_id):
except HTTPError:
if cookies is None: # code in [403] and
logger.error(
f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be private. \
Please login first.')
f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
private. Please login first.')
raise




modelscope/hub/repository.py  (+111, -9)

@@ -2,7 +2,8 @@ import os
from typing import Optional

from modelscope.hub.errors import GitError, InvalidParameter, NotLoginException
from modelscope.utils.constant import DEFAULT_MODEL_REVISION
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
DEFAULT_MODEL_REVISION)
from modelscope.utils.logger import get_logger
from .api import ModelScopeConfig
from .git import GitCommandWrapper
@@ -15,14 +16,12 @@ class Repository:
"""A local representation of the model git repository.
"""

def __init__(
self,
model_dir: str,
clone_from: str,
revision: Optional[str] = DEFAULT_MODEL_REVISION,
auth_token: Optional[str] = None,
git_path: Optional[str] = None,
):
def __init__(self,
model_dir: str,
clone_from: str,
revision: Optional[str] = DEFAULT_MODEL_REVISION,
auth_token: Optional[str] = None,
git_path: Optional[str] = None):
"""
Instantiate a Repository object by cloning the remote ModelScopeHub repo
Args:
@@ -86,6 +85,7 @@ class Repository:
branch: Optional[str] = DEFAULT_MODEL_REVISION,
force: bool = False):
"""Push local files to remote, this method will do.
git pull
git add
git commit
git push
@@ -117,3 +117,105 @@ class Repository:
url=url,
local_branch=branch,
remote_branch=branch)


class DatasetRepository:
"""A local representation of the dataset (metadata) git repository.
"""

def __init__(self,
repo_work_dir: str,
dataset_id: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION,
auth_token: Optional[str] = None,
git_path: Optional[str] = None):
"""
Instantiate a Dataset Repository object by cloning the remote ModelScope dataset repo
Args:
repo_work_dir(`str`):
The dataset repo root directory.
dataset_id:
dataset id in ModelScope from which git clone
revision(`Optional[str]`):
revision of the dataset you want to clone from. Can be any of a branch, tag or commit hash
auth_token(`Optional[str]`):
token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
as the token is already saved when you login the first time, if None, we will use saved token.
git_path:(`Optional[str]`):
The git command line path, if None, we use 'git'
"""
self.dataset_id = dataset_id
self.repo_work_dir = repo_work_dir
self.repo_base_dir = os.path.dirname(repo_work_dir)
self.repo_name = os.path.basename(repo_work_dir)
self.revision = revision
if auth_token:
self.auth_token = auth_token
else:
self.auth_token = ModelScopeConfig.get_token()

self.git_wrapper = GitCommandWrapper(git_path)
os.makedirs(self.repo_work_dir, exist_ok=True)
self.repo_url = self._get_repo_url(dataset_id=dataset_id)

def clone(self) -> str:
# check local repo dir, directory not empty.
if os.listdir(self.repo_work_dir):
remote_url = self._get_remote_url()
remote_url = self.git_wrapper.remove_token_from_url(remote_url)
# no need clone again
if remote_url and remote_url == self.repo_url:
return ''

logger.info('Cloning repo from {} '.format(self.repo_url))
self.git_wrapper.clone(self.repo_base_dir, self.auth_token,
self.repo_url, self.repo_name, self.revision)
return self.repo_work_dir

def push(self,
commit_message: str,
branch: Optional[str] = DEFAULT_DATASET_REVISION,
force: bool = False):
"""Push local files to remote, this method will do.
git pull
git add
git commit
git push
Args:
commit_message (str): commit message
branch (Optional[str], optional): which branch to push.
force (Optional[bool]): whether to use forced-push.
"""
if commit_message is None or not isinstance(commit_message, str):
msg = 'commit_message must be provided!'
raise InvalidParameter(msg)

if not isinstance(force, bool):
raise InvalidParameter('force must be bool')

if not self.auth_token:
raise NotLoginException('Must login to push, please login first.')

self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token)
self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name)

remote_url = self.git_wrapper.get_repo_remote_url(self.repo_work_dir)
self.git_wrapper.pull(self.repo_work_dir)
self.git_wrapper.add(self.repo_work_dir, all_files=True)
self.git_wrapper.commit(self.repo_work_dir, commit_message)
self.git_wrapper.push(
repo_dir=self.repo_work_dir,
token=self.auth_token,
url=remote_url,
local_branch=branch,
remote_branch=branch)

def _get_repo_url(self, dataset_id):
return f'{get_endpoint()}/datasets/{dataset_id}.git'

def _get_remote_url(self):
try:
remote = self.git_wrapper.get_repo_remote_url(self.repo_work_dir)
except GitError:
remote = None
return remote
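
A hedged usage sketch of the new `DatasetRepository`; the dataset id and work directory below are placeholders, and a prior `HubApi.login()` is assumed so a saved token is available for `push()`.

```python
# Hedged sketch: clone a dataset repo and push a metadata change.
from modelscope.hub.repository import DatasetRepository

repo = DatasetRepository(
    repo_work_dir='/tmp/example_dataset',          # placeholder repo root; created if missing
    dataset_id='some_namespace/example_dataset',   # placeholder dataset id
    revision='master')
work_dir = repo.clone()                # returns '' if the same remote is already cloned
repo.push('update dataset metadata')   # pull + add + commit + push, per the docstring
```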

modelscope/hub/utils/utils.py  (+13, -4)

@@ -1,7 +1,9 @@
import hashlib
import os
from typing import Optional

from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN,
from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
DEFAULT_MODELSCOPE_DOMAIN,
DEFAULT_MODELSCOPE_GROUP,
MODEL_ID_SEPARATOR,
MODELSCOPE_URL_SCHEME)
@@ -22,14 +24,16 @@ def model_id_to_group_owner_name(model_id):
return group_or_owner, name


def get_cache_dir():
def get_cache_dir(model_id: Optional[str] = None):
"""
cache dir precedence:
function parameter > environment > ~/.cache/modelscope/hub
"""
default_cache_dir = get_default_cache_dir()
return os.getenv('MODELSCOPE_CACHE', os.path.join(default_cache_dir,
'hub'))
base_path = os.getenv('MODELSCOPE_CACHE',
os.path.join(default_cache_dir, 'hub'))
return base_path if model_id is None else os.path.join(
base_path, model_id + '/')


def get_endpoint():
@@ -38,6 +42,11 @@ def get_endpoint():
return MODELSCOPE_URL_SCHEME + modelscope_domain


def get_dataset_hub_endpoint():
return os.environ.get('HUB_DATASET_ENDPOINT',
DEFAULT_MODELSCOPE_DATA_ENDPOINT)


def compute_hash(file_path):
BUFFER_SIZE = 1024 * 64 # 64k buffer size
sha256_hash = hashlib.sha256()
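
The hunks above change cache-dir resolution and add a dataset endpoint helper; a small sketch of the resulting behavior (the printed paths are indicative, and the model id is a placeholder):

```python
# Hedged sketch of the updated helpers in modelscope/hub/utils/utils.py.
import os

from modelscope.hub.utils.utils import get_cache_dir, get_dataset_hub_endpoint

print(get_cache_dir())                        # MODELSCOPE_CACHE or <default_cache_dir>/hub
print(get_cache_dir('damo/example-model'))    # per-model subdirectory under the base path
os.environ['HUB_DATASET_ENDPOINT'] = 'http://www.modelscope.cn'
print(get_dataset_hub_endpoint())             # the env var wins over the default endpoint
```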


modelscope/metainfo.py  (+56, -0)

@@ -11,6 +11,7 @@ class Models(object):
"""
# vision models
detection = 'detection'
realtime_object_detection = 'realtime-object-detection'
scrfd = 'scrfd'
classification_model = 'ClassificationModel'
nafnet = 'nafnet'
@@ -19,7 +20,18 @@ class Models(object):
gpen = 'gpen'
product_retrieval_embedding = 'product-retrieval-embedding'
body_2d_keypoints = 'body-2d-keypoints'
body_3d_keypoints = 'body-3d-keypoints'
crowd_counting = 'HRNetCrowdCounting'
panoptic_segmentation = 'swinL-panoptic-segmentation'
image_reid_person = 'passvitb'
video_summarization = 'pgl-video-summarization'
swinL_semantic_segmentation = 'swinL-semantic-segmentation'
vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
resnet50_bert = 'resnet50-bert'

# EasyCV models
yolox = 'YOLOX'
segformer = 'Segformer'

# nlp models
bert = 'bert'
@@ -32,8 +44,10 @@ class Models(object):
space_modeling = 'space-modeling'
star = 'star'
tcrf = 'transformer-crf'
lcrf = 'lstm-crf'
bart = 'bart'
gpt3 = 'gpt3'
bert_for_ds = 'bert-for-document-segmentation'

# audio models
sambert_hifigan = 'sambert-hifigan'
@@ -48,12 +62,14 @@ class Models(object):
gemm = 'gemm-generative-multi-modal'
mplug = 'mplug'
diffusion = 'diffusion-text-to-image-synthesis'
team = 'team-multi-modal-similarity'
video_clip = 'video-clip-multi-modal-embedding'


class TaskModels(object):
# nlp task
text_classification = 'text-classification'
information_extraction = 'information-extraction'


class Heads(object):
@@ -63,6 +79,7 @@ class Heads(object):
bert_mlm = 'bert-mlm'
# roberta mlm
roberta_mlm = 'roberta-mlm'
information_extraction = 'information-extraction'


class Pipelines(object):
@@ -84,9 +101,13 @@ class Pipelines(object):
animal_recognition = 'resnet101-animal-recognition'
general_recognition = 'resnet101-general-recognition'
cmdssl_video_embedding = 'cmdssl-r2p1d_video_embedding'
hicossl_video_embedding = 'hicossl-s3dg-video_embedding'
body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image'
body_3d_keypoints = 'canonical_body-3d-keypoints_video'
human_detection = 'resnet18-human-detection'
object_detection = 'vit-object-detection'
easycv_detection = 'easycv-detection'
easycv_segmentation = 'easycv-segmentation'
salient_detection = 'u2net-salient-detection'
image_classification = 'image-classification'
face_detection = 'resnet-face-detection-scrfd10gkps'
@@ -100,6 +121,7 @@ class Pipelines(object):
image_super_resolution = 'rrdb-image-super-resolution'
face_image_generation = 'gan-face-image-generation'
product_retrieval_embedding = 'resnet50-product-retrieval-embedding'
realtime_object_detection = 'cspnet_realtime-object-detection_yolox'
face_recognition = 'ir101-face-recognition-cfglint'
image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
image2image_translation = 'image-to-image-translation'
@@ -112,6 +134,11 @@ class Pipelines(object):
tinynas_classification = 'tinynas-classification'
crowd_counting = 'hrnet-crowd-counting'
video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking'
image_panoptic_segmentation = 'image-panoptic-segmentation'
video_summarization = 'googlenet_pgl_video_summarization'
image_semantic_segmentation = 'image-semantic-segmentation'
image_reid_person = 'passvitb-image-reid-person'
movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'

# nlp tasks
sentence_similarity = 'sentence-similarity'
@@ -129,7 +156,10 @@ class Pipelines(object):
dialog_state_tracking = 'dialog-state-tracking'
zero_shot_classification = 'zero-shot-classification'
text_error_correction = 'text-error-correction'
faq_question_answering = 'faq-question-answering'
conversational_text_to_sql = 'conversational-text-to-sql'
relation_extraction = 'relation-extraction'
document_segmentation = 'document-segmentation'

# audio tasks
sambert_hifigan_tts = 'sambert-hifigan-tts'
@@ -146,8 +176,10 @@ class Pipelines(object):
visual_question_answering = 'visual-question-answering'
visual_grounding = 'visual-grounding'
visual_entailment = 'visual-entailment'
multi_modal_similarity = 'multi-modal-similarity'
text_to_image_synthesis = 'text-to-image-synthesis'
video_multi_modal_embedding = 'video-multi-modal-embedding'
image_text_retrieval = 'image-text-retrieval'


class Trainers(object):
@@ -161,6 +193,7 @@ class Trainers(object):
"""

default = 'trainer'
easycv = 'easycv'

# multi-modal trainers
clip_multi_modal_embedding = 'clip-multi-modal-embedding'
@@ -169,12 +202,17 @@ class Trainers(object):
# cv trainers
image_instance_segmentation = 'image-instance-segmentation'
image_portrait_enhancement = 'image-portrait-enhancement'
video_summarization = 'video-summarization'
movie_scene_segmentation = 'movie-scene-segmentation'

# nlp trainers
bert_sentiment_analysis = 'bert-sentiment-analysis'
nlp_base_trainer = 'nlp-base-trainer'
nlp_veco_trainer = 'nlp-veco-trainer'

# audio trainers
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'


class Preprocessors(object):
""" Names for different preprocessor.
@@ -193,6 +231,8 @@ class Preprocessors(object):
image_color_enhance_preprocessor = 'image-color-enhance-preprocessor'
image_instance_segmentation_preprocessor = 'image-instance-segmentation-preprocessor'
image_portrait_enhancement_preprocessor = 'image-portrait-enhancement-preprocessor'
video_summarization_preprocessor = 'video-summarization-preprocessor'
movie_scene_segmentation_preprocessor = 'movie-scene-segmentation-preprocessor'

# nlp preprocessor
sen_sim_tokenizer = 'sen-sim-tokenizer'
@@ -210,7 +250,10 @@ class Preprocessors(object):
text_error_correction = 'text-error-correction'
word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor'
fill_mask = 'fill-mask'
faq_question_answering_preprocessor = 'faq-question-answering-preprocessor'
conversational_text_to_sql = 'conversational-text-to-sql'
re_tokenizer = 're-tokenizer'
document_segmentation = 'document-segmentation'

# audio preprocessor
linear_aec_fbank = 'linear-aec-fbank'
@@ -229,6 +272,7 @@ class Metrics(object):

# accuracy
accuracy = 'accuracy'
audio_noise_metric = 'audio-noise-metric'

# metrics for image denoise task
image_denoise_metric = 'image-denoise-metric'
@@ -245,6 +289,9 @@ class Metrics(object):
image_color_enhance_metric = 'image-color-enhance-metric'
# metrics for image-portrait-enhancement task
image_portrait_enhancement_metric = 'image-portrait-enhancement-metric'
video_summarization_metric = 'video-summarization-metric'
# metric for movie-scene-segmentation task
movie_scene_segmentation_metric = 'movie-scene-segmentation-metric'


class Optimizers(object):
@@ -294,3 +341,12 @@ class LR_Schedulers(object):
LinearWarmup = 'LinearWarmup'
ConstantWarmup = 'ConstantWarmup'
ExponentialWarmup = 'ExponentialWarmup'


class Datasets(object):
""" Names for different datasets.
"""
ClsDataset = 'ClsDataset'
SegDataset = 'SegDataset'
DetDataset = 'DetDataset'
DetImagesMixDataset = 'DetImagesMixDataset'
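
These registry names are plain string constants; a short sketch of how the new entries are referenced (the printed values echo the assignments above):

```python
# Hedged sketch: the new metainfo constants are just strings used as registry keys.
from modelscope.metainfo import Metrics, Pipelines, Trainers

print(Pipelines.movie_scene_segmentation)     # 'resnet50-bert-movie-scene-segmentation'
print(Metrics.video_summarization_metric)     # 'video-summarization-metric'
print(Trainers.speech_frcrn_ans_cirm_16k)     # 'speech_frcrn_ans_cirm_16k'
```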

modelscope/metrics/__init__.py  (+6, -0)

@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .audio_noise_metric import AudioNoiseMetric
from .base import Metric
from .builder import METRICS, build_metric, task_default_metrics
from .image_color_enhance_metric import ImageColorEnhanceMetric
@@ -14,9 +15,12 @@ if TYPE_CHECKING:
from .sequence_classification_metric import SequenceClassificationMetric
from .text_generation_metric import TextGenerationMetric
from .token_classification_metric import TokenClassificationMetric
from .video_summarization_metric import VideoSummarizationMetric
from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric

else:
_import_structure = {
'audio_noise_metric': ['AudioNoiseMetric'],
'base': ['Metric'],
'builder': ['METRICS', 'build_metric', 'task_default_metrics'],
'image_color_enhance_metric': ['ImageColorEnhanceMetric'],
@@ -28,6 +32,8 @@ else:
'sequence_classification_metric': ['SequenceClassificationMetric'],
'text_generation_metric': ['TextGenerationMetric'],
'token_classification_metric': ['TokenClassificationMetric'],
'video_summarization_metric': ['VideoSummarizationMetric'],
'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'],
}

import sys


modelscope/metrics/audio_noise_metric.py  (+38, -0)

@@ -0,0 +1,38 @@
from typing import Dict

from modelscope.metainfo import Metrics
from modelscope.metrics.base import Metric
from modelscope.metrics.builder import METRICS, MetricKeys
from modelscope.utils.registry import default_group


@METRICS.register_module(
group_key=default_group, module_name=Metrics.audio_noise_metric)
class AudioNoiseMetric(Metric):
"""
The metric computation class for acoustic noise suppression task.
"""

def __init__(self):
self.loss = []
self.amp_loss = []
self.phase_loss = []
self.sisnr = []

def add(self, outputs: Dict, inputs: Dict):
self.loss.append(outputs['loss'].data.cpu())
self.amp_loss.append(outputs['amp_loss'].data.cpu())
self.phase_loss.append(outputs['phase_loss'].data.cpu())
self.sisnr.append(outputs['sisnr'].data.cpu())

def evaluate(self):
avg_loss = sum(self.loss) / len(self.loss)
avg_sisnr = sum(self.sisnr) / len(self.sisnr)
avg_amp = sum(self.amp_loss) / len(self.amp_loss)
avg_phase = sum(self.phase_loss) / len(self.phase_loss)
total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr
return {
'total_loss': total_loss.item(),
'avg_sisnr': avg_sisnr.item(),
MetricKeys.AVERAGE_LOSS: avg_loss.item()
}
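
A hedged sketch of the add/evaluate cycle for the metric above; the tensor values are illustrative only.

```python
# Hedged sketch: feed one batch of illustrative loss tensors and evaluate.
import torch

from modelscope.metrics.audio_noise_metric import AudioNoiseMetric

metric = AudioNoiseMetric()
outputs = {'loss': torch.tensor(0.8), 'amp_loss': torch.tensor(0.3),
           'phase_loss': torch.tensor(0.2), 'sisnr': torch.tensor(-9.5)}
metric.add(outputs, inputs={})
print(metric.evaluate())   # total_loss, avg_sisnr and avg_loss averaged over all batches
```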

modelscope/metrics/builder.py  (+14, -4)

@@ -1,4 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Dict, Mapping, Union

from modelscope.metainfo import Metrics
from modelscope.utils.config import ConfigDict
@@ -15,6 +16,8 @@ class MetricKeys(object):
RECALL = 'recall'
PSNR = 'psnr'
SSIM = 'ssim'
AVERAGE_LOSS = 'avg_loss'
FScore = 'fscore'


task_default_metrics = {
@@ -28,19 +31,26 @@ task_default_metrics = {
Tasks.image_color_enhancement: [Metrics.image_color_enhance_metric],
Tasks.image_portrait_enhancement:
[Metrics.image_portrait_enhancement_metric],
Tasks.video_summarization: [Metrics.video_summarization_metric],
Tasks.image_captioning: [Metrics.text_gen_metric],
Tasks.visual_question_answering: [Metrics.text_gen_metric],
Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric],
}


def build_metric(metric_name: str,
def build_metric(metric_cfg: Union[str, Dict],
field: str = default_group,
default_args: dict = None):
""" Build metric given metric_name and field.

Args:
metric_name (:obj:`str`): The metric name.
metric_cfg (str | dict): The metric name or a metric config dict containing a 'type' key.
field (str, optional): The field of this metric, default value: 'default' for all fields.
default_args (dict, optional): Default initialization arguments.
"""
cfg = ConfigDict({'type': metric_name})
if isinstance(metric_cfg, Mapping):
assert 'type' in metric_cfg
else:
metric_cfg = ConfigDict({'type': metric_cfg})
return build_from_cfg(
cfg, METRICS, group_key=field, default_args=default_args)
metric_cfg, METRICS, group_key=field, default_args=default_args)
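
With this change, `build_metric` accepts either a registered metric name or a config dict carrying a `type` key; a quick sketch of both call forms:

```python
# Hedged sketch: the two call forms supported by the updated build_metric.
from modelscope.metrics.builder import build_metric

metric_by_name = build_metric('accuracy')            # plain metric name, as before
metric_by_cfg = build_metric({'type': 'accuracy'})   # config dict with a 'type' key
```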

modelscope/metrics/movie_scene_segmentation_metric.py  (+52, -0)

@@ -0,0 +1,52 @@
from typing import Dict

import numpy as np

from modelscope.metainfo import Metrics
from modelscope.utils.registry import default_group
from modelscope.utils.tensor_utils import (torch_nested_detach,
torch_nested_numpify)
from .base import Metric
from .builder import METRICS, MetricKeys


@METRICS.register_module(
group_key=default_group,
module_name=Metrics.movie_scene_segmentation_metric)
class MovieSceneSegmentationMetric(Metric):
"""The metric computation class for movie scene segmentation classes.
"""

def __init__(self):
self.preds = []
self.labels = []
self.eps = 1e-5

def add(self, outputs: Dict, inputs: Dict):
preds = outputs['pred']
labels = inputs['label']
self.preds.extend(preds)
self.labels.extend(labels)

def evaluate(self):
gts = np.array(torch_nested_numpify(torch_nested_detach(self.labels)))
prob = np.array(torch_nested_numpify(torch_nested_detach(self.preds)))

gt_one = gts == 1
gt_zero = gts == 0
pred_one = prob == 1
pred_zero = prob == 0

tp = (gt_one * pred_one).sum()
fp = (gt_zero * pred_one).sum()
fn = (gt_one * pred_zero).sum()

precision = 100.0 * tp / (tp + fp + self.eps)
recall = 100.0 * tp / (tp + fn + self.eps)
f1 = 2 * precision * recall / (precision + recall)

return {
MetricKeys.F1: f1,
MetricKeys.RECALL: recall,
MetricKeys.PRECISION: precision
}
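
A hedged sketch of the precision/recall/F1 computation above on a toy batch; the tensors are illustrative only.

```python
# Hedged sketch: one toy batch of shot-boundary predictions vs. labels.
import torch

from modelscope.metrics.movie_scene_segmentation_metric import MovieSceneSegmentationMetric

metric = MovieSceneSegmentationMetric()
metric.add(outputs={'pred': torch.tensor([1, 0, 1, 0])},
           inputs={'label': torch.tensor([1, 1, 1, 0])})
print(metric.evaluate())   # f1, recall and precision, in percent
```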

modelscope/metrics/video_summarization_metric.py  (+78, -0)

@@ -0,0 +1,78 @@
from typing import Dict

import numpy as np

from modelscope.metainfo import Metrics
from modelscope.models.cv.video_summarization.summarizer import \
generate_summary
from modelscope.utils.registry import default_group
from .base import Metric
from .builder import METRICS, MetricKeys


def evaluate_summary(predicted_summary, user_summary, eval_method):
""" Compare the predicted summary with the user defined one(s).

:param ndarray predicted_summary: The generated summary from our model.
:param ndarray user_summary: The user defined ground truth summaries (or summary).
:param str eval_method: The proposed evaluation method; either 'max' (SumMe) or 'avg' (TVSum).
:return: The reduced fscore based on the eval_method
"""
max_len = max(len(predicted_summary), user_summary.shape[1])
S = np.zeros(max_len, dtype=int)
G = np.zeros(max_len, dtype=int)
S[:len(predicted_summary)] = predicted_summary

f_scores = []
for user in range(user_summary.shape[0]):
G[:user_summary.shape[1]] = user_summary[user]
overlapped = S & G

# Compute precision, recall, f-score
precision = sum(overlapped) / sum(S)
recall = sum(overlapped) / sum(G)
if precision + recall == 0:
f_scores.append(0)
else:
f_score = 2 * precision * recall * 100 / (precision + recall)
f_scores.append(f_score)

if eval_method == 'max':
return max(f_scores)
else:
return sum(f_scores) / len(f_scores)


def calculate_f_score(outputs: Dict, inputs: Dict):
scores = outputs['scores']
scores = scores.squeeze(0).cpu().numpy().tolist()
user_summary = inputs['user_summary'].cpu().numpy()[0]
sb = inputs['change_points'].cpu().numpy()[0]
n_frames = inputs['n_frames'].cpu().numpy()[0]
positions = inputs['positions'].cpu().numpy()[0]
summary = generate_summary([sb], [scores], [n_frames], [positions])[0]
f_score = evaluate_summary(summary, user_summary, 'avg')
return f_score


@METRICS.register_module(
group_key=default_group, module_name=Metrics.video_summarization_metric)
class VideoSummarizationMetric(Metric):
"""The metric for video summarization task.
"""

def __init__(self):
self.inputs = []
self.outputs = []

def add(self, outputs: Dict, inputs: Dict):
self.outputs.append(outputs)
self.inputs.append(inputs)

def evaluate(self):
f_scores = [
calculate_f_score(output, input)
for output, input in zip(self.outputs, self.inputs)
]

return {MetricKeys.FScore: sum(f_scores) / len(f_scores)}
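
Since `evaluate_summary` carries the core of the metric, here is a small worked example under the 'avg' reduction; the arrays are illustrative only.

```python
# Hedged worked example for evaluate_summary as defined above.
import numpy as np

from modelscope.metrics.video_summarization_metric import evaluate_summary

predicted = np.array([1, 1, 0, 0, 1])
user_summaries = np.array([[1, 0, 0, 0, 1],    # overlap 2 -> f-score 80.0
                           [1, 1, 1, 0, 0]])   # overlap 2 -> f-score ~66.7
print(evaluate_summary(predicted, user_summaries, 'avg'))   # ~73.3
```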

modelscope/models/audio/ans/frcrn.py  (+32, -21)

@@ -75,27 +75,37 @@ class FRCRNModel(TorchModel):
model_bin_file = os.path.join(model_dir,
ModelFile.TORCH_MODEL_BIN_FILE)
if os.path.exists(model_bin_file):
checkpoint = torch.load(model_bin_file)
self.model.load_state_dict(checkpoint, strict=False)
checkpoint = torch.load(
model_bin_file, map_location=torch.device('cpu'))
if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
self.model.load_state_dict(
checkpoint['state_dict'], strict=False)
else:
self.model.load_state_dict(checkpoint, strict=False)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
output = self.model.forward(input)
return {
'spec_l1': output[0],
'wav_l1': output[1],
'mask_l1': output[2],
'spec_l2': output[3],
'wav_l2': output[4],
'mask_l2': output[5]
result_list = self.model.forward(input['noisy'])
output = {
'spec_l1': result_list[0],
'wav_l1': result_list[1],
'mask_l1': result_list[2],
'spec_l2': result_list[3],
'wav_l2': result_list[4],
'mask_l2': result_list[5]
}

def to(self, *args, **kwargs):
self.model = self.model.to(*args, **kwargs)
return self

def eval(self):
self.model = self.model.train(False)
return self
if 'clean' in input:
mix_result = self.model.loss(
input['noisy'], input['clean'], result_list, mode='Mix')
output.update(mix_result)
sisnr_result = self.model.loss(
input['noisy'], input['clean'], result_list, mode='SiSNR')
output.update(sisnr_result)
# logger hooker will use items under 'log_vars'
output['log_vars'] = {k: mix_result[k].item() for k in mix_result}
output['log_vars'].update(
{k: sisnr_result[k].item()
for k in sisnr_result})
return output


class FRCRN(nn.Module):
@@ -110,7 +120,8 @@ class FRCRN(nn.Module):
win_len=400,
win_inc=100,
fft_len=512,
win_type='hanning'):
win_type='hanning',
**kwargs):
r"""
Args:
complex: Whether to use complex networks.
@@ -236,7 +247,7 @@ class FRCRN(nn.Module):
if count != 3:
loss = self.loss_1layer(noisy, est_spec, est_wav, labels,
est_mask, mode)
return loss
return dict(sisnr=loss)

elif mode == 'Mix':
count = 0
@@ -251,7 +262,7 @@ class FRCRN(nn.Module):
amp_loss, phase_loss, SiSNR_loss = self.loss_1layer(
noisy, est_spec, est_wav, labels, est_mask, mode)
loss = amp_loss + phase_loss + SiSNR_loss
return loss, amp_loss, phase_loss
return dict(loss=loss, amp_loss=amp_loss, phase_loss=phase_loss)

def loss_1layer(self, noisy, est, est_wav, labels, cmp_mask, mode='Mix'):
r""" Compute the loss by mode


modelscope/models/audio/kws/farfield/model.py  (+1, -0)

@@ -33,6 +33,7 @@ class FSMNSeleNetV2Decorator(TorchModel):
ModelFile.TORCH_MODEL_BIN_FILE)
self._model = None
if os.path.exists(model_bin_file):
kwargs.pop('device')
self._model = FSMNSeleNetV2(*args, **kwargs)
checkpoint = torch.load(model_bin_file)
self._model.load_state_dict(checkpoint, strict=False)


modelscope/models/base/base_model.py  (+40, -9)

@@ -1,15 +1,15 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import os.path as osp
from abc import ABC, abstractmethod
from typing import Dict, Optional, Union

import numpy as np
from typing import Callable, Dict, List, Optional, Union

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models.builder import build_model
from modelscope.utils.checkpoint import save_pretrained
from modelscope.utils.config import Config
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
from modelscope.utils.device import device_placement, verify_device
from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.hub import parse_label_mapping
from modelscope.utils.logger import get_logger
@@ -24,8 +24,7 @@ class Model(ABC):
def __init__(self, model_dir, *args, **kwargs):
self.model_dir = model_dir
device_name = kwargs.get('device', 'gpu')
assert device_name in ['gpu',
'cpu'], 'device should be either cpu or gpu.'
verify_device(device_name)
self._device_name = device_name

def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
@@ -72,6 +71,7 @@ class Model(ABC):
model_name_or_path: str,
revision: Optional[str] = DEFAULT_MODEL_REVISION,
cfg_dict: Config = None,
device: str = None,
*model_args,
**kwargs):
""" Instantiate a model from local directory or remote model repo. Note
@@ -97,7 +97,7 @@ class Model(ABC):
osp.join(local_model_dir, ModelFile.CONFIGURATION))
task_name = cfg.task
model_cfg = cfg.model
# TODO @wenmeng.zwm may should manually initialize model after model building
framework = cfg.framework

if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'):
model_cfg.type = model_cfg.model_type
@@ -105,10 +105,41 @@ class Model(ABC):
model_cfg.model_dir = local_model_dir
for k, v in kwargs.items():
model_cfg[k] = v
model = build_model(
model_cfg, task_name=task_name, default_args=kwargs)
if device is not None:
model_cfg.device = device
with device_placement(framework, device):
model = build_model(
model_cfg, task_name=task_name, default_args=kwargs)
else:
model = build_model(
model_cfg, task_name=task_name, default_args=kwargs)

# dynamically add pipeline info to model for pipeline inference
if hasattr(cfg, 'pipeline'):
model.pipeline = cfg.pipeline
return model

def save_pretrained(self,
target_folder: Union[str, os.PathLike],
save_checkpoint_names: Union[str, List[str]] = None,
save_function: Callable = None,
config: Optional[dict] = None,
**kwargs):
"""save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded

Args:
target_folder (Union[str, os.PathLike]):
Directory to which to save. Will be created if it doesn't exist.

save_checkpoint_names (Union[str, List[str]]):
The checkpoint names to be saved in the target_folder

save_function (Callable, optional):
The function to use to save the state dictionary.

config (Optional[dict], optional):
The config for the configuration.json, might not be identical with model.config

"""
save_pretrained(self, target_folder, save_checkpoint_names,
save_function, config, **kwargs)
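
A hedged sketch of the extended entry points above; the model id is a placeholder, and `Model` is assumed to be exported from `modelscope.models` as in the rest of the library.

```python
# Hedged sketch: pass a device through from_pretrained and export with save_pretrained.
from modelscope.models import Model

model = Model.from_pretrained('damo/example-model', device='cpu')   # placeholder model id
model.save_pretrained('./exported_model',
                      save_checkpoint_names='pytorch_model.bin')
```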

modelscope/models/cv/__init__.py  (+15, -7)

@@ -1,9 +1,17 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

# yapf: disable
from . import (action_recognition, animal_recognition, body_2d_keypoints,
cartoon, cmdssl_video_embedding, crowd_counting, face_detection,
face_generation, image_classification, image_color_enhance,
image_colorization, image_denoise, image_instance_segmentation,
image_portrait_enhancement, image_to_image_generation,
image_to_image_translation, object_detection,
product_retrieval_embedding, salient_detection,
super_resolution, video_single_object_tracking, virual_tryon)
body_3d_keypoints, cartoon, cmdssl_video_embedding,
crowd_counting, face_detection, face_generation,
image_classification, image_color_enhance, image_colorization,
image_denoise, image_instance_segmentation,
image_panoptic_segmentation, image_portrait_enhancement,
image_reid_person, image_semantic_segmentation,
image_to_image_generation, image_to_image_translation,
movie_scene_segmentation, object_detection,
product_retrieval_embedding, realtime_object_detection,
salient_detection, super_resolution,
video_single_object_tracking, video_summarization, virual_tryon)

# yapf: enable

+ 43
- 2
modelscope/models/cv/action_recognition/models.py View File

@@ -1,5 +1,6 @@
import torch.nn as nn

from .s3dg import Inception3D
from .tada_convnext import TadaConvNeXt


@@ -26,11 +27,25 @@ class BaseVideoModel(nn.Module):
super(BaseVideoModel, self).__init__()
# the backbone is created according to meta-architectures
# defined in models/base/backbone.py
self.backbone = TadaConvNeXt(cfg)
if cfg.MODEL.NAME == 'ConvNeXt_tiny':
self.backbone = TadaConvNeXt(cfg)
elif cfg.MODEL.NAME == 'S3DG':
self.backbone = Inception3D(cfg)
else:
error_str = 'backbone {} is not supported; only ConvNeXt_tiny and S3DG are supported'.format(
    cfg.MODEL.NAME)
raise NotImplementedError(error_str)

# the head is created according to the heads
# defined in models/module_zoo/heads
self.head = BaseHead(cfg)
if cfg.VIDEO.HEAD.NAME == 'BaseHead':
self.head = BaseHead(cfg)
elif cfg.VIDEO.HEAD.NAME == 'AvgHead':
self.head = AvgHead(cfg)
else:
error_str = 'head {} is not supported; only BaseHead and AvgHead are supported'.format(
    cfg.VIDEO.HEAD.NAME)
raise NotImplementedError(error_str)

def forward(self, x):
x = self.backbone(x)
@@ -88,3 +103,29 @@ class BaseHead(nn.Module):
out = self.activation(out)
out = out.view(out.shape[0], -1)
return out, x.view(x.shape[0], -1)


class AvgHead(nn.Module):
"""
Constructs a global average pooling head.
"""

def __init__(
self,
cfg,
):
"""
Args:
cfg (Config): global config object.
"""
super(AvgHead, self).__init__()
self.cfg = cfg
self.global_avg_pool = nn.AdaptiveAvgPool3d(1)

def forward(self, x):
if len(x.shape) == 5:
x = self.global_avg_pool(x)
# (N, C, T, H, W) -> (N, T, H, W, C).
x = x.permute((0, 2, 3, 4, 1))
out = x.view(x.shape[0], -1)
return out, x.view(x.shape[0], -1)
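
A sketch of how the new switches are meant to be driven from the config; the SimpleNamespace stand-in and its values are illustrative assumptions, not the real ModelScope config class.

from types import SimpleNamespace

# Mock config selecting the newly added S3DG backbone and AvgHead.
cfg = SimpleNamespace(
    MODEL=SimpleNamespace(NAME='S3DG'),
    VIDEO=SimpleNamespace(HEAD=SimpleNamespace(NAME='AvgHead')),
)
# With these fields, BaseVideoModel(cfg) builds Inception3D + AvgHead; any other NAME
# raises NotImplementedError as implemented above. (A real config additionally carries
# the DATA/BN fields read by the backbone itself.)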

+ 301
- 0
modelscope/models/cv/action_recognition/s3dg.py View File

@@ -0,0 +1,301 @@
import torch
import torch.nn as nn


class InceptionBaseConv3D(nn.Module):
"""
Constructs basic inception 3D conv.
Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
"""

def __init__(self,
cfg,
in_planes,
out_planes,
kernel_size,
stride,
padding=0):
super(InceptionBaseConv3D, self).__init__()
self.conv = nn.Conv3d(
in_planes,
out_planes,
kernel_size=kernel_size,
stride=stride,
padding=padding,
bias=False)
self.bn = nn.BatchNorm3d(out_planes)
self.relu = nn.ReLU(inplace=True)

# init
self.conv.weight.data.normal_(
mean=0, std=0.01) # original s3d is truncated normal within 2 std
self.bn.weight.data.fill_(1)
self.bn.bias.data.zero_()

def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.relu(x)
return x


class InceptionBlock3D(nn.Module):
"""
Element constructing the S3D/S3DG.
See models/base/backbone.py L99-186.

Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
"""

def __init__(self, cfg, in_planes, out_planes):
super(InceptionBlock3D, self).__init__()

_gating = cfg.VIDEO.BACKBONE.BRANCH.GATING

assert len(out_planes) == 6
assert isinstance(out_planes, list)

[
num_out_0_0a, num_out_1_0a, num_out_1_0b, num_out_2_0a,
num_out_2_0b, num_out_3_0b
] = out_planes

self.branch0 = nn.Sequential(
InceptionBaseConv3D(
cfg, in_planes, num_out_0_0a, kernel_size=1, stride=1), )
self.branch1 = nn.Sequential(
InceptionBaseConv3D(
cfg, in_planes, num_out_1_0a, kernel_size=1, stride=1),
STConv3d(
cfg,
num_out_1_0a,
num_out_1_0b,
kernel_size=3,
stride=1,
padding=1),
)
self.branch2 = nn.Sequential(
InceptionBaseConv3D(
cfg, in_planes, num_out_2_0a, kernel_size=1, stride=1),
STConv3d(
cfg,
num_out_2_0a,
num_out_2_0b,
kernel_size=3,
stride=1,
padding=1),
)
self.branch3 = nn.Sequential(
nn.MaxPool3d(kernel_size=(3, 3, 3), stride=1, padding=1),
InceptionBaseConv3D(
cfg, in_planes, num_out_3_0b, kernel_size=1, stride=1),
)

self.out_channels = sum(
[num_out_0_0a, num_out_1_0b, num_out_2_0b, num_out_3_0b])

self.gating = _gating
if _gating:
self.gating_b0 = SelfGating(num_out_0_0a)
self.gating_b1 = SelfGating(num_out_1_0b)
self.gating_b2 = SelfGating(num_out_2_0b)
self.gating_b3 = SelfGating(num_out_3_0b)

def forward(self, x):
x0 = self.branch0(x)
x1 = self.branch1(x)
x2 = self.branch2(x)
x3 = self.branch3(x)
if self.gating:
x0 = self.gating_b0(x0)
x1 = self.gating_b1(x1)
x2 = self.gating_b2(x2)
x3 = self.gating_b3(x3)

out = torch.cat((x0, x1, x2, x3), 1)

return out


class SelfGating(nn.Module):

def __init__(self, input_dim):
super(SelfGating, self).__init__()
self.fc = nn.Linear(input_dim, input_dim)

def forward(self, input_tensor):
"""Feature gating as used in S3D-G"""
spatiotemporal_average = torch.mean(input_tensor, dim=[2, 3, 4])
weights = self.fc(spatiotemporal_average)
weights = torch.sigmoid(weights)
return weights[:, :, None, None, None] * input_tensor


class STConv3d(nn.Module):
"""
Element constructing the S3D/S3DG.
See models/base/backbone.py L99-186.

Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
"""

def __init__(self,
cfg,
in_planes,
out_planes,
kernel_size,
stride,
padding=0):
super(STConv3d, self).__init__()
if isinstance(stride, tuple):
t_stride = stride[0]
stride = stride[-1]
else: # int
t_stride = stride

self.bn_mmt = cfg.BN.MOMENTUM
self.bn_eps = float(cfg.BN.EPS)
self._construct_branch(cfg, in_planes, out_planes, kernel_size, stride,
t_stride, padding)

def _construct_branch(self,
cfg,
in_planes,
out_planes,
kernel_size,
stride,
t_stride,
padding=0):
self.conv1 = nn.Conv3d(
in_planes,
out_planes,
kernel_size=(1, kernel_size, kernel_size),
stride=(1, stride, stride),
padding=(0, padding, padding),
bias=False)
self.conv2 = nn.Conv3d(
out_planes,
out_planes,
kernel_size=(kernel_size, 1, 1),
stride=(t_stride, 1, 1),
padding=(padding, 0, 0),
bias=False)

self.bn1 = nn.BatchNorm3d(
out_planes, eps=self.bn_eps, momentum=self.bn_mmt)
self.bn2 = nn.BatchNorm3d(
out_planes, eps=self.bn_eps, momentum=self.bn_mmt)
self.relu = nn.ReLU(inplace=True)

# init
self.conv1.weight.data.normal_(
mean=0, std=0.01) # original s3d is truncated normal within 2 std
self.conv2.weight.data.normal_(
mean=0, std=0.01) # original s3d is truncated normal within 2 std
self.bn1.weight.data.fill_(1)
self.bn1.bias.data.zero_()
self.bn2.weight.data.fill_(1)
self.bn2.bias.data.zero_()

def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = self.relu(x)
return x


class Inception3D(nn.Module):
"""
Backbone architecture for I3D/S3DG.
Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
"""

def __init__(self, cfg):
"""
Args:
cfg (Config): global config object.
"""
super(Inception3D, self).__init__()
_input_channel = cfg.DATA.NUM_INPUT_CHANNELS
self._construct_backbone(cfg, _input_channel)

def _construct_backbone(self, cfg, input_channel):
# ------------------- Block 1 -------------------
self.Conv_1a = STConv3d(
cfg, input_channel, 64, kernel_size=7, stride=2, padding=3)

self.block1 = nn.Sequential(self.Conv_1a) # (64, 32, 112, 112)

# ------------------- Block 2 -------------------
self.MaxPool_2a = nn.MaxPool3d(
kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
self.Conv_2b = InceptionBaseConv3D(
cfg, 64, 64, kernel_size=1, stride=1)
self.Conv_2c = STConv3d(
cfg, 64, 192, kernel_size=3, stride=1, padding=1)

self.block2 = nn.Sequential(
self.MaxPool_2a, # (64, 32, 56, 56)
self.Conv_2b, # (64, 32, 56, 56)
self.Conv_2c) # (192, 32, 56, 56)

# ------------------- Block 3 -------------------
self.MaxPool_3a = nn.MaxPool3d(
kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
self.Mixed_3b = InceptionBlock3D(
cfg, in_planes=192, out_planes=[64, 96, 128, 16, 32, 32])
self.Mixed_3c = InceptionBlock3D(
cfg, in_planes=256, out_planes=[128, 128, 192, 32, 96, 64])

self.block3 = nn.Sequential(
self.MaxPool_3a, # (192, 32, 28, 28)
self.Mixed_3b, # (256, 32, 28, 28)
self.Mixed_3c) # (480, 32, 28, 28)

# ------------------- Block 4 -------------------
self.MaxPool_4a = nn.MaxPool3d(
kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))
self.Mixed_4b = InceptionBlock3D(
cfg, in_planes=480, out_planes=[192, 96, 208, 16, 48, 64])
self.Mixed_4c = InceptionBlock3D(
cfg, in_planes=512, out_planes=[160, 112, 224, 24, 64, 64])
self.Mixed_4d = InceptionBlock3D(
cfg, in_planes=512, out_planes=[128, 128, 256, 24, 64, 64])
self.Mixed_4e = InceptionBlock3D(
cfg, in_planes=512, out_planes=[112, 144, 288, 32, 64, 64])
self.Mixed_4f = InceptionBlock3D(
cfg, in_planes=528, out_planes=[256, 160, 320, 32, 128, 128])

self.block4 = nn.Sequential(
self.MaxPool_4a, # (480, 16, 14, 14)
self.Mixed_4b, # (512, 16, 14, 14)
self.Mixed_4c, # (512, 16, 14, 14)
self.Mixed_4d, # (512, 16, 14, 14)
self.Mixed_4e, # (528, 16, 14, 14)
self.Mixed_4f) # (832, 16, 14, 14)

# ------------------- Block 5 -------------------
self.MaxPool_5a = nn.MaxPool3d(
kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 0, 0))
self.Mixed_5b = InceptionBlock3D(
cfg, in_planes=832, out_planes=[256, 160, 320, 32, 128, 128])
self.Mixed_5c = InceptionBlock3D(
cfg, in_planes=832, out_planes=[384, 192, 384, 48, 128, 128])

self.block5 = nn.Sequential(
self.MaxPool_5a, # (832, 8, 7, 7)
self.Mixed_5b, # (832, 8, 7, 7)
self.Mixed_5c) # (1024, 8, 7, 7)

def forward(self, x):
if isinstance(x, dict):
x = x['video']
x = self.block1(x)
x = self.block2(x)
x = self.block3(x)
x = self.block4(x)
x = self.block5(x)
return x
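
A minimal sketch of pushing a dummy clip through the new Inception3D backbone; the nested config mirrors the fields the module actually reads (DATA.NUM_INPUT_CHANNELS, VIDEO.BACKBONE.BRANCH.GATING, BN.MOMENTUM, BN.EPS), with illustrative values.

import torch
from types import SimpleNamespace

cfg = SimpleNamespace(
    DATA=SimpleNamespace(NUM_INPUT_CHANNELS=3),
    VIDEO=SimpleNamespace(BACKBONE=SimpleNamespace(BRANCH=SimpleNamespace(GATING=True))),
    BN=SimpleNamespace(MOMENTUM=0.1, EPS=1e-5),
)
backbone = Inception3D(cfg)
clip = torch.randn(1, 3, 64, 224, 224)   # (N, C, T, H, W)
feat = backbone(clip)                    # shape (1, 1024, 8, 7, 7), matching the comments above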

+ 23
- 0
modelscope/models/cv/body_3d_keypoints/__init__.py View File

@@ -0,0 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:

from .body_3d_pose import BodyKeypointsDetection3D

else:
_import_structure = {
'body_3d_pose': ['BodyKeypointsDetection3D'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)
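
The other new CV packages below register themselves the same way; the LazyImportModule only imports the submodule when the exported symbol is first accessed, e.g.:

# body_3d_pose (and its torch dependencies) are loaded on this first access:
from modelscope.models.cv.body_3d_keypoints import BodyKeypointsDetection3D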

+ 246
- 0
modelscope/models/cv/body_3d_keypoints/body_3d_pose.py View File

@@ -0,0 +1,246 @@
import logging
import os.path as osp
from typing import Any, Dict, List, Union

import numpy as np
import torch

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.body_3d_keypoints.canonical_pose_modules import (
TemporalModel, TransCan3Dkeys)
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['BodyKeypointsDetection3D']


class KeypointsTypes(object):
POSES_CAMERA = 'poses_camera'
POSES_TRAJ = 'poses_traj'


@MODELS.register_module(
Tasks.body_3d_keypoints, module_name=Models.body_3d_keypoints)
class BodyKeypointsDetection3D(TorchModel):

def __init__(self, model_dir: str, *args, **kwargs):

super().__init__(model_dir, *args, **kwargs)

self.model_dir = model_dir
model_path = osp.join(self.model_dir, ModelFile.TORCH_MODEL_FILE)
cfg_path = osp.join(self.model_dir, ModelFile.CONFIGURATION)
self.cfg = Config.from_file(cfg_path)
self._create_model()

if not osp.exists(model_path):
raise IOError(f'{model_path} does not exist.')

if torch.cuda.is_available():
self._device = torch.device('cuda')
else:
self._device = torch.device('cpu')
self.pretrained_state_dict = torch.load(
model_path, map_location=self._device)

self.load_pretrained()
self.to_device(self._device)
self.eval()

def _create_model(self):
self.model_pos = TemporalModel(
self.cfg.model.MODEL.IN_NUM_JOINTS,
self.cfg.model.MODEL.IN_2D_FEATURE,
self.cfg.model.MODEL.OUT_NUM_JOINTS,
filter_widths=self.cfg.model.MODEL.FILTER_WIDTHS,
causal=self.cfg.model.MODEL.CAUSAL,
dropout=self.cfg.model.MODEL.DROPOUT,
channels=self.cfg.model.MODEL.CHANNELS,
dense=self.cfg.model.MODEL.DENSE)

receptive_field = self.model_pos.receptive_field()
self.pad = (receptive_field - 1) // 2
if self.cfg.model.MODEL.CAUSAL:
self.causal_shift = self.pad
else:
self.causal_shift = 0

self.model_traj = TransCan3Dkeys(
in_channels=self.cfg.model.MODEL.IN_NUM_JOINTS
* self.cfg.model.MODEL.IN_2D_FEATURE,
num_features=1024,
out_channels=self.cfg.model.MODEL.OUT_3D_FEATURE,
num_blocks=4,
time_window=receptive_field)

def eval(self):
self.model_pos.eval()
self.model_traj.eval()

def train(self):
self.model_pos.train()
self.model_traj.train()

def to_device(self, device):
self.model_pos = self.model_pos.to(device)
self.model_traj = self.model_traj.to(device)

def load_pretrained(self):
if 'model_pos' in self.pretrained_state_dict:
self.model_pos.load_state_dict(
self.pretrained_state_dict['model_pos'], strict=False)
else:
logging.error(
    'Failed to load model_pos: "model_pos" not found in pretrained_state_dict')

if 'model_traj' in self.pretrained_state_dict:
self.model_traj.load_state_dict(
self.pretrained_state_dict['model_traj'], strict=False)
else:
logging.error(
    'Failed to load model_traj: "model_traj" not found in pretrained_state_dict')
logging.info('Loaded the pretrained model.')

def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
"""Proprocess of 2D input joints.

Args:
input (Dict[str, Any]): [NUM_FRAME, NUM_JOINTS, 2], input 2d human body keypoints.

Returns:
Dict[str, Any]: canonical 2d points and root relative joints.
"""
if 'cuda' == input.device.type:
input = input.data.cpu().numpy()
elif 'cpu' == input.device.type:
input = input.data.numpy()
pose2d = input

pose2d_canonical = self.canonicalize_2Ds(
pose2d, self.cfg.model.INPUT.FOCAL_LENGTH,
self.cfg.model.INPUT.CENTER)
pose2d_normalized = self.normalize_screen_coordinates(
pose2d, self.cfg.model.INPUT.RES_W, self.cfg.model.INPUT.RES_H)
pose2d_rr = pose2d_normalized
pose2d_rr[:, 1:] -= pose2d_rr[:, :1]

# expand [NUM_FRAME, NUM_JOINTS, 2] to [1, NUM_FRAME, NUM_JOINTS, 2]
pose2d_rr = np.expand_dims(
np.pad(
pose2d_rr,
((self.pad + self.causal_shift, self.pad - self.causal_shift),
(0, 0), (0, 0)), 'edge'),
axis=0)
pose2d_canonical = np.expand_dims(
np.pad(
pose2d_canonical,
((self.pad + self.causal_shift, self.pad - self.causal_shift),
(0, 0), (0, 0)), 'edge'),
axis=0)
pose2d_rr = torch.from_numpy(pose2d_rr.astype(np.float32))
pose2d_canonical = torch.from_numpy(
pose2d_canonical.astype(np.float32))

inputs_2d = pose2d_rr.clone()
if torch.cuda.is_available():
inputs_2d = inputs_2d.cuda(non_blocking=True)

# Positional model
if self.cfg.model.MODEL.USE_2D_OFFSETS:
inputs_2d[:, :, 0] = 0
else:
inputs_2d[:, :, 1:] += inputs_2d[:, :, :1]

return {
'inputs_2d': inputs_2d,
'pose2d_rr': pose2d_rr,
'pose2d_canonical': pose2d_canonical
}

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
"""3D human pose estimation.

Args:
input (Dict):
inputs_2d: [1, NUM_FRAME, NUM_JOINTS, 2]
pose2d_rr: [1, NUM_FRAME, NUM_JOINTS, 2]
pose2d_canonical: [1, NUM_FRAME, NUM_JOINTS, 2]
NUM_FRAME = max(receptive_field + video_frame_number, video_frame_number)

Returns:
Dict[str, Any]:
"camera_pose": Tensor, [1, NUM_FRAME, OUT_NUM_JOINTS, OUT_3D_FEATURE_DIM],
3D human pose keypoints in camera frame.
"camera_traj": Tensor, [1, NUM_FRAME, 1, 3],
root keypoint coordinates in the camera frame.
"""
inputs_2d = input['inputs_2d']
pose2d_rr = input['pose2d_rr']
pose2d_canonical = input['pose2d_canonical']
with torch.no_grad():
# predict 3D pose keypoints
predicted_3d_pos = self.model_pos(inputs_2d)

# predict global trajectory
b1, w1, n1, d1 = inputs_2d.shape

input_pose2d_abs = self.get_abs_2d_pts(w1, pose2d_rr,
pose2d_canonical)
b1, w1, n1, d1 = input_pose2d_abs.size()
b2, w2, n2, d2 = predicted_3d_pos.size()

if torch.cuda.is_available():
input_pose2d_abs = input_pose2d_abs.cuda(non_blocking=True)

predicted_3d_traj = self.model_traj(
input_pose2d_abs.view(b1, w1, n1 * d1),
predicted_3d_pos.view(b2 * w2, n2 * d2)).view(b2, w2, -1, 3)

predict_dict = {
KeypointsTypes.POSES_CAMERA: predicted_3d_pos,
KeypointsTypes.POSES_TRAJ: predicted_3d_traj
}

return predict_dict

def get_abs_2d_pts(self, input_video_frame_num, pose2d_rr,
pose2d_canonical):
pad = self.pad
w = input_video_frame_num - pad * 2

lst_pose2d_rr = []
lst_pose2d_cannoical = []
for i in range(pad, w + pad):
lst_pose2d_rr.append(pose2d_rr[:, i - pad:i + pad + 1])
lst_pose2d_cannoical.append(pose2d_canonical[:,
i - pad:i + pad + 1])

input_pose2d_rr = torch.concat(lst_pose2d_rr, axis=0)
input_pose2d_cannoical = torch.concat(lst_pose2d_cannoical, axis=0)

if self.cfg.model.MODEL.USE_CANONICAL_COORDS:
input_pose2d_abs = input_pose2d_cannoical.clone()
else:
input_pose2d_abs = input_pose2d_rr.clone()
input_pose2d_abs[:, :, 1:] += input_pose2d_abs[:, :, :1]

return input_pose2d_abs

def canonicalize_2Ds(self, pos2d, f, c):
cs = np.array([c[0], c[1]]).reshape(1, 1, 2)
fs = np.array([f[0], f[1]]).reshape(1, 1, 2)
canoical_2Ds = (pos2d - cs) / fs
return canoical_2Ds

def normalize_screen_coordinates(self, X, w, h):
assert X.shape[-1] == 2

# Normalize so that [0, w] is mapped to [-1, 1], while preserving the aspect ratio
return X / w * 2 - [1, h / w]
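
A small numeric check of normalize_screen_coordinates above; the 1920x1080 resolution is an illustrative assumption.

import numpy as np

X = np.array([[0., 0.], [960., 540.], [1920., 1080.]])
w, h = 1920, 1080
print(X / w * 2 - [1, h / w])
# [[-1.     -0.5625]
#  [ 0.      0.    ]
#  [ 1.      0.5625]]   -> x mapped to [-1, 1], y scaled by the same factor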

+ 233
- 0
modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py View File

@@ -0,0 +1,233 @@
# The implementation is based on VideoPose3D, available at https://github.com/facebookresearch/VideoPose3D
import torch
import torch.nn as nn


class TemporalModelBase(nn.Module):
"""
Do not instantiate this class.
"""

def __init__(self, num_joints_in, in_features, num_joints_out,
filter_widths, causal, dropout, channels):
super().__init__()

# Validate input
for fw in filter_widths:
assert fw % 2 != 0, 'Only odd filter widths are supported'

self.num_joints_in = num_joints_in
self.in_features = in_features
self.num_joints_out = num_joints_out
self.filter_widths = filter_widths

self.drop = nn.Dropout(dropout)
self.relu = nn.ReLU(inplace=True)

self.pad = [filter_widths[0] // 2]
self.expand_bn = nn.BatchNorm1d(channels, momentum=0.1)
self.shrink = nn.Conv1d(channels, num_joints_out * 3, 1)

def set_bn_momentum(self, momentum):
self.expand_bn.momentum = momentum
for bn in self.layers_bn:
bn.momentum = momentum

def receptive_field(self):
"""
Return the total receptive field of this model as # of frames.
"""
frames = 0
for f in self.pad:
frames += f
return 1 + 2 * frames

def total_causal_shift(self):
"""
Return the asymmetric offset for sequence padding.
The returned value is typically 0 if causal convolutions are disabled,
otherwise it is half the receptive field.
"""
frames = self.causal_shift[0]
next_dilation = self.filter_widths[0]
for i in range(1, len(self.filter_widths)):
frames += self.causal_shift[i] * next_dilation
next_dilation *= self.filter_widths[i]
return frames

def forward(self, x):
assert len(x.shape) == 4
assert x.shape[-2] == self.num_joints_in
assert x.shape[-1] == self.in_features

sz = x.shape[:3]
x = x.view(x.shape[0], x.shape[1], -1)
x = x.permute(0, 2, 1)

x = self._forward_blocks(x)

x = x.permute(0, 2, 1)
x = x.view(sz[0], -1, self.num_joints_out, 3)

return x


class TemporalModel(TemporalModelBase):
"""
Reference 3D pose estimation model with temporal convolutions.
This implementation can be used for all use-cases.
"""

def __init__(self,
num_joints_in,
in_features,
num_joints_out,
filter_widths,
causal=False,
dropout=0.25,
channels=1024,
dense=False):
"""
Initialize this model.

Arguments:
num_joints_in -- number of input joints (e.g. 17 for Human3.6M)
in_features -- number of input features for each joint (typically 2 for 2D input)
num_joints_out -- number of output joints (can be different than input)
filter_widths -- list of convolution widths, which also determines the # of blocks and receptive field
causal -- use causal convolutions instead of symmetric convolutions (for real-time applications)
dropout -- dropout probability
channels -- number of convolution channels
dense -- use regular dense convolutions instead of dilated convolutions (ablation experiment)
"""
super().__init__(num_joints_in, in_features, num_joints_out,
filter_widths, causal, dropout, channels)

self.expand_conv = nn.Conv1d(
num_joints_in * in_features,
channels,
filter_widths[0],
bias=False)

layers_conv = []
layers_bn = []

self.causal_shift = [(filter_widths[0]) // 2 if causal else 0]
next_dilation = filter_widths[0]
for i in range(1, len(filter_widths)):
self.pad.append((filter_widths[i] - 1) * next_dilation // 2)
self.causal_shift.append((filter_widths[i] // 2
* next_dilation) if causal else 0)

layers_conv.append(
nn.Conv1d(
channels,
channels,
filter_widths[i] if not dense else (2 * self.pad[-1] + 1),
dilation=next_dilation if not dense else 1,
bias=False))
layers_bn.append(nn.BatchNorm1d(channels, momentum=0.1))
layers_conv.append(
nn.Conv1d(channels, channels, 1, dilation=1, bias=False))
layers_bn.append(nn.BatchNorm1d(channels, momentum=0.1))

next_dilation *= filter_widths[i]

self.layers_conv = nn.ModuleList(layers_conv)
self.layers_bn = nn.ModuleList(layers_bn)

def _forward_blocks(self, x):
x = self.drop(self.relu(self.expand_bn(self.expand_conv(x))))
for i in range(len(self.pad) - 1):
pad = self.pad[i + 1]
shift = self.causal_shift[i + 1]
res = x[:, :, pad + shift:x.shape[2] - pad + shift]
x = self.drop(
self.relu(self.layers_bn[2 * i](self.layers_conv[2 * i](x))))
x = res + self.drop(
self.relu(self.layers_bn[2 * i + 1](
self.layers_conv[2 * i + 1](x))))

x = self.shrink(x)
return x


# regression of the trajectory
class TransCan3Dkeys(nn.Module):

def __init__(self,
in_channels=74,
num_features=256,
out_channels=44,
time_window=10,
num_blocks=2):
super().__init__()
self.in_channels = in_channels
self.num_features = num_features
self.out_channels = out_channels
self.num_blocks = num_blocks
self.time_window = time_window

self.expand_bn = nn.BatchNorm1d(self.num_features, momentum=0.1)
self.conv1 = nn.Sequential(
nn.ReplicationPad1d(1),
nn.Conv1d(
self.in_channels, self.num_features, kernel_size=3,
bias=False), self.expand_bn, nn.ReLU(inplace=True),
nn.Dropout(p=0.25))
self._make_blocks()
self.pad = nn.ReplicationPad1d(4)
self.relu = nn.ReLU(inplace=True)
self.drop = nn.Dropout(p=0.25)
self.reduce = nn.Conv1d(
self.num_features, self.num_features, kernel_size=self.time_window)
self.embedding_3d_1 = nn.Linear(in_channels // 2 * 3, 500)
self.embedding_3d_2 = nn.Linear(500, 500)
self.LReLU1 = nn.LeakyReLU()
self.LReLU2 = nn.LeakyReLU()
self.LReLU3 = nn.LeakyReLU()
self.out1 = nn.Linear(self.num_features + 500, self.num_features)
self.out2 = nn.Linear(self.num_features, self.out_channels)

def _make_blocks(self):
layers_conv = []
layers_bn = []
for i in range(self.num_blocks):
layers_conv.append(
nn.Conv1d(
self.num_features,
self.num_features,
kernel_size=5,
bias=False,
dilation=2))
layers_bn.append(nn.BatchNorm1d(self.num_features))
self.layers_conv = nn.ModuleList(layers_conv)
self.layers_bn = nn.ModuleList(layers_bn)

def set_bn_momentum(self, momentum):
self.expand_bn.momentum = momentum
for bn in self.layers_bn:
bn.momentum = momentum

def forward(self, p2ds, p3d):
"""
Args:
    p2ds: 2D keypoint sequence of shape (B x T x C).
    p3d: flattened 3D pose prediction fed to the 3D embedding.
"""
B, T, C = p2ds.shape
x = p2ds.permute((0, 2, 1))
x = self.conv1(x)
for i in range(self.num_blocks):
pre = x
x = self.pad(x)
x = self.layers_conv[i](x)
x = self.layers_bn[i](x)
x = self.drop(self.relu(x))
x = pre + x
x_2d = self.relu(self.reduce(x))
x_2d = x_2d.view(B, -1)
x_3d = self.LReLU1(self.embedding_3d_1(p3d))
x = torch.cat((x_2d, x_3d), 1)
x = self.LReLU3(self.out1(x))
x = self.out2(x)
return x
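
A quick sketch of how filter_widths sets the temporal receptive field of TemporalModel; the 17-joint, 2D-input setting is an illustrative assumption (Human3.6M-style).

# Each level multiplies the receptive field by its filter width,
# so [3, 3, 3] consumes 3 * 3 * 3 = 27 frames per predicted frame.
model = TemporalModel(
    num_joints_in=17, in_features=2, num_joints_out=17, filter_widths=[3, 3, 3])
print(model.receptive_field())  # 27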

+ 2
- 2
modelscope/models/cv/crowd_counting/cc_model.py View File

@@ -13,8 +13,8 @@ from modelscope.utils.constant import Tasks
Tasks.crowd_counting, module_name=Models.crowd_counting)
class HRNetCrowdCounting(TorchModel):

def __init__(self, model_dir: str):
super().__init__(model_dir)
def __init__(self, model_dir: str, **kwargs):
super().__init__(model_dir, **kwargs)

from .hrnet_aspp_relu import HighResolutionNet as HRNet_aspp_relu



+ 25
- 0
modelscope/models/cv/easycv_base.py View File

@@ -0,0 +1,25 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.base import BaseModel
from easycv.utils.ms_utils import EasyCVMeta

from modelscope.models.base import TorchModel


class EasyCVBaseModel(BaseModel, TorchModel):
"""Base model for EasyCV."""

def __init__(self, model_dir=None, args=(), kwargs={}):
kwargs.pop(EasyCVMeta.ARCH, None) # pop useless keys
BaseModel.__init__(self)
TorchModel.__init__(self, model_dir=model_dir)

def forward(self, img, mode='train', **kwargs):
if self.training:
losses = self.forward_train(img, **kwargs)
loss, log_vars = self._parse_losses(losses)
return dict(loss=loss, log_vars=log_vars)
else:
return self.forward_test(img, **kwargs)

def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)

+ 1
- 1
modelscope/models/cv/image_classification/mmcls_model.py View File

@@ -10,7 +10,7 @@ from modelscope.utils.constant import Tasks
Tasks.image_classification, module_name=Models.classification_model)
class ClassificationModel(TorchModel):

def __init__(self, model_dir: str):
def __init__(self, model_dir: str, **kwargs):
import mmcv
from mmcls.models import build_classifier



+ 22
- 0
modelscope/models/cv/image_panoptic_segmentation/__init__.py View File

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .panseg_model import SwinLPanopticSegmentation

else:
_import_structure = {
'panseg_model': ['SwinLPanopticSegmentation'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 54
- 0
modelscope/models/cv/image_panoptic_segmentation/panseg_model.py View File

@@ -0,0 +1,54 @@
import os.path as osp

import torch

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks


@MODELS.register_module(
Tasks.image_segmentation, module_name=Models.panoptic_segmentation)
class SwinLPanopticSegmentation(TorchModel):

def __init__(self, model_dir: str, **kwargs):
"""str -- model file root."""
super().__init__(model_dir, **kwargs)

from mmcv.runner import load_checkpoint
import mmcv
from mmdet.models import build_detector

config = osp.join(model_dir, 'config.py')

cfg = mmcv.Config.fromfile(config)
if 'pretrained' in cfg.model:
cfg.model.pretrained = None
elif 'init_cfg' in cfg.model.backbone:
cfg.model.backbone.init_cfg = None

# build model
cfg.model.train_cfg = None
self.model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))

# load model
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
checkpoint = load_checkpoint(
self.model, model_path, map_location='cpu')

self.CLASSES = checkpoint['meta']['CLASSES']
self.num_classes = len(self.CLASSES)
self.cfg = cfg

def inference(self, data):
"""data is dict,contain img and img_metas,follow with mmdet."""

with torch.no_grad():
results = self.model(return_loss=False, rescale=True, **data)
return results

def forward(self, Inputs):
    return self.model(**Inputs)

+ 22
- 0
modelscope/models/cv/image_reid_person/__init__.py View File

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .pass_model import PASS

else:
_import_structure = {
'pass_model': ['PASS'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 136
- 0
modelscope/models/cv/image_reid_person/pass_model.py View File

@@ -0,0 +1,136 @@
# The implementation is also open-sourced by the authors as PASS-reID and is publicly available at
# https://github.com/CASIA-IVA-Lab/PASS-reID

import os
from enum import Enum

import torch
import torch.nn as nn

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from .transreid_model import vit_base_patch16_224_TransReID


class Fusions(Enum):
CAT = 'cat'
MEAN = 'mean'


@MODELS.register_module(
Tasks.image_reid_person, module_name=Models.image_reid_person)
class PASS(TorchModel):

def __init__(self, cfg: Config, model_dir: str, **kwargs):
super(PASS, self).__init__(model_dir=model_dir)
size_train = cfg.INPUT.SIZE_TRAIN
sie_coe = cfg.MODEL.SIE_COE
stride_size = cfg.MODEL.STRIDE_SIZE
drop_path = cfg.MODEL.DROP_PATH
drop_out = cfg.MODEL.DROP_OUT
att_drop_rate = cfg.MODEL.ATT_DROP_RATE
gem_pooling = cfg.MODEL.GEM_POOLING
stem_conv = cfg.MODEL.STEM_CONV
weight = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
self.neck_feat = cfg.TEST.NECK_FEAT
self.dropout_rate = cfg.MODEL.DROPOUT_RATE
self.num_classes = cfg.DATASETS.NUM_CLASSES
self.multi_neck = cfg.MODEL.MULTI_NECK
self.feat_fusion = cfg.MODEL.FEAT_FUSION

self.base = vit_base_patch16_224_TransReID(
img_size=size_train,
sie_xishu=sie_coe,
stride_size=stride_size,
drop_path_rate=drop_path,
drop_rate=drop_out,
attn_drop_rate=att_drop_rate,
gem_pool=gem_pooling,
stem_conv=stem_conv)
self.in_planes = self.base.in_planes

if self.feat_fusion == Fusions.CAT.value:
self.classifier = nn.Linear(
self.in_planes * 2, self.num_classes, bias=False)
elif self.feat_fusion == Fusions.MEAN.value:
self.classifier = nn.Linear(
self.in_planes, self.num_classes, bias=False)

if self.multi_neck:
self.bottleneck = nn.BatchNorm1d(self.in_planes)
self.bottleneck.bias.requires_grad_(False)
self.bottleneck_1 = nn.BatchNorm1d(self.in_planes)
self.bottleneck_1.bias.requires_grad_(False)
self.bottleneck_2 = nn.BatchNorm1d(self.in_planes)
self.bottleneck_2.bias.requires_grad_(False)
self.bottleneck_3 = nn.BatchNorm1d(self.in_planes)
self.bottleneck_3.bias.requires_grad_(False)
else:
if self.feat_fusion == Fusions.CAT.value:
self.bottleneck = nn.BatchNorm1d(self.in_planes * 2)
self.bottleneck.bias.requires_grad_(False)
elif self.feat_fusion == Fusions.MEAN.value:
self.bottleneck = nn.BatchNorm1d(self.in_planes)
self.bottleneck.bias.requires_grad_(False)

self.dropout = nn.Dropout(self.dropout_rate)

self.load_param(weight)

def forward(self, input):

global_feat, local_feat_1, local_feat_2, local_feat_3 = self.base(
input)

# single-neck, almost the same performance
if not self.multi_neck:
if self.feat_fusion == Fusions.MEAN.value:
local_feat = local_feat_1 / 3. + local_feat_2 / 3. + local_feat_3 / 3.
final_feat_before = (global_feat + local_feat) / 2
elif self.feat_fusion == Fusions.CAT.value:
final_feat_before = torch.cat(
(global_feat, local_feat_1 / 3. + local_feat_2 / 3.
+ local_feat_3 / 3.),
dim=1)

final_feat_after = self.bottleneck(final_feat_before)
# multi-neck
else:
feat = self.bottleneck(global_feat)
local_feat_1_bn = self.bottleneck_1(local_feat_1)
local_feat_2_bn = self.bottleneck_2(local_feat_2)
local_feat_3_bn = self.bottleneck_3(local_feat_3)

if self.feat_fusion == Fusions.MEAN.value:
final_feat_before = ((global_feat + local_feat_1 / 3
+ local_feat_2 / 3 + local_feat_3 / 3)
/ 2.)
final_feat_after = (feat + local_feat_1_bn / 3
+ local_feat_2_bn / 3
+ local_feat_3_bn / 3) / 2.
elif self.feat_fusion == Fusions.CAT.value:
final_feat_before = torch.cat(
(global_feat, local_feat_1 / 3. + local_feat_2 / 3.
+ local_feat_3 / 3.),
dim=1)
final_feat_after = torch.cat(
(feat, local_feat_1_bn / 3 + local_feat_2_bn / 3
+ local_feat_3_bn / 3),
dim=1)

if self.neck_feat == 'after':
return final_feat_after
else:
return final_feat_before

def load_param(self, trained_path):
param_dict = torch.load(trained_path, map_location='cpu')
for i in param_dict:
try:
self.state_dict()[i.replace('module.',
'')].copy_(param_dict[i])
except Exception:
continue
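
A toy sketch of the two fusion modes handled in PASS.forward; the batch size is arbitrary and the tensors are random placeholders, while 768 follows in_planes above.

import torch

global_feat = torch.randn(4, 768)
local_mean = torch.randn(4, 768)   # stands in for (local_1 + local_2 + local_3) / 3
mean_fused = (global_feat + local_mean) / 2              # Fusions.MEAN -> (4, 768)
cat_fused = torch.cat((global_feat, local_mean), dim=1)  # Fusions.CAT  -> (4, 1536)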

+ 418
- 0
modelscope/models/cv/image_reid_person/transreid_model.py View File

@@ -0,0 +1,418 @@
# The implementation is also open-sourced by the authors as PASS-reID and is publicly available at
# https://github.com/CASIA-IVA-Lab/PASS-reID

import collections.abc as container_abcs
from functools import partial
from itertools import repeat

import torch
import torch.nn as nn
import torch.nn.functional as F


# From PyTorch internals
def _ntuple(n):

def parse(x):
if isinstance(x, container_abcs.Iterable):
return x
return tuple(repeat(x, n))

return parse


to_2tuple = _ntuple(2)


def vit_base_patch16_224_TransReID(
img_size=(256, 128),
stride_size=16,
drop_path_rate=0.1,
camera=0,
view=0,
local_feature=False,
sie_xishu=1.5,
**kwargs):
model = TransReID(
img_size=img_size,
patch_size=16,
stride_size=stride_size,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=True,
camera=camera,
view=view,
drop_path_rate=drop_path_rate,
sie_xishu=sie_xishu,
local_feature=local_feature,
**kwargs)
return model


def drop_path(x, drop_prob: float = 0., training: bool = False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.

"""
if drop_prob == 0. or not training:
return x
keep_prob = 1 - drop_prob
shape = (x.shape[0], ) + (1, ) * (
x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(
shape, dtype=x.dtype, device=x.device)
random_tensor.floor_() # binarize
output = x.div(keep_prob) * random_tensor
return output


class TransReID(nn.Module):
"""Transformer-based Object Re-Identification
"""

def __init__(self,
img_size=224,
patch_size=16,
stride_size=16,
in_chans=3,
num_classes=1000,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
camera=0,
view=0,
drop_path_rate=0.,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
local_feature=False,
sie_xishu=1.0,
hw_ratio=1,
gem_pool=False,
stem_conv=False):
super().__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
self.local_feature = local_feature
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
stride_size=stride_size,
in_chans=in_chans,
embed_dim=embed_dim,
stem_conv=stem_conv)

num_patches = self.patch_embed.num_patches

self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.part_token1 = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.part_token2 = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.part_token3 = nn.Parameter(torch.zeros(1, 1, embed_dim))

self.cls_pos = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.part1_pos = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.part2_pos = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.part3_pos = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
self.cam_num = camera
self.view_num = view
self.sie_xishu = sie_xishu
self.in_planes = 768
self.gem_pool = gem_pool

# Initialize SIE Embedding
if camera > 1 and view > 1:
self.sie_embed = nn.Parameter(
torch.zeros(camera * view, 1, embed_dim))
elif camera > 1:
self.sie_embed = nn.Parameter(torch.zeros(camera, 1, embed_dim))
elif view > 1:
self.sie_embed = nn.Parameter(torch.zeros(view, 1, embed_dim))

self.pos_drop = nn.Dropout(p=drop_rate)
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule

self.blocks = nn.ModuleList([
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer) for i in range(depth)
])

self.norm = norm_layer(embed_dim)

# Classifier head
self.fc = nn.Linear(embed_dim,
num_classes) if num_classes > 0 else nn.Identity()

self.gem = GeneralizedMeanPooling()

def forward_features(self, x, camera_id, view_id):
B = x.shape[0]
x = self.patch_embed(x)

cls_tokens = self.cls_token.expand(
B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
part_tokens1 = self.part_token1.expand(B, -1, -1)
part_tokens2 = self.part_token2.expand(B, -1, -1)
part_tokens3 = self.part_token3.expand(B, -1, -1)
x = torch.cat(
(cls_tokens, part_tokens1, part_tokens2, part_tokens3, x), dim=1)

if self.cam_num > 0 and self.view_num > 0:
x = x + self.pos_embed + self.sie_xishu * self.sie_embed[
camera_id * self.view_num + view_id]
elif self.cam_num > 0:
x = x + self.pos_embed + self.sie_xishu * self.sie_embed[camera_id]
elif self.view_num > 0:
x = x + self.pos_embed + self.sie_xishu * self.sie_embed[view_id]
else:
x = x + torch.cat((self.cls_pos, self.part1_pos, self.part2_pos,
self.part3_pos, self.pos_embed),
dim=1)

x = self.pos_drop(x)

if self.local_feature:
for blk in self.blocks[:-1]:
x = blk(x)
return x
else:
for blk in self.blocks:
x = blk(x)

x = self.norm(x)
if self.gem_pool:
gf = self.gem(x[:, 1:].permute(0, 2, 1)).squeeze()
return x[:, 0] + gf
return x[:, 0], x[:, 1], x[:, 2], x[:, 3]

def forward(self, x, cam_label=None, view_label=None):
global_feat, local_feat_1, local_feat_2, local_feat_3 = self.forward_features(
x, cam_label, view_label)
return global_feat, local_feat_1, local_feat_2, local_feat_3


class PatchEmbed(nn.Module):
"""Image to Patch Embedding with overlapping patches
"""

def __init__(self,
img_size=224,
patch_size=16,
stride_size=16,
in_chans=3,
embed_dim=768,
stem_conv=False):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
stride_size_tuple = to_2tuple(stride_size)
self.num_x = (img_size[1] - patch_size[1]) // stride_size_tuple[1] + 1
self.num_y = (img_size[0] - patch_size[0]) // stride_size_tuple[0] + 1
self.num_patches = self.num_x * self.num_y
self.img_size = img_size
self.patch_size = patch_size

self.stem_conv = stem_conv
if self.stem_conv:
hidden_dim = 64
stem_stride = 2
stride_size = patch_size = patch_size[0] // stem_stride
self.conv = nn.Sequential(
nn.Conv2d(
in_chans,
hidden_dim,
kernel_size=7,
stride=stem_stride,
padding=3,
bias=False),
IBN(hidden_dim),
nn.ReLU(inplace=True),
nn.Conv2d(
hidden_dim,
hidden_dim,
kernel_size=3,
stride=1,
padding=1,
bias=False),
IBN(hidden_dim),
nn.ReLU(inplace=True),
nn.Conv2d(
hidden_dim,
hidden_dim,
kernel_size=3,
stride=1,
padding=1,
bias=False),
nn.BatchNorm2d(hidden_dim),
nn.ReLU(inplace=True),
)
in_chans = hidden_dim

self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=stride_size)

def forward(self, x):
if self.stem_conv:
x = self.conv(x)
x = self.proj(x)
x = x.flatten(2).transpose(1, 2) # [64, 8, 768]

return x


class GeneralizedMeanPooling(nn.Module):
"""Applies a 2D power-average adaptive pooling over an input signal composed of several input planes.
The function computed is: :math:`f(X) = pow(sum(pow(X, p)), 1/p)`
- At p = infinity, one gets Max Pooling
- At p = 1, one gets Average Pooling
The output is of size H x W, for any input size.
The number of output features is equal to the number of input planes.
Args:
output_size: the target output size of the image of the form H x W.
Can be a tuple (H, W) or a single H for a square image H x H
H and W can be either a ``int``, or ``None`` which means the size will
be the same as that of the input.
"""

def __init__(self, norm=3, output_size=1, eps=1e-6):
super(GeneralizedMeanPooling, self).__init__()
assert norm > 0
self.p = float(norm)
self.output_size = output_size
self.eps = eps

def forward(self, x):
x = x.clamp(min=self.eps).pow(self.p)
return F.adaptive_avg_pool1d(x, self.output_size).pow(1. / self.p)


class Block(nn.Module):

def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)

def forward(self, x):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x


class Attention(nn.Module):

def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
self.scale = qk_scale or head_dim**-0.5

self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)

def forward(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[
2] # make torchscript happy (cannot use tensor as tuple)

attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)

x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x


class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""

def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob

def forward(self, x):
return drop_path(x, self.drop_prob, self.training)


class Mlp(nn.Module):

def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)

def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
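
A tiny numeric check of GeneralizedMeanPooling defined above: with norm=1 it reduces to plain average pooling, as its docstring states.

import torch

pool = GeneralizedMeanPooling(norm=1)
x = torch.tensor([[[1.0, 2.0, 3.0, 4.0]]])  # (batch, channels, length)
print(pool(x))  # tensor([[[2.5000]]]) -- the plain average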

+ 24
- 0
modelscope/models/cv/image_semantic_segmentation/__init__.py View File

@@ -0,0 +1,24 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .semantic_seg_model import SemanticSegmentation
from .segformer import Segformer

else:
_import_structure = {
'semantic_seg_model': ['SemanticSegmentation'],
'segformer': ['Segformer']
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 1
- 0
modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py View File

@@ -0,0 +1 @@
from .maskformer_semantic_head import MaskFormerSemanticHead

+ 47
- 0
modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py View File

@@ -0,0 +1,47 @@
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta, abstractmethod

from mmcv.runner import BaseModule
from mmdet.models.builder import build_loss


class BasePanopticFusionHead(BaseModule, metaclass=ABCMeta):
"""Base class for panoptic heads."""

def __init__(self,
num_things_classes=80,
num_stuff_classes=53,
test_cfg=None,
loss_panoptic=None,
init_cfg=None,
**kwargs):
super(BasePanopticFusionHead, self).__init__(init_cfg)
self.num_things_classes = num_things_classes
self.num_stuff_classes = num_stuff_classes
self.num_classes = num_things_classes + num_stuff_classes
self.test_cfg = test_cfg

if loss_panoptic:
self.loss_panoptic = build_loss(loss_panoptic)
else:
self.loss_panoptic = None

@property
def with_loss(self):
"""bool: whether the panoptic head contains loss function."""
return self.loss_panoptic is not None

@abstractmethod
def forward_train(self, gt_masks=None, gt_semantic_seg=None, **kwargs):
"""Forward function during training."""

@abstractmethod
def simple_test(self,
img_metas,
det_labels,
mask_preds,
seg_preds,
det_bboxes,
cfg=None,
**kwargs):
"""Test without augmentation."""

+ 57
- 0
modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py View File

@@ -0,0 +1,57 @@
import torch
import torch.nn.functional as F
from mmdet.models.builder import HEADS

from .base_panoptic_fusion_head import BasePanopticFusionHead


@HEADS.register_module()
class MaskFormerSemanticHead(BasePanopticFusionHead):

def __init__(self,
num_things_classes=80,
num_stuff_classes=53,
test_cfg=None,
loss_panoptic=None,
init_cfg=None,
**kwargs):
super().__init__(num_things_classes, num_stuff_classes, test_cfg,
loss_panoptic, init_cfg, **kwargs)

def forward_train(self, **kwargs):
"""MaskFormerFusionHead has no training loss."""
return dict()

def simple_test(self,
mask_cls_results,
mask_pred_results,
img_metas,
rescale=False,
**kwargs):
results = []
for mask_cls_result, mask_pred_result, meta in zip(
mask_cls_results, mask_pred_results, img_metas):
# remove padding
img_height, img_width = meta['img_shape'][:2]
mask_pred_result = mask_pred_result[:, :img_height, :img_width]

if rescale:
# return result in original resolution
ori_height, ori_width = meta['ori_shape'][:2]
mask_pred_result = F.interpolate(
mask_pred_result[:, None],
size=(ori_height, ori_width),
mode='bilinear',
align_corners=False)[:, 0]

# semantic inference
cls_score = F.softmax(mask_cls_result, dim=-1)[..., :-1]
mask_pred = mask_pred_result.sigmoid()
seg_mask = torch.einsum('qc,qhw->chw', cls_score, mask_pred)
# still need softmax and argmax
seg_logit = F.softmax(seg_mask, dim=0)
seg_pred = seg_logit.argmax(dim=0)
seg_pred = seg_pred.cpu().numpy()
results.append(seg_pred)

return results
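
A shape-only sketch of the semantic-inference einsum in simple_test; the query, class and resolution numbers are illustrative.

import torch

Q, C, H, W = 100, 133, 64, 64      # queries, classes, height, width (illustrative)
cls_score = torch.rand(Q, C)       # per-query class probabilities
mask_pred = torch.rand(Q, H, W)    # per-query masks after sigmoid
seg_mask = torch.einsum('qc,qhw->chw', cls_score, mask_pred)  # (C, H, W)
seg_pred = seg_mask.softmax(dim=0).argmax(dim=0)              # (H, W) label map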

+ 16
- 0
modelscope/models/cv/image_semantic_segmentation/segformer.py View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.segmentation import EncoderDecoder

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks


@MODELS.register_module(
group_key=Tasks.image_segmentation, module_name=Models.segformer)
class Segformer(EasyCVBaseModel, EncoderDecoder):

def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
EncoderDecoder.__init__(self, *args, **kwargs)

+ 76
- 0
modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py View File

@@ -0,0 +1,76 @@
import os.path as osp

import numpy as np
import torch

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.image_semantic_segmentation import (pan_merge,
vit_adapter)
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModelFile, Tasks


@MODELS.register_module(
Tasks.image_segmentation, module_name=Models.swinL_semantic_segmentation)
@MODELS.register_module(
Tasks.image_segmentation,
module_name=Models.vitadapter_semantic_segmentation)
class SemanticSegmentation(TorchModel):

def __init__(self, model_dir: str, **kwargs):
"""str -- model file root."""
super().__init__(model_dir, **kwargs)

from mmcv.runner import load_checkpoint
import mmcv
from mmdet.models import build_detector

config = osp.join(model_dir, 'mmcv_config.py')
cfg = mmcv.Config.fromfile(config)
if 'pretrained' in cfg.model:
cfg.model.pretrained = None
elif 'init_cfg' in cfg.model.backbone:
cfg.model.backbone.init_cfg = None

# build model
cfg.model.train_cfg = None
self.model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))

# load model
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
_ = load_checkpoint(self.model, model_path, map_location='cpu')

self.CLASSES = cfg['CLASSES'] # list
self.PALETTE = cfg['PALETTE'] # list

self.num_classes = len(self.CLASSES)
self.cfg = cfg

def forward(self, Inputs):
return self.model(**Inputs)

def postprocess(self, Inputs):
semantic_result = Inputs[0]

ids = np.unique(semantic_result)[::-1]
legal_indices = ids != self.model.num_classes # for VOID label
ids = ids[legal_indices]

segms = (semantic_result[None] == ids[:, None, None])
masks = [it.astype(int) for it in segms]
labels_txt = np.array(self.CLASSES)[ids].tolist()

results = {
OutputKeys.MASKS: masks,
OutputKeys.LABELS: labels_txt,
OutputKeys.SCORES: [0.999 for _ in range(len(labels_txt))]
}
return results

def inference(self, data):
with torch.no_grad():
results = self.model(return_loss=False, rescale=True, **data)

return results
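
A toy walk-through of the VOID-label filtering in postprocess; the 2x2 label map and class count are made up for illustration.

import numpy as np

num_classes = 3
semantic_result = np.array([[0, 1],
                            [1, 3]])        # 3 plays the role of the VOID label here
ids = np.unique(semantic_result)[::-1]      # array([3, 1, 0])
ids = ids[ids != num_classes]               # array([1, 0]) -> VOID dropped
masks = [(semantic_result == i).astype(int) for i in ids]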

+ 3
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py View File

@@ -0,0 +1,3 @@
from .models import backbone, decode_heads, segmentors
from .utils import (ResizeToMultiple, add_prefix, build_pixel_sampler,
seg_resize)

+ 3
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py View File

@@ -0,0 +1,3 @@
from .backbone import BASEBEiT, BEiTAdapter
from .decode_heads import Mask2FormerHeadFromMMSeg
from .segmentors import EncoderDecoderMask2Former

+ 4
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py View File

@@ -0,0 +1,4 @@
from .base import BASEBEiT
from .beit_adapter import BEiTAdapter

__all__ = ['BEiTAdapter', 'BASEBEiT']

+ 523
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py View File

@@ -0,0 +1,523 @@
# The implementation refers to ViT-Adapter,
# available at
# https://github.com/czczup/ViT-Adapter.git

import logging
from functools import partial

import torch
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmdet.models.utils.transformer import MultiScaleDeformableAttention
from timm.models.layers import DropPath

_logger = logging.getLogger(__name__)


def get_reference_points(spatial_shapes, device):
reference_points_list = []
for lvl, (H_, W_) in enumerate(spatial_shapes):
ref_y, ref_x = torch.meshgrid(
torch.linspace(
0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
torch.linspace(
0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
ref_y = ref_y.reshape(-1)[None] / H_
ref_x = ref_x.reshape(-1)[None] / W_
ref = torch.stack((ref_x, ref_y), -1)
reference_points_list.append(ref)
reference_points = torch.cat(reference_points_list, 1)
reference_points = reference_points[:, :, None]
return reference_points


def deform_inputs(x):
bs, c, h, w = x.shape
spatial_shapes = torch.as_tensor([(h // 8, w // 8), (h // 16, w // 16),
(h // 32, w // 32)],
dtype=torch.long,
device=x.device)
level_start_index = torch.cat((spatial_shapes.new_zeros(
(1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
reference_points = get_reference_points([(h // 16, w // 16)], x.device)
deform_inputs1 = [reference_points, spatial_shapes, level_start_index]

spatial_shapes = torch.as_tensor([(h // 16, w // 16)],
dtype=torch.long,
device=x.device)
level_start_index = torch.cat((spatial_shapes.new_zeros(
(1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
reference_points = get_reference_points([(h // 8, w // 8),
(h // 16, w // 16),
(h // 32, w // 32)], x.device)
deform_inputs2 = [reference_points, spatial_shapes, level_start_index]

return deform_inputs1, deform_inputs2


class ConvFFN(nn.Module):

def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.dwconv = DWConv(hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)

def forward(self, x, H, W):
x = self.fc1(x)
x = self.dwconv(x, H, W)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x


class DWConv(nn.Module):

def __init__(self, dim=768):
super().__init__()
self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

def forward(self, x, H, W):
B, N, C = x.shape
n = N // 21
x1 = x[:, 0:16 * n, :].transpose(1, 2).view(B, C, H * 2,
W * 2).contiguous()
x2 = x[:, 16 * n:20 * n, :].transpose(1, 2).view(B, C, H,
W).contiguous()
x3 = x[:, 20 * n:, :].transpose(1, 2).view(B, C, H // 2,
W // 2).contiguous()
x1 = self.dwconv(x1).flatten(2).transpose(1, 2)
x2 = self.dwconv(x2).flatten(2).transpose(1, 2)
x3 = self.dwconv(x3).flatten(2).transpose(1, 2)
x = torch.cat([x1, x2, x3], dim=1)
return x


class Extractor(nn.Module):

def __init__(self,
dim,
num_heads=6,
n_points=4,
n_levels=1,
deform_ratio=1.0,
with_cffn=True,
cffn_ratio=0.25,
drop=0.,
drop_path=0.,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
with_cp=False):
super().__init__()
self.query_norm = norm_layer(dim)
self.feat_norm = norm_layer(dim)
self.attn = MultiScaleDeformableAttention(
embed_dims=dim,
num_heads=num_heads,
num_levels=n_levels,
num_points=n_points,
batch_first=True)

# modify to fit the deform_ratio
value_proj_in_features = self.attn.value_proj.weight.shape[0]
value_proj_out_features = int(value_proj_in_features * deform_ratio)
self.attn.value_proj = nn.Linear(value_proj_in_features,
value_proj_out_features)
self.attn.output_proj = nn.Linear(value_proj_out_features,
value_proj_in_features)

self.with_cffn = with_cffn
self.with_cp = with_cp
if with_cffn:
self.ffn = ConvFFN(
in_features=dim,
hidden_features=int(dim * cffn_ratio),
drop=drop)
self.ffn_norm = norm_layer(dim)
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()

def forward(self, query, reference_points, feat, spatial_shapes,
level_start_index, H, W):

def _inner_forward(query, feat):
attn = self.attn(
query=self.query_norm(query),
key=None,
value=self.feat_norm(feat),
identity=None,
query_pos=None,
key_padding_mask=None,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index)

query = query + attn

if self.with_cffn:
query = query + self.drop_path(
self.ffn(self.ffn_norm(query), H, W))
return query

if self.with_cp and query.requires_grad:
query = cp.checkpoint(_inner_forward, query, feat)
else:
query = _inner_forward(query, feat)

return query


class Injector(nn.Module):

def __init__(self,
dim,
num_heads=6,
n_points=4,
n_levels=1,
deform_ratio=1.0,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
init_values=0.,
with_cp=False):
super().__init__()
self.with_cp = with_cp
self.query_norm = norm_layer(dim)
self.feat_norm = norm_layer(dim)
self.attn = MultiScaleDeformableAttention(
embed_dims=dim,
num_heads=num_heads,
num_levels=n_levels,
num_points=n_points,
batch_first=True)

# modify to fit the deform_ratio
value_proj_in_features = self.attn.value_proj.weight.shape[0]
value_proj_out_features = int(value_proj_in_features * deform_ratio)
self.attn.value_proj = nn.Linear(value_proj_in_features,
value_proj_out_features)
self.attn.output_proj = nn.Linear(value_proj_out_features,
value_proj_in_features)

self.gamma = nn.Parameter(
init_values * torch.ones((dim)), requires_grad=True)

def forward(self, query, reference_points, feat, spatial_shapes,
level_start_index):

def _inner_forward(query, feat):
input_query = self.query_norm(query)
input_value = self.feat_norm(feat)
attn = self.attn(
query=input_query,
key=None,
value=input_value,
identity=None,
query_pos=None,
key_padding_mask=None,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index)
return query + self.gamma * attn

if self.with_cp and query.requires_grad:
query = cp.checkpoint(_inner_forward, query, feat)
else:
query = _inner_forward(query, feat)

return query


class InteractionBlock(nn.Module):

def __init__(self,
dim,
num_heads=6,
n_points=4,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
drop=0.,
drop_path=0.,
with_cffn=True,
cffn_ratio=0.25,
init_values=0.,
deform_ratio=1.0,
extra_extractor=False,
with_cp=False):
super().__init__()

self.injector = Injector(
dim=dim,
n_levels=3,
num_heads=num_heads,
init_values=init_values,
n_points=n_points,
norm_layer=norm_layer,
deform_ratio=deform_ratio,
with_cp=with_cp)
self.extractor = Extractor(
dim=dim,
n_levels=1,
num_heads=num_heads,
n_points=n_points,
norm_layer=norm_layer,
deform_ratio=deform_ratio,
with_cffn=with_cffn,
cffn_ratio=cffn_ratio,
drop=drop,
drop_path=drop_path,
with_cp=with_cp)
if extra_extractor:
self.extra_extractors = nn.Sequential(*[
Extractor(
dim=dim,
num_heads=num_heads,
n_points=n_points,
norm_layer=norm_layer,
with_cffn=with_cffn,
cffn_ratio=cffn_ratio,
deform_ratio=deform_ratio,
drop=drop,
drop_path=drop_path,
with_cp=with_cp) for _ in range(2)
])
else:
self.extra_extractors = None

def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W):
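        # Dataflow: the Injector first fuses the multi-scale convolutional
        # prior `c` into the ViT tokens `x`; the wrapped ViT blocks then refine
        # `x`; finally the Extractor(s) update `c` by cross-attending from `c`
        # back to the refined `x`.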
x = self.injector(
query=x,
reference_points=deform_inputs1[0],
feat=c,
spatial_shapes=deform_inputs1[1],
level_start_index=deform_inputs1[2])
for idx, blk in enumerate(blocks):
x = blk(x, H, W)
c = self.extractor(
query=c,
reference_points=deform_inputs2[0],
feat=x,
spatial_shapes=deform_inputs2[1],
level_start_index=deform_inputs2[2],
H=H,
W=W)
if self.extra_extractors is not None:
for extractor in self.extra_extractors:
c = extractor(
query=c,
reference_points=deform_inputs2[0],
feat=x,
spatial_shapes=deform_inputs2[1],
level_start_index=deform_inputs2[2],
H=H,
W=W)
return x, c


class InteractionBlockWithCls(nn.Module):

def __init__(self,
dim,
num_heads=6,
n_points=4,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
drop=0.,
drop_path=0.,
with_cffn=True,
cffn_ratio=0.25,
init_values=0.,
deform_ratio=1.0,
extra_extractor=False,
with_cp=False):
super().__init__()

self.injector = Injector(
dim=dim,
n_levels=3,
num_heads=num_heads,
init_values=init_values,
n_points=n_points,
norm_layer=norm_layer,
deform_ratio=deform_ratio,
with_cp=with_cp)
self.extractor = Extractor(
dim=dim,
n_levels=1,
num_heads=num_heads,
n_points=n_points,
norm_layer=norm_layer,
deform_ratio=deform_ratio,
with_cffn=with_cffn,
cffn_ratio=cffn_ratio,
drop=drop,
drop_path=drop_path,
with_cp=with_cp)
if extra_extractor:
self.extra_extractors = nn.Sequential(*[
Extractor(
dim=dim,
num_heads=num_heads,
n_points=n_points,
norm_layer=norm_layer,
with_cffn=with_cffn,
cffn_ratio=cffn_ratio,
deform_ratio=deform_ratio,
drop=drop,
drop_path=drop_path,
with_cp=with_cp) for _ in range(2)
])
else:
self.extra_extractors = None

def forward(self, x, c, cls, blocks, deform_inputs1, deform_inputs2, H, W):
x = self.injector(
query=x,
reference_points=deform_inputs1[0],
feat=c,
spatial_shapes=deform_inputs1[1],
level_start_index=deform_inputs1[2])
x = torch.cat((cls, x), dim=1)
for idx, blk in enumerate(blocks):
x = blk(x, H, W)
        cls, x = x[:, :1], x[:, 1:]
c = self.extractor(
query=c,
reference_points=deform_inputs2[0],
feat=x,
spatial_shapes=deform_inputs2[1],
level_start_index=deform_inputs2[2],
H=H,
W=W)
if self.extra_extractors is not None:
for extractor in self.extra_extractors:
c = extractor(
query=c,
reference_points=deform_inputs2[0],
feat=x,
spatial_shapes=deform_inputs2[1],
level_start_index=deform_inputs2[2],
H=H,
W=W)
return x, c, cls


class SpatialPriorModule(nn.Module):

def __init__(self, inplanes=64, embed_dim=384, with_cp=False):
super().__init__()
self.with_cp = with_cp

self.stem = nn.Sequential(*[
nn.Conv2d(
3, inplanes, kernel_size=3, stride=2, padding=1, bias=False),
nn.SyncBatchNorm(inplanes),
nn.ReLU(inplace=True),
nn.Conv2d(
inplanes,
inplanes,
kernel_size=3,
stride=1,
padding=1,
bias=False),
nn.SyncBatchNorm(inplanes),
nn.ReLU(inplace=True),
nn.Conv2d(
inplanes,
inplanes,
kernel_size=3,
stride=1,
padding=1,
bias=False),
nn.SyncBatchNorm(inplanes),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
])
self.conv2 = nn.Sequential(*[
nn.Conv2d(
inplanes,
2 * inplanes,
kernel_size=3,
stride=2,
padding=1,
bias=False),
nn.SyncBatchNorm(2 * inplanes),
nn.ReLU(inplace=True)
])
self.conv3 = nn.Sequential(*[
nn.Conv2d(
2 * inplanes,
4 * inplanes,
kernel_size=3,
stride=2,
padding=1,
bias=False),
nn.SyncBatchNorm(4 * inplanes),
nn.ReLU(inplace=True)
])
self.conv4 = nn.Sequential(*[
nn.Conv2d(
4 * inplanes,
4 * inplanes,
kernel_size=3,
stride=2,
padding=1,
bias=False),
nn.SyncBatchNorm(4 * inplanes),
nn.ReLU(inplace=True)
])
self.fc1 = nn.Conv2d(
inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True)
self.fc2 = nn.Conv2d(
2 * inplanes,
embed_dim,
kernel_size=1,
stride=1,
padding=0,
bias=True)
self.fc3 = nn.Conv2d(
4 * inplanes,
embed_dim,
kernel_size=1,
stride=1,
padding=0,
bias=True)
self.fc4 = nn.Conv2d(
4 * inplanes,
embed_dim,
kernel_size=1,
stride=1,
padding=0,
bias=True)

def forward(self, x):

def _inner_forward(x):
c1 = self.stem(x)
c2 = self.conv2(c1)
c3 = self.conv3(c2)
c4 = self.conv4(c3)
c1 = self.fc1(c1)
c2 = self.fc2(c2)
c3 = self.fc3(c3)
c4 = self.fc4(c4)

bs, dim, _, _ = c1.shape

c2 = c2.view(bs, dim, -1).transpose(1, 2) # 8s
c3 = c3.view(bs, dim, -1).transpose(1, 2) # 16s
c4 = c4.view(bs, dim, -1).transpose(1, 2) # 32s
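            # e.g. for a 512x512 input with embed_dim=384: c1 stays a stride-4
            # map of shape (B, 384, 128, 128), while c2/c3/c4 are flattened to
            # (B, 4096, 384), (B, 1024, 384) and (B, 256, 384) respectively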

return c1, c2, c3, c4

if self.with_cp and x.requires_grad:
outs = cp.checkpoint(_inner_forward, x)
else:
outs = _inner_forward(x)
return outs

+ 3
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py

@@ -0,0 +1,3 @@
from .beit import BASEBEiT

__all__ = ['BASEBEiT']

+ 476
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py

@@ -0,0 +1,476 @@
# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)
# Github source: https://github.com/microsoft/unilm/tree/master/beit
# This implementation refers to
# https://github.com/czczup/ViT-Adapter.git
import math
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from mmcv.runner import _load_checkpoint
from mmdet.models.builder import BACKBONES
from mmdet.utils import get_root_logger
from timm.models.layers import drop_path, to_2tuple, trunc_normal_


class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of
residual blocks)."""

def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob

def forward(self, x):
return drop_path(x, self.drop_prob, self.training)

def extra_repr(self) -> str:
return 'p={}'.format(self.drop_prob)


class Mlp(nn.Module):

def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)

def forward(self, x):
x = self.fc1(x)
x = self.act(x)
        # dropout after fc1 is intentionally omitted, matching the original BERT implementation
x = self.fc2(x)
x = self.drop(x)
return x


class Attention(nn.Module):

def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.,
window_size=None,
attn_head_dim=None):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
if attn_head_dim is not None:
head_dim = attn_head_dim
all_head_dim = head_dim * self.num_heads
        # NOTE: the scale factor was wrong in an earlier version; it can be set manually to stay compatible with previous weights
self.scale = qk_scale or head_dim**-0.5

self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
if qkv_bias:
self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
else:
self.q_bias = None
self.v_bias = None

if window_size:
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0]
- 1) * (2 * window_size[1] - 1) + 3
self.relative_position_bias_table = nn.Parameter(
torch.zeros(self.num_relative_distance,
num_heads)) # 2*Wh-1 * 2*Ww-1, nH
            # plus 3 extra entries: cls-to-token, token-to-cls and cls-to-cls

# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(window_size[0])
coords_w = torch.arange(window_size[1])
coords = torch.stack(torch.meshgrid([coords_h,
coords_w])) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :,
None] - coords_flatten[:,
None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(
1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :,
0] += window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = \
torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
relative_position_index[1:, 1:] = relative_coords.sum(
-1) # Wh*Ww, Wh*Ww
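            # index 0 is the cls token: its row/column are routed to the three
            # extra bias entries (cls->token, token->cls, cls->cls) at the end
            # of the table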
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer('relative_position_index',
relative_position_index)

else:
self.window_size = None
self.relative_position_bias_table = None
self.relative_position_index = None

self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(all_head_dim, dim)
self.proj_drop = nn.Dropout(proj_drop)

def forward(self, x, rel_pos_bias=None):
B, N, C = x.shape
qkv_bias = None
if self.q_bias is not None:
qkv_bias = torch.cat(
(self.q_bias,
torch.zeros_like(self.v_bias,
requires_grad=False), self.v_bias))

qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[
2] # make torchscript happy (cannot use tensor as tuple)

q = q * self.scale
attn = (q @ k.transpose(-2, -1))

if self.relative_position_bias_table is not None:
relative_position_bias = \
self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1] + 1,
self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.permute(
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww

attn = attn + relative_position_bias.unsqueeze(0)

if rel_pos_bias is not None:
attn = attn + rel_pos_bias

attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)

x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
x = self.proj(x)
x = self.proj_drop(x)
return x


class Block(nn.Module):

def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
init_values=None,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm,
window_size=None,
attn_head_dim=None,
with_cp=False):
super().__init__()
self.with_cp = with_cp
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop,
window_size=window_size,
attn_head_dim=attn_head_dim)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)

if init_values is not None:
self.gamma_1 = nn.Parameter(
init_values * torch.ones((dim)), requires_grad=True)
self.gamma_2 = nn.Parameter(
init_values * torch.ones((dim)), requires_grad=True)
else:
self.gamma_1, self.gamma_2 = None, None

def forward(self, x, H, W, rel_pos_bias=None):

def _inner_forward(x):
if self.gamma_1 is None:
x = x + self.drop_path(
self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
x = x + self.drop_path(self.mlp(self.norm2(x)))
else:
x = x + self.drop_path(self.gamma_1 * self.attn(
self.norm1(x), rel_pos_bias=rel_pos_bias))
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x

if self.with_cp and x.requires_grad:
x = cp.checkpoint(_inner_forward, x)
else:
x = _inner_forward(x)
return x


class PatchEmbed(nn.Module):
""" Image to Patch Embedding
"""

def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
num_patches = (img_size[1] // patch_size[1]) * (
img_size[0] // patch_size[0])
self.patch_shape = (img_size[0] // patch_size[0],
img_size[1] // patch_size[1])
self.img_size = img_size
self.patch_size = patch_size
self.num_patches = num_patches

self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

def forward(self, x, **kwargs):
B, C, H, W = x.shape
# FIXME look at relaxing size constraints
# assert H == self.img_size[0] and W == self.img_size[1], \
# f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x)
Hp, Wp = x.shape[2], x.shape[3]

x = x.flatten(2).transpose(1, 2)
return x, Hp, Wp


class HybridEmbed(nn.Module):
""" CNN Feature Map Embedding
Extract feature map from CNN, flatten, project to embedding dim.
"""

def __init__(self,
backbone,
img_size=224,
feature_size=None,
in_chans=3,
embed_dim=768):
super().__init__()
assert isinstance(backbone, nn.Module)
img_size = to_2tuple(img_size)
self.img_size = img_size
self.backbone = backbone
if feature_size is None:
with torch.no_grad():
# FIXME this is hacky, but most reliable way of determining the exact dim of the output feature
# map for all networks, the feature metadata has reliable channel and stride info, but using
# stride to calc feature dim requires info about padding of each stage that isn't captured.
training = backbone.training
if training:
backbone.eval()
o = self.backbone(
torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
feature_size = o.shape[-2:]
feature_dim = o.shape[1]
backbone.train(training)
else:
feature_size = to_2tuple(feature_size)
feature_dim = self.backbone.feature_info.channels()[-1]
self.num_patches = feature_size[0] * feature_size[1]
self.proj = nn.Linear(feature_dim, embed_dim)

def forward(self, x):
x = self.backbone(x)[-1]
x = x.flatten(2).transpose(1, 2)
x = self.proj(x)
return x


class RelativePositionBias(nn.Module):

def __init__(self, window_size, num_heads):
super().__init__()
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0]
- 1) * (2 * window_size[1] - 1) + 3
self.relative_position_bias_table = nn.Parameter(
torch.zeros(self.num_relative_distance,
num_heads)) # 2*Wh-1 * 2*Ww-1, nH
        # plus 3 extra entries: cls-to-token, token-to-cls and cls-to-cls

# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(window_size[0])
coords_w = torch.arange(window_size[1])
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :,
None] - coords_flatten[:,
None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(
1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = \
torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
relative_position_index[1:,
1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1

self.register_buffer('relative_position_index',
relative_position_index)

def forward(self):
relative_position_bias = \
self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1] + 1,
self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
return relative_position_bias.permute(
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww


@BACKBONES.register_module()
class BASEBEiT(nn.Module):
""" Vision Transformer with support for patch or hybrid CNN input stage
"""

def __init__(self,
img_size=512,
patch_size=16,
in_chans=3,
num_classes=80,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
hybrid_backbone=None,
norm_layer=None,
init_values=None,
use_checkpoint=False,
use_abs_pos_emb=False,
use_rel_pos_bias=True,
use_shared_rel_pos_bias=False,
pretrained=None,
with_cp=False):
super().__init__()
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
self.norm_layer = norm_layer
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
self.drop_path_rate = drop_path_rate
if hybrid_backbone is not None:
self.patch_embed = HybridEmbed(
hybrid_backbone,
img_size=img_size,
in_chans=in_chans,
embed_dim=embed_dim)
else:
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
num_patches = self.patch_embed.num_patches

self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
if use_abs_pos_emb:
self.pos_embed = nn.Parameter(
torch.zeros(1, num_patches + 1, embed_dim))
else:
self.pos_embed = None
self.pos_drop = nn.Dropout(p=drop_rate)

if use_shared_rel_pos_bias:
self.rel_pos_bias = RelativePositionBias(
window_size=self.patch_embed.patch_shape, num_heads=num_heads)
else:
self.rel_pos_bias = None

dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule
self.use_rel_pos_bias = use_rel_pos_bias
self.use_checkpoint = use_checkpoint
self.blocks = nn.ModuleList([
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
with_cp=with_cp,
init_values=init_values,
window_size=self.patch_embed.patch_shape
if use_rel_pos_bias else None) for i in range(depth)
])

trunc_normal_(self.cls_token, std=.02)
self.apply(self._init_weights)
self.init_weights(pretrained)

def init_weights(self, pretrained=None):
"""Initialize the weights in backbone.

Args:
pretrained (str, optional): Path to pre-trained weights.
Defaults to None.
"""
if isinstance(pretrained, str):
logger = get_root_logger()
init_cfg = dict(type='Pretrained', checkpoint=pretrained)

checkpoint = _load_checkpoint(
init_cfg['checkpoint'], logger=logger, map_location='cpu')
state_dict = self.resize_rel_pos_embed(checkpoint)
self.load_state_dict(state_dict, False)

def fix_init_weight(self):

def rescale(param, layer_id):
param.div_(math.sqrt(2.0 * layer_id))

for layer_id, layer in enumerate(self.blocks):
rescale(layer.attn.proj.weight.data, layer_id + 1)
rescale(layer.mlp.fc2.weight.data, layer_id + 1)

def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)

def get_num_layers(self):
return len(self.blocks)

+ 169
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py

@@ -0,0 +1,169 @@
# The implementation refers to ViT-Adapter,
# available at
# https://github.com/czczup/ViT-Adapter.git
import logging
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.models.builder import BACKBONES
from mmdet.models.utils.transformer import MultiScaleDeformableAttention
from timm.models.layers import DropPath, trunc_normal_
from torch.nn.init import normal_

from .adapter_modules import InteractionBlockWithCls as InteractionBlock
from .adapter_modules import SpatialPriorModule, deform_inputs
from .base.beit import BASEBEiT

_logger = logging.getLogger(__name__)


@BACKBONES.register_module()
class BEiTAdapter(BASEBEiT):

def __init__(self,
pretrain_size=224,
conv_inplane=64,
n_points=4,
deform_num_heads=6,
init_values=0.,
cffn_ratio=0.25,
deform_ratio=1.0,
with_cffn=True,
interaction_indexes=None,
add_vit_feature=True,
with_cp=False,
*args,
**kwargs):

super().__init__(
init_values=init_values, with_cp=with_cp, *args, **kwargs)

self.num_block = len(self.blocks)
self.pretrain_size = (pretrain_size, pretrain_size)
self.flags = [
i for i in range(-1, self.num_block, self.num_block // 4)
][1:]
self.interaction_indexes = interaction_indexes
self.add_vit_feature = add_vit_feature
embed_dim = self.embed_dim

self.level_embed = nn.Parameter(torch.zeros(3, embed_dim))
self.spm = SpatialPriorModule(
inplanes=conv_inplane, embed_dim=embed_dim, with_cp=False)
self.interactions = nn.Sequential(*[
InteractionBlock(
dim=embed_dim,
num_heads=deform_num_heads,
n_points=n_points,
init_values=init_values,
drop_path=self.drop_path_rate,
norm_layer=self.norm_layer,
with_cffn=with_cffn,
cffn_ratio=cffn_ratio,
deform_ratio=deform_ratio,
extra_extractor=True if i == len(interaction_indexes)
- 1 else False,
with_cp=with_cp) for i in range(len(interaction_indexes))
])

self.up = nn.ConvTranspose2d(embed_dim, embed_dim, 2, 2)
self.norm1 = nn.SyncBatchNorm(embed_dim)
self.norm2 = nn.SyncBatchNorm(embed_dim)
self.norm3 = nn.SyncBatchNorm(embed_dim)
self.norm4 = nn.SyncBatchNorm(embed_dim)

self.up.apply(self._init_weights)
self.spm.apply(self._init_weights)
self.interactions.apply(self._init_weights)
self.apply(self._init_deform_weights)
normal_(self.level_embed)

def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm) or isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
fan_out //= m.groups
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if m.bias is not None:
m.bias.data.zero_()

def _get_pos_embed(self, pos_embed, H, W):
pos_embed = pos_embed.reshape(1, self.pretrain_size[0] // 16,
self.pretrain_size[1] // 16,
-1).permute(0, 3, 1, 2)
pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
reshape(1, -1, H * W).permute(0, 2, 1)
return pos_embed

def _init_deform_weights(self, m):
if isinstance(m, MultiScaleDeformableAttention):
m.init_weights()

def _add_level_embed(self, c2, c3, c4):
c2 = c2 + self.level_embed[0]
c3 = c3 + self.level_embed[1]
c4 = c4 + self.level_embed[2]
return c2, c3, c4

def forward(self, x):
deform_inputs1, deform_inputs2 = deform_inputs(x)

# SPM forward
c1, c2, c3, c4 = self.spm(x)
c2, c3, c4 = self._add_level_embed(c2, c3, c4)
c = torch.cat([c2, c3, c4], dim=1)

# Patch Embedding forward
x, H, W = self.patch_embed(x)
bs, n, dim = x.shape
cls = self.cls_token.expand(
bs, -1, -1) # stole cls_tokens impl from Phil Wang, thanks

if self.pos_embed is not None:
pos_embed = self._get_pos_embed(self.pos_embed, H, W)
x = x + pos_embed
x = self.pos_drop(x)

# Interaction
outs = list()
for i, layer in enumerate(self.interactions):
indexes = self.interaction_indexes[i]
x, c, cls = layer(x, c, cls,
self.blocks[indexes[0]:indexes[-1] + 1],
deform_inputs1, deform_inputs2, H, W)
outs.append(x.transpose(1, 2).view(bs, dim, H, W).contiguous())

# Split & Reshape
c2 = c[:, 0:c2.size(1), :]
c3 = c[:, c2.size(1):c2.size(1) + c3.size(1), :]
c4 = c[:, c2.size(1) + c3.size(1):, :]

c2 = c2.transpose(1, 2).view(bs, dim, H * 2, W * 2).contiguous()
c3 = c3.transpose(1, 2).view(bs, dim, H, W).contiguous()
c4 = c4.transpose(1, 2).view(bs, dim, H // 2, W // 2).contiguous()
c1 = self.up(c2) + c1
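        # at this point c1/c2/c3/c4 form a stride-4/8/16/32 pyramid of width
        # `dim`: c2/c3/c4 come from the adapter branch (2H x 2W, H x W and
        # H/2 x W/2 w.r.t. the stride-16 patch grid), and c1 is the transposed-
        # conv upsampling of c2 added to the SPM stem feature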

if self.add_vit_feature:
x1, x2, x3, x4 = outs
x1 = F.interpolate(
x1, scale_factor=4, mode='bilinear', align_corners=False)
x2 = F.interpolate(
x2, scale_factor=2, mode='bilinear', align_corners=False)
x4 = F.interpolate(
x4, scale_factor=0.5, mode='bilinear', align_corners=False)
c1, c2, c3, c4 = c1 + x1, c2 + x2, c3 + x3, c4 + x4

# Final Norm
f1 = self.norm1(c1)
f2 = self.norm2(c2)
f3 = self.norm3(c3)
f4 = self.norm4(c4)
return [f1, f2, f3, f4]
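

# A minimal, illustrative backbone config sketch for this adapter (mmdet-style;
# the values below are placeholders, not tuned settings):
#
#     backbone = dict(
#         type='BEiTAdapter',
#         img_size=512, patch_size=16, embed_dim=768, depth=12, num_heads=12,
#         use_rel_pos_bias=True, init_values=1e-6, drop_path_rate=0.2,
#         conv_inplane=64, n_points=4, deform_num_heads=6,
#         cffn_ratio=0.25, deform_ratio=0.5,
#         interaction_indexes=[[0, 2], [3, 5], [6, 8], [9, 11]])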

+ 3
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py

@@ -0,0 +1,3 @@
from .mask2former_head_from_mmseg import Mask2FormerHeadFromMMSeg

__all__ = ['Mask2FormerHeadFromMMSeg']

+ 267
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py

@@ -0,0 +1,267 @@
# The implementation refers to ViT-Adapter,
# available at
# https://github.com/czczup/ViT-Adapter.git
from abc import ABCMeta, abstractmethod

import torch
import torch.nn as nn
from mmcv.runner import BaseModule, auto_fp16, force_fp32
from mmdet.models.builder import build_loss
from mmdet.models.losses import accuracy

from ...utils import build_pixel_sampler, seg_resize


class BaseDecodeHead(BaseModule, metaclass=ABCMeta):
"""Base class for BaseDecodeHead.

Args:
in_channels (int|Sequence[int]): Input channels.
channels (int): Channels after modules, before conv_seg.
num_classes (int): Number of classes.
dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
conv_cfg (dict|None): Config of conv layers. Default: None.
norm_cfg (dict|None): Config of norm layers. Default: None.
act_cfg (dict): Config of activation layers.
Default: dict(type='ReLU')
in_index (int|Sequence[int]): Input feature index. Default: -1
input_transform (str|None): Transformation type of input features.
Options: 'resize_concat', 'multiple_select', None.
            'resize_concat': Multiple feature maps will be resized to the
                same size as the first one and then concatenated together.
                Usually used in the FCN head of HRNet.
            'multiple_select': Multiple feature maps will be bundled into
                a list and passed to the decode head.
None: Only one select feature map is allowed.
Default: None.
loss_decode (dict | Sequence[dict]): Config of decode loss.
            The `loss_name` is a property of the corresponding loss function
            which is shown in the training log. If you want this loss
            item to be included in the backward graph, `loss_` must be the
            prefix of its name. Defaults to 'loss_ce'.
e.g. dict(type='CrossEntropyLoss'),
[dict(type='CrossEntropyLoss', loss_name='loss_ce'),
dict(type='DiceLoss', loss_name='loss_dice')]
Default: dict(type='CrossEntropyLoss').
ignore_index (int | None): The label index to be ignored. When using
masked BCE loss, ignore_index should be set to None. Default: 255.
sampler (dict|None): The config of segmentation map sampler.
Default: None.
align_corners (bool): align_corners argument of F.interpolate.
Default: False.
init_cfg (dict or list[dict], optional): Initialization config dict.
"""

def __init__(self,
in_channels,
channels,
*,
num_classes,
dropout_ratio=0.1,
conv_cfg=None,
norm_cfg=None,
act_cfg=dict(type='ReLU'),
in_index=-1,
input_transform=None,
loss_decode=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
ignore_index=255,
sampler=None,
align_corners=False,
init_cfg=dict(
type='Normal', std=0.01, override=dict(name='conv_seg'))):
super(BaseDecodeHead, self).__init__(init_cfg)
self._init_inputs(in_channels, in_index, input_transform)
self.channels = channels
self.num_classes = num_classes
self.dropout_ratio = dropout_ratio
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg
self.in_index = in_index

self.ignore_index = ignore_index
self.align_corners = align_corners

if isinstance(loss_decode, dict):
self.loss_decode = build_loss(loss_decode)
elif isinstance(loss_decode, (list, tuple)):
self.loss_decode = nn.ModuleList()
for loss in loss_decode:
self.loss_decode.append(build_loss(loss))
else:
raise TypeError(f'loss_decode must be a dict or sequence of dict,\
but got {type(loss_decode)}')

if sampler is not None:
self.sampler = build_pixel_sampler(sampler, context=self)
else:
self.sampler = None

self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
if dropout_ratio > 0:
self.dropout = nn.Dropout2d(dropout_ratio)
else:
self.dropout = None
self.fp16_enabled = False

def extra_repr(self):
"""Extra repr."""
s = f'input_transform={self.input_transform}, ' \
f'ignore_index={self.ignore_index}, ' \
f'align_corners={self.align_corners}'
return s

def _init_inputs(self, in_channels, in_index, input_transform):
"""Check and initialize input transforms.

The in_channels, in_index and input_transform must match.
        Specifically, when input_transform is None, only a single feature map
        will be selected, so in_channels and in_index must be of type int.
        When input_transform is not None, in_channels and in_index must be a
        list or tuple of the same length.

Args:
in_channels (int|Sequence[int]): Input channels.
in_index (int|Sequence[int]): Input feature index.
input_transform (str|None): Transformation type of input features.
Options: 'resize_concat', 'multiple_select', None.
                'resize_concat': Multiple feature maps will be resized to the
                    same size as the first one and then concatenated together.
                    Usually used in the FCN head of HRNet.
                'multiple_select': Multiple feature maps will be bundled into
                    a list and passed to the decode head.
None: Only one select feature map is allowed.
"""

if input_transform is not None:
assert input_transform in ['resize_concat', 'multiple_select']
self.input_transform = input_transform
self.in_index = in_index
if input_transform is not None:
assert isinstance(in_channels, (list, tuple))
assert isinstance(in_index, (list, tuple))
assert len(in_channels) == len(in_index)
if input_transform == 'resize_concat':
self.in_channels = sum(in_channels)
else:
self.in_channels = in_channels
else:
assert isinstance(in_channels, int)
assert isinstance(in_index, int)
self.in_channels = in_channels

def _transform_inputs(self, inputs):
"""Transform inputs for decoder.

Args:
inputs (list[Tensor]): List of multi-level img features.

Returns:
Tensor: The transformed inputs
"""

if self.input_transform == 'resize_concat':
inputs = [inputs[i] for i in self.in_index]
upsampled_inputs = [
seg_resize(
input=x,
size=inputs[0].shape[2:],
mode='bilinear',
align_corners=self.align_corners) for x in inputs
]
inputs = torch.cat(upsampled_inputs, dim=1)
elif self.input_transform == 'multiple_select':
inputs = [inputs[i] for i in self.in_index]
else:
inputs = inputs[self.in_index]

return inputs

@auto_fp16()
@abstractmethod
def forward(self, inputs):
"""Placeholder of forward function."""
pass

def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg):
"""Forward function for training.
Args:
inputs (list[Tensor]): List of multi-level img features.
img_metas (list[dict]): List of image info dict where each dict
has: 'img_shape', 'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
`mmseg/datasets/pipelines/formatting.py:Collect`.
gt_semantic_seg (Tensor): Semantic segmentation masks
used if the architecture supports semantic segmentation task.
train_cfg (dict): The training config.

Returns:
dict[str, Tensor]: a dictionary of loss components
"""
seg_logits = self.forward(inputs)
losses = self.losses(seg_logits, gt_semantic_seg)
return losses

def forward_test(self, inputs, img_metas, test_cfg):
"""Forward function for testing.

Args:
inputs (list[Tensor]): List of multi-level img features.
img_metas (list[dict]): List of image info dict where each dict
has: 'img_shape', 'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
`mmseg/datasets/pipelines/formatting.py:Collect`.
test_cfg (dict): The testing config.

Returns:
Tensor: Output segmentation map.
"""
return self.forward(inputs)

def cls_seg(self, feat):
"""Classify each pixel."""
if self.dropout is not None:
feat = self.dropout(feat)
output = self.conv_seg(feat)
return output

@force_fp32(apply_to=('seg_logit', ))
def losses(self, seg_logit, seg_label):
"""Compute segmentation loss."""
loss = dict()
seg_logit = seg_resize(
input=seg_logit,
size=seg_label.shape[2:],
mode='bilinear',
align_corners=self.align_corners)
if self.sampler is not None:
seg_weight = self.sampler.sample(seg_logit, seg_label)
else:
seg_weight = None
seg_label = seg_label.squeeze(1)

if not isinstance(self.loss_decode, nn.ModuleList):
losses_decode = [self.loss_decode]
else:
losses_decode = self.loss_decode
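        # losses sharing the same `loss_name` are summed into one entry of the
        # returned dict; distinct names become separate entries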
for loss_decode in losses_decode:
if loss_decode.loss_name not in loss:
loss[loss_decode.loss_name] = loss_decode(
seg_logit,
seg_label,
weight=seg_weight,
ignore_index=self.ignore_index)
else:
loss[loss_decode.loss_name] += loss_decode(
seg_logit,
seg_label,
weight=seg_weight,
ignore_index=self.ignore_index)

loss['acc_seg'] = accuracy(
seg_logit, seg_label, ignore_index=self.ignore_index)
return loss

+ 581
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py

@@ -0,0 +1,581 @@
# The implementation refers to ViT-Adapter,
# available at
# https://github.com/czczup/ViT-Adapter.git

import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Conv2d, build_plugin_layer, caffe2_xavier_init
from mmcv.cnn.bricks.transformer import (build_positional_encoding,
build_transformer_layer_sequence)
from mmcv.ops import point_sample
from mmcv.runner import ModuleList, force_fp32
from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.utils import get_uncertain_point_coords_with_randomness

from .base_decode_head import BaseDecodeHead


@HEADS.register_module()
class Mask2FormerHeadFromMMSeg(BaseDecodeHead):
"""Implements the Mask2Former head.

See `Masked-attention Mask Transformer for Universal Image
Segmentation <https://arxiv.org/pdf/2112.01527>`_ for details.

Args:
in_channels (list[int]): Number of channels in the input feature map.
feat_channels (int): Number of channels for features.
out_channels (int): Number of channels for output.
num_things_classes (int): Number of things.
num_stuff_classes (int): Number of stuff.
        num_queries (int): Number of queries in the Transformer decoder.
pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel
decoder. Defaults to None.
enforce_decoder_input_project (bool, optional): Whether to add
            a layer to change the embed_dim of the transformer encoder in
pixel decoder to the embed_dim of transformer decoder.
Defaults to False.
transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for
transformer decoder. Defaults to None.
positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for
transformer decoder position encoding. Defaults to None.
loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification
loss. Defaults to None.
loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss.
Defaults to None.
loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss.
Defaults to None.
train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of
Mask2Former head.
test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of
Mask2Former head.
init_cfg (dict or list[dict], optional): Initialization config dict.
Defaults to None.
"""

def __init__(self,
in_channels,
feat_channels,
out_channels,
num_things_classes=80,
num_stuff_classes=53,
num_queries=100,
num_transformer_feat_level=3,
pixel_decoder=None,
enforce_decoder_input_project=False,
transformer_decoder=None,
positional_encoding=None,
loss_cls=None,
loss_mask=None,
loss_dice=None,
train_cfg=None,
test_cfg=None,
init_cfg=None,
**kwargs):
super(Mask2FormerHeadFromMMSeg, self).__init__(
in_channels=in_channels,
channels=feat_channels,
num_classes=(num_things_classes + num_stuff_classes),
init_cfg=init_cfg,
input_transform='multiple_select',
**kwargs)
self.num_things_classes = num_things_classes
self.num_stuff_classes = num_stuff_classes
self.num_classes = self.num_things_classes + self.num_stuff_classes
self.num_queries = num_queries
self.num_transformer_feat_level = num_transformer_feat_level
self.num_heads = transformer_decoder.transformerlayers. \
attn_cfgs.num_heads
self.num_transformer_decoder_layers = transformer_decoder.num_layers
assert pixel_decoder.encoder.transformerlayers.attn_cfgs.num_levels == num_transformer_feat_level
pixel_decoder_ = copy.deepcopy(pixel_decoder)
pixel_decoder_.update(
in_channels=in_channels,
feat_channels=feat_channels,
out_channels=out_channels)
self.pixel_decoder = build_plugin_layer(pixel_decoder_)[1]
self.transformer_decoder = build_transformer_layer_sequence(
transformer_decoder)
self.decoder_embed_dims = self.transformer_decoder.embed_dims

self.decoder_input_projs = ModuleList()
# from low resolution to high resolution
for _ in range(num_transformer_feat_level):
if (self.decoder_embed_dims != feat_channels
or enforce_decoder_input_project):
self.decoder_input_projs.append(
Conv2d(
feat_channels, self.decoder_embed_dims, kernel_size=1))
else:
self.decoder_input_projs.append(nn.Identity())
self.decoder_positional_encoding = build_positional_encoding(
positional_encoding)
self.query_embed = nn.Embedding(self.num_queries, feat_channels)
self.query_feat = nn.Embedding(self.num_queries, feat_channels)
# from low resolution to high resolution
self.level_embed = nn.Embedding(self.num_transformer_feat_level,
feat_channels)

self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
self.mask_embed = nn.Sequential(
nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
nn.Linear(feat_channels, out_channels))
        self.conv_seg = None  # conv_seg from BaseDecodeHead is not used by this head

self.test_cfg = test_cfg
self.train_cfg = train_cfg
if train_cfg:
self.assigner = build_assigner(self.train_cfg.assigner)
self.sampler = build_sampler(self.train_cfg.sampler, context=self)
self.num_points = self.train_cfg.get('num_points', 12544)
self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0)
self.importance_sample_ratio = self.train_cfg.get(
'importance_sample_ratio', 0.75)
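            # PointRend-style point sampling: mask losses are evaluated on
            # `num_points` sampled points per mask instead of full-resolution
            # masks; `oversample_ratio` controls candidate over-sampling and
            # `importance_sample_ratio` the fraction taken from uncertain points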

self.class_weight = loss_cls.class_weight
self.loss_cls = build_loss(loss_cls)
self.loss_mask = build_loss(loss_mask)
self.loss_dice = build_loss(loss_dice)

def init_weights(self):
for m in self.decoder_input_projs:
if isinstance(m, Conv2d):
caffe2_xavier_init(m, bias=0)

self.pixel_decoder.init_weights()

for p in self.transformer_decoder.parameters():
if p.dim() > 1:
nn.init.xavier_normal_(p)

def get_targets(self, cls_scores_list, mask_preds_list, gt_labels_list,
gt_masks_list, img_metas):
"""Compute classification and mask targets for all images for a decoder
layer.

Args:
cls_scores_list (list[Tensor]): Mask score logits from a single
decoder layer for all images. Each with shape [num_queries,
cls_out_channels].
mask_preds_list (list[Tensor]): Mask logits from a single decoder
layer for all images. Each with shape [num_queries, h, w].
gt_labels_list (list[Tensor]): Ground truth class indices for all
                images. Each with shape (n, ), where n is the sum of the number
                of stuff classes and the number of instances in an image.
gt_masks_list (list[Tensor]): Ground truth mask for each image,
each with shape (n, h, w).
img_metas (list[dict]): List of image meta information.

Returns:
tuple[list[Tensor]]: a tuple containing the following targets.

- labels_list (list[Tensor]): Labels of all images.
Each with shape [num_queries, ].
- label_weights_list (list[Tensor]): Label weights of all
images.Each with shape [num_queries, ].
- mask_targets_list (list[Tensor]): Mask targets of all images.
Each with shape [num_queries, h, w].
- mask_weights_list (list[Tensor]): Mask weights of all images.
Each with shape [num_queries, ].
- num_total_pos (int): Number of positive samples in all
images.
- num_total_neg (int): Number of negative samples in all
images.
"""
(labels_list, label_weights_list, mask_targets_list, mask_weights_list,
pos_inds_list,
neg_inds_list) = multi_apply(self._get_target_single, cls_scores_list,
mask_preds_list, gt_labels_list,
gt_masks_list, img_metas)

num_total_pos = sum((inds.numel() for inds in pos_inds_list))
num_total_neg = sum((inds.numel() for inds in neg_inds_list))
return (labels_list, label_weights_list, mask_targets_list,
mask_weights_list, num_total_pos, num_total_neg)

def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks,
img_metas):
"""Compute classification and mask targets for one image.

Args:
cls_score (Tensor): Mask score logits from a single decoder layer
for one image. Shape (num_queries, cls_out_channels).
mask_pred (Tensor): Mask logits for a single decoder layer for one
image. Shape (num_queries, h, w).
gt_labels (Tensor): Ground truth class indices for one image with
shape (num_gts, ).
gt_masks (Tensor): Ground truth mask for each image, each with
shape (num_gts, h, w).
            img_metas (dict): Image information.

Returns:
tuple[Tensor]: A tuple containing the following for one image.

- labels (Tensor): Labels of each image. \
shape (num_queries, ).
- label_weights (Tensor): Label weights of each image. \
shape (num_queries, ).
- mask_targets (Tensor): Mask targets of each image. \
shape (num_queries, h, w).
- mask_weights (Tensor): Mask weights of each image. \
shape (num_queries, ).
- pos_inds (Tensor): Sampled positive indices for each \
image.
- neg_inds (Tensor): Sampled negative indices for each \
image.
"""
# sample points
num_queries = cls_score.shape[0]
num_gts = gt_labels.shape[0]

point_coords = torch.rand((1, self.num_points, 2),
device=cls_score.device)
# shape (num_queries, num_points)
mask_points_pred = point_sample(
mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1,
1)).squeeze(1)
# shape (num_gts, num_points)
gt_points_masks = point_sample(
gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1,
1)).squeeze(1)

# assign and sample
assign_result = self.assigner.assign(cls_score, mask_points_pred,
gt_labels, gt_points_masks,
img_metas)
sampling_result = self.sampler.sample(assign_result, mask_pred,
gt_masks)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds

# label target
labels = gt_labels.new_full((self.num_queries, ),
self.num_classes,
dtype=torch.long)
labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
label_weights = gt_labels.new_ones((self.num_queries, ))

# mask target
mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds]
mask_weights = mask_pred.new_zeros((self.num_queries, ))
mask_weights[pos_inds] = 1.0

return (labels, label_weights, mask_targets, mask_weights, pos_inds,
neg_inds)

def loss_single(self, cls_scores, mask_preds, gt_labels_list,
gt_masks_list, img_metas):
"""Loss function for outputs from a single decoder layer.

Args:
cls_scores (Tensor): Mask score logits from a single decoder layer
for all images. Shape (batch_size, num_queries,
                cls_out_channels). Note `cls_out_channels` should include
                background.
mask_preds (Tensor): Mask logits for a pixel decoder for all
images. Shape (batch_size, num_queries, h, w).
gt_labels_list (list[Tensor]): Ground truth class indices for each
image, each with shape (num_gts, ).
gt_masks_list (list[Tensor]): Ground truth mask for each image,
each with shape (num_gts, h, w).
img_metas (list[dict]): List of image meta information.

Returns:
tuple[Tensor]: Loss components for outputs from a single \
decoder layer.
"""
num_imgs = cls_scores.size(0)
cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
mask_preds_list = [mask_preds[i] for i in range(num_imgs)]
(labels_list, label_weights_list, mask_targets_list, mask_weights_list,
num_total_pos,
num_total_neg) = self.get_targets(cls_scores_list, mask_preds_list,
gt_labels_list, gt_masks_list,
img_metas)
# shape (batch_size, num_queries)
labels = torch.stack(labels_list, dim=0)
# shape (batch_size, num_queries)
label_weights = torch.stack(label_weights_list, dim=0)
# shape (num_total_gts, h, w)
mask_targets = torch.cat(mask_targets_list, dim=0)
# shape (batch_size, num_queries)
mask_weights = torch.stack(mask_weights_list, dim=0)

        # classification loss
# shape (batch_size * num_queries, )
cls_scores = cls_scores.flatten(0, 1)
labels = labels.flatten(0, 1)
label_weights = label_weights.flatten(0, 1)

class_weight = cls_scores.new_tensor(self.class_weight)
loss_cls = self.loss_cls(
cls_scores,
labels,
label_weights,
avg_factor=class_weight[labels].sum())

num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos]))
num_total_masks = max(num_total_masks, 1)

# extract positive ones
# shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w)
mask_preds = mask_preds[mask_weights > 0]

if mask_targets.shape[0] == 0:
# zero match
loss_dice = mask_preds.sum()
loss_mask = mask_preds.sum()
return loss_cls, loss_mask, loss_dice

with torch.no_grad():
points_coords = get_uncertain_point_coords_with_randomness(
mask_preds.unsqueeze(1), None, self.num_points,
self.oversample_ratio, self.importance_sample_ratio)
# shape (num_total_gts, h, w) -> (num_total_gts, num_points)
mask_point_targets = point_sample(
mask_targets.unsqueeze(1).float(), points_coords).squeeze(1)
# shape (num_queries, h, w) -> (num_queries, num_points)
mask_point_preds = point_sample(
mask_preds.unsqueeze(1), points_coords).squeeze(1)

# dice loss
loss_dice = self.loss_dice(
mask_point_preds, mask_point_targets, avg_factor=num_total_masks)

# mask loss
# shape (num_queries, num_points) -> (num_queries * num_points, )
mask_point_preds = mask_point_preds.reshape(-1, 1)
# shape (num_total_gts, num_points) -> (num_total_gts * num_points, )
mask_point_targets = mask_point_targets.reshape(-1)
loss_mask = self.loss_mask(
mask_point_preds,
mask_point_targets,
avg_factor=num_total_masks * self.num_points)

return loss_cls, loss_mask, loss_dice

@force_fp32(apply_to=('all_cls_scores', 'all_mask_preds'))
def loss(self, all_cls_scores, all_mask_preds, gt_labels_list,
gt_masks_list, img_metas):
"""Loss function.

Args:
all_cls_scores (Tensor): Classification scores for all decoder
layers with shape [num_decoder, batch_size, num_queries,
cls_out_channels].
all_mask_preds (Tensor): Mask scores for all decoder layers with
shape [num_decoder, batch_size, num_queries, h, w].
gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (n, ), where n is the sum of the number of
                stuff classes and the number of instances in an image.
gt_masks_list (list[Tensor]): Ground truth mask for each image with
shape (n, h, w).
img_metas (list[dict]): List of image meta information.

Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
num_dec_layers = len(all_cls_scores)
all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
all_gt_masks_list = [gt_masks_list for _ in range(num_dec_layers)]
img_metas_list = [img_metas for _ in range(num_dec_layers)]
losses_cls, losses_mask, losses_dice = multi_apply(
self.loss_single, all_cls_scores, all_mask_preds,
all_gt_labels_list, all_gt_masks_list, img_metas_list)

loss_dict = dict()
# loss from the last decoder layer
loss_dict['loss_cls'] = losses_cls[-1]
loss_dict['loss_mask'] = losses_mask[-1]
loss_dict['loss_dice'] = losses_dice[-1]
# loss from other decoder layers
num_dec_layer = 0
for loss_cls_i, loss_mask_i, loss_dice_i in zip(
losses_cls[:-1], losses_mask[:-1], losses_dice[:-1]):
loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
loss_dict[f'd{num_dec_layer}.loss_mask'] = loss_mask_i
loss_dict[f'd{num_dec_layer}.loss_dice'] = loss_dice_i
num_dec_layer += 1
return loss_dict

def forward_head(self, decoder_out, mask_feature, attn_mask_target_size):
"""Forward for head part which is called after every decoder layer.

Args:
decoder_out (Tensor): in shape (num_queries, batch_size, c).
mask_feature (Tensor): in shape (batch_size, c, h, w).
attn_mask_target_size (tuple[int, int]): target attention
mask size.

Returns:
            tuple: A tuple containing three elements.

                - cls_pred (Tensor): Classification scores in shape \
                    (batch_size, num_queries, cls_out_channels). \
                    Note `cls_out_channels` should include background.
                - mask_pred (Tensor): Mask scores in shape \
                    (batch_size, num_queries, h, w).
                - attn_mask (Tensor): Attention mask in shape \
                    (batch_size * num_heads, num_queries, h*w).
"""
decoder_out = self.transformer_decoder.post_norm(decoder_out)
decoder_out = decoder_out.transpose(0, 1)
        # shape (batch_size, num_queries, cls_out_channels)
cls_pred = self.cls_embed(decoder_out)
        # shape (batch_size, num_queries, c)
mask_embed = self.mask_embed(decoder_out)
        # shape (batch_size, num_queries, h, w)
mask_pred = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_feature)
attn_mask = F.interpolate(
mask_pred,
attn_mask_target_size,
mode='bilinear',
align_corners=False)
        # shape (batch_size, num_queries, h, w) ->
        # (batch_size * num_heads, num_queries, h*w)
attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat(
(1, self.num_heads, 1, 1)).flatten(0, 1)
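        # boolean mask convention: True marks positions that cross-attention
        # must NOT attend to, so queries only attend where the current
        # prediction is foreground (sigmoid >= 0.5)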
attn_mask = attn_mask.sigmoid() < 0.5
attn_mask = attn_mask.detach()

return cls_pred, mask_pred, attn_mask

def forward(self, feats, img_metas):
"""Forward function.

Args:
            feats (list[Tensor]): Multi-scale features from the
upstream network, each is a 4D-tensor.
img_metas (list[dict]): List of image information.

Returns:
tuple: A tuple contains two elements.

                - cls_pred_list (list[Tensor]): Classification logits \
                    for each decoder layer. Each is a 3D-tensor with shape \
                    (batch_size, num_queries, cls_out_channels). \
                    Note `cls_out_channels` should include background.
- mask_pred_list (list[Tensor]): Mask logits for each \
decoder layer. Each with shape (batch_size, num_queries, \
h, w).
"""
batch_size = len(img_metas)
mask_features, multi_scale_memorys = self.pixel_decoder(feats)
# multi_scale_memorys (from low resolution to high resolution)
decoder_inputs = []
decoder_positional_encodings = []
for i in range(self.num_transformer_feat_level):
decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i])
# shape (batch_size, c, h, w) -> (h*w, batch_size, c)
decoder_input = decoder_input.flatten(2).permute(2, 0, 1)
level_embed = self.level_embed.weight[i].view(1, 1, -1)
decoder_input = decoder_input + level_embed
            # an all-zero (no padding) key mask of shape (batch_size, h, w)
mask = decoder_input.new_zeros(
(batch_size, ) + multi_scale_memorys[i].shape[-2:],
dtype=torch.bool)
decoder_positional_encoding = self.decoder_positional_encoding(
mask)
decoder_positional_encoding = decoder_positional_encoding.flatten(
2).permute(2, 0, 1)
decoder_inputs.append(decoder_input)
decoder_positional_encodings.append(decoder_positional_encoding)
# shape (num_queries, c) -> (num_queries, batch_size, c)
query_feat = self.query_feat.weight.unsqueeze(1).repeat(
(1, batch_size, 1))
query_embed = self.query_embed.weight.unsqueeze(1).repeat(
(1, batch_size, 1))

cls_pred_list = []
mask_pred_list = []
cls_pred, mask_pred, attn_mask = self.forward_head(
query_feat, mask_features, multi_scale_memorys[0].shape[-2:])
cls_pred_list.append(cls_pred)
mask_pred_list.append(mask_pred)

for i in range(self.num_transformer_decoder_layers):
level_idx = i % self.num_transformer_feat_level
            # if a mask is all True (all background), set it to all False.
attn_mask[torch.where(
attn_mask.sum(-1) == attn_mask.shape[-1])] = False

# cross_attn + self_attn
layer = self.transformer_decoder.layers[i]
attn_masks = [attn_mask, None]
query_feat = layer(
query=query_feat,
key=decoder_inputs[level_idx],
value=decoder_inputs[level_idx],
query_pos=query_embed,
key_pos=decoder_positional_encodings[level_idx],
attn_masks=attn_masks,
query_key_padding_mask=None,
# here we do not apply masking on padded region
key_padding_mask=None)
cls_pred, mask_pred, attn_mask = self.forward_head(
query_feat, mask_features, multi_scale_memorys[
(i + 1) % self.num_transformer_feat_level].shape[-2:])

cls_pred_list.append(cls_pred)
mask_pred_list.append(mask_pred)

return cls_pred_list, mask_pred_list

def forward_train(self, x, img_metas, gt_semantic_seg, gt_labels,
gt_masks):
"""Forward function for training mode.

Args:
x (list[Tensor]): Multi-level features from the upstream network,
each is a 4D-tensor.
img_metas (list[Dict]): List of image information.
            gt_semantic_seg (list[Tensor]): Each element is the ground truth
                of semantic segmentation with the shape (N, H, W).
            train_cfg (dict): The training config, which is not used in
                MaskFormer.
            gt_labels (list[Tensor]): Each element is ground truth labels of
                each box, shape (num_gts,).
            gt_masks (list[BitmapMasks]): Each element is masks of instances
                of an image, shape (num_gts, h, w).

Returns:
losses (dict[str, Tensor]): a dictionary of loss components
"""

# forward
all_cls_scores, all_mask_preds = self(x, img_metas)

# loss
losses = self.loss(all_cls_scores, all_mask_preds, gt_labels, gt_masks,
img_metas)

return losses

def forward_test(self, inputs, img_metas, test_cfg):
"""Test segment without test-time aumengtation.

Only the output of last decoder layers was used.

Args:
inputs (list[Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
img_metas (list[dict]): List of image information.
test_cfg (dict): Testing config.

Returns:
seg_mask (Tensor): Predicted semantic segmentation logits.
"""
all_cls_scores, all_mask_preds = self(inputs, img_metas)
cls_score, mask_pred = all_cls_scores[-1], all_mask_preds[-1]
ori_h, ori_w, _ = img_metas[0]['ori_shape']

# semantic inference
cls_score = F.softmax(cls_score, dim=-1)[..., :-1]
mask_pred = mask_pred.sigmoid()
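        # per-pixel semantic logits: query class probabilities (background
        # column dropped above) weighted by the per-query mask probabilities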
seg_mask = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred)
return seg_mask

+ 3
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py

@@ -0,0 +1,3 @@
from .encoder_decoder_mask2former import EncoderDecoderMask2Former

__all__ = ['EncoderDecoderMask2Former']

+ 314
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py

@@ -0,0 +1,314 @@
# The implementation refers to ViT-Adapter,
# available at
# https://github.com/czczup/ViT-Adapter.git
import warnings
from abc import ABCMeta, abstractmethod
from collections import OrderedDict

import mmcv
import numpy as np
import torch
import torch.distributed as dist
from mmcv.runner import BaseModule, auto_fp16


class BaseSegmentor(BaseModule, metaclass=ABCMeta):
"""Base class for segmentors."""

def __init__(self, init_cfg=None):
super(BaseSegmentor, self).__init__(init_cfg)
self.fp16_enabled = False

@property
def with_neck(self):
"""bool: whether the segmentor has neck"""
return hasattr(self, 'neck') and self.neck is not None

@property
def with_auxiliary_head(self):
"""bool: whether the segmentor has auxiliary head"""
return hasattr(self,
'auxiliary_head') and self.auxiliary_head is not None

@property
def with_decode_head(self):
"""bool: whether the segmentor has decode head"""
return hasattr(self, 'decode_head') and self.decode_head is not None

@abstractmethod
def extract_feat(self, imgs):
"""Placeholder for extract features from images."""
pass

@abstractmethod
def encode_decode(self, img, img_metas):
"""Placeholder for encode images with backbone and decode into a
semantic segmentation map of the same size as input."""
pass

@abstractmethod
def forward_train(self, imgs, img_metas, **kwargs):
"""Placeholder for Forward function for training."""
pass

@abstractmethod
def simple_test(self, img, img_meta, **kwargs):
"""Placeholder for single image test."""
pass

@abstractmethod
def aug_test(self, imgs, img_metas, **kwargs):
"""Placeholder for augmentation test."""
pass

def forward_test(self, imgs, img_metas, **kwargs):
"""
Args:
imgs (List[Tensor]): the outer list indicates test-time
augmentations and inner Tensor should have a shape NxCxHxW,
which contains all images in the batch.
img_metas (List[List[dict]]): the outer list indicates test-time
augs (multiscale, flip, etc.) and the inner list indicates
images in a batch.
"""
for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError(f'{name} must be a list, but got '
f'{type(var)}')

num_augs = len(imgs)
if num_augs != len(img_metas):
raise ValueError(f'num of augmentations ({len(imgs)}) != '
f'num of image meta ({len(img_metas)})')

        # all images in the same aug batch must have the same ori_shape and
        # pad_shape
def tensor_to_tuple(input_tensor):
return tuple(input_tensor.cpu().numpy())

for img_meta in img_metas:
ori_shapes = [_['ori_shape'] for _ in img_meta]
if isinstance(ori_shapes[0], torch.Tensor):
assert all(
tensor_to_tuple(shape) == tensor_to_tuple(ori_shapes[0])
for shape in ori_shapes)
else:
assert all(shape == ori_shapes[0] for shape in ori_shapes)

img_shapes = [_['img_shape'] for _ in img_meta]
if isinstance(img_shapes[0], torch.Tensor):
assert all(
tensor_to_tuple(shape) == tensor_to_tuple(img_shapes[0])
for shape in img_shapes)
else:
assert all(shape == img_shapes[0] for shape in img_shapes)

pad_shapes = [_['pad_shape'] for _ in img_meta]
if isinstance(pad_shapes[0], torch.Tensor):
assert all(
tensor_to_tuple(shape) == tensor_to_tuple(pad_shapes[0])
for shape in pad_shapes)
else:
assert all(shape == pad_shapes[0] for shape in pad_shapes)

if num_augs == 1:
return self.simple_test(imgs[0], img_metas[0], **kwargs)
else:
return self.aug_test(imgs, img_metas, **kwargs)

@auto_fp16(apply_to=('img', ))
def forward(self, img, img_metas, return_loss=True, **kwargs):
"""Calls either :func:`forward_train` or :func:`forward_test` depending
on whether ``return_loss`` is ``True``.

Note this setting will change the expected inputs. When
``return_loss=True``, img and img_meta are single-nested (i.e. Tensor
and List[dict]), and when ``return_loss=False``, img and img_meta
should be double nested (i.e. List[Tensor], List[List[dict]]), with
the outer list indicating test time augmentations.
"""
if return_loss:
return self.forward_train(img, img_metas, **kwargs)
else:
return self.forward_test(img, img_metas, **kwargs)

def train_step(self, data_batch, optimizer, **kwargs):
"""The iteration step during training.

This method defines an iteration step during training, except for the
back propagation and optimizer updating, which are done in an optimizer
hook. Note that in some complicated cases or models, the whole process
including back propagation and optimizer updating is also defined in
this method, such as GAN.

Args:
data_batch (dict): The output of dataloader.
optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
runner is passed to ``train_step()``. This argument is unused
and reserved.

Returns:
dict: It should contain at least 3 keys: ``loss``, ``log_vars``,
``num_samples``.
``loss`` is a tensor for back propagation, which can be a
weighted sum of multiple losses.
``log_vars`` contains all the variables to be sent to the
logger.
``num_samples`` indicates the batch size (when the model is
DDP, it means the batch size on each GPU), which is used for
averaging the logs.
"""
losses = self(**data_batch)
loss, log_vars = self._parse_losses(losses)

outputs = dict(
loss=loss,
log_vars=log_vars,
num_samples=len(data_batch['img_metas']))

return outputs

def val_step(self, data_batch, optimizer=None, **kwargs):
"""The iteration step during validation.

This method shares the same signature as :func:`train_step`, but is
used during val epochs. Note that the evaluation after training epochs
is not implemented with this method, but with an evaluation hook.
"""
losses = self(**data_batch)
loss, log_vars = self._parse_losses(losses)

log_vars_ = dict()
for loss_name, loss_value in log_vars.items():
k = loss_name + '_val'
log_vars_[k] = loss_value

outputs = dict(
loss=loss,
log_vars=log_vars_,
num_samples=len(data_batch['img_metas']))

return outputs

@staticmethod
def _parse_losses(losses):
"""Parse the raw outputs (losses) of the network.

Args:
losses (dict): Raw output of the network, which usually contain
losses and other necessary information.

Returns:
tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor
which may be a weighted sum of all losses, log_vars contains
all the variables to be sent to the logger.
"""
log_vars = OrderedDict()
for loss_name, loss_value in losses.items():
if isinstance(loss_value, torch.Tensor):
log_vars[loss_name] = loss_value.mean()
elif isinstance(loss_value, list):
log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
else:
raise TypeError(
f'{loss_name} is not a tensor or list of tensors')

loss = sum(_value for _key, _value in log_vars.items()
if 'loss' in _key)

# If log_vars has a different length across GPUs, raise an assertion
# error to prevent the GPUs from waiting on each other indefinitely.
if dist.is_available() and dist.is_initialized():
log_var_length = torch.tensor(len(log_vars), device=loss.device)
dist.all_reduce(log_var_length)
message = (f'rank {dist.get_rank()}'
+ f' len(log_vars): {len(log_vars)}' + ' keys: '
+ ','.join(log_vars.keys()) + '\n')
assert log_var_length == len(log_vars) * dist.get_world_size(), \
'loss log variables are different across GPUs!\n' + message

log_vars['loss'] = loss
for loss_name, loss_value in log_vars.items():
# reduce loss when distributed training
if dist.is_available() and dist.is_initialized():
loss_value = loss_value.data.clone()
dist.all_reduce(loss_value.div_(dist.get_world_size()))
log_vars[loss_name] = loss_value.item()

return loss, log_vars

def show_result(self,
img,
result,
palette=None,
win_name='',
show=False,
wait_time=0,
out_file=None,
opacity=0.5):
"""Draw `result` over `img`.

Args:
img (str or Tensor): The image to be displayed.
result (Tensor): The semantic segmentation results to draw over
`img`.
palette (list[list[int]] | np.ndarray | None): The palette of the
segmentation map. If None is given, a random palette will be
generated. Default: None
win_name (str): The window name.
wait_time (int): Value of waitKey param.
Default: 0.
show (bool): Whether to show the image.
Default: False.
out_file (str or None): The filename to write the image.
Default: None.
opacity(float): Opacity of painted segmentation map.
Default 0.5.
Must be in (0, 1] range.
Returns:
img (ndarray): The drawn image; only returned if neither `show` nor
`out_file` is set.
"""
img = mmcv.imread(img)
img = img.copy()
seg = result[0]
if palette is None:
if self.PALETTE is None:
# Get random state before set seed,
# and restore random state later.
# It will prevent loss of randomness, as the palette
# may be different in each iteration if not specified.
# See: https://github.com/open-mmlab/mmdetection/issues/5844
state = np.random.get_state()
np.random.seed(42)
# random palette
palette = np.random.randint(
0, 255, size=(len(self.CLASSES), 3))
np.random.set_state(state)
else:
palette = self.PALETTE
palette = np.array(palette)
assert palette.shape[0] == len(self.CLASSES)
assert palette.shape[1] == 3
assert len(palette.shape) == 2
assert 0 < opacity <= 1.0
color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
for label, color in enumerate(palette):
color_seg[seg == label, :] = color
# convert to BGR
color_seg = color_seg[..., ::-1]

img = img * (1 - opacity) + color_seg * opacity
img = img.astype(np.uint8)
# if out_file specified, do not show image in window
if out_file is not None:
show = False

if show:
mmcv.imshow(img, win_name, wait_time)
if out_file is not None:
mmcv.imwrite(img, out_file)

if not (show or out_file):
warnings.warn('show==False and out_file is not specified, only '
'result image will be returned')
return img
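
As a quick illustration of the loss bookkeeping above, a minimal sketch of what `_parse_losses` returns for a raw loss dict, assuming mmcv/mmdet are importable and no distributed process group is initialized; the values below are made up:

import torch

from modelscope.models.cv.image_semantic_segmentation.vit_adapter.models.segmentors.base_segmentor import BaseSegmentor

raw_losses = {
    'loss_ce': torch.tensor([0.8, 1.2]),                   # averaged to ~1.0
    'loss_mask': [torch.tensor(0.3), torch.tensor(0.5)],   # means summed to ~0.8
    'acc_seg': torch.tensor(72.5),                         # logged, not added to the loss
}

loss, log_vars = BaseSegmentor._parse_losses(raw_losses)
print(loss)      # tensor(1.8000) -- sum of the entries whose key contains 'loss'
print(log_vars)  # approximately {'loss_ce': 1.0, 'loss_mask': 0.8, 'acc_seg': 72.5, 'loss': 1.8}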

+ 303
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py View File

@@ -0,0 +1,303 @@
# The implementation refers to the VitAdapter
# available at
# https://github.com/czczup/ViT-Adapter.git
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.models import builder
from mmdet.models.builder import DETECTORS

from ...utils import add_prefix, seg_resize
from .base_segmentor import BaseSegmentor


@DETECTORS.register_module()
class EncoderDecoderMask2Former(BaseSegmentor):
"""Encoder Decoder segmentors.

EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
Note that auxiliary_head is only used for deep supervision during training
and can be discarded at inference time.
"""

def __init__(self,
backbone,
decode_head,
neck=None,
auxiliary_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
init_cfg=None):
super(EncoderDecoderMask2Former, self).__init__(init_cfg)
if pretrained is not None:
assert backbone.get('pretrained') is None, \
'both backbone and segmentor set pretrained weight'
backbone.pretrained = pretrained
self.backbone = builder.build_backbone(backbone)
if neck is not None:
self.neck = builder.build_neck(neck)
decode_head.update(train_cfg=train_cfg)
decode_head.update(test_cfg=test_cfg)
self._init_decode_head(decode_head)
self._init_auxiliary_head(auxiliary_head)

self.train_cfg = train_cfg
self.test_cfg = test_cfg

assert self.with_decode_head

def _init_decode_head(self, decode_head):
"""Initialize ``decode_head``"""
self.decode_head = builder.build_head(decode_head)
self.align_corners = self.decode_head.align_corners
self.num_classes = self.decode_head.num_classes

def _init_auxiliary_head(self, auxiliary_head):
"""Initialize ``auxiliary_head``"""
if auxiliary_head is not None:
if isinstance(auxiliary_head, list):
self.auxiliary_head = nn.ModuleList()
for head_cfg in auxiliary_head:
self.auxiliary_head.append(builder.build_head(head_cfg))
else:
self.auxiliary_head = builder.build_head(auxiliary_head)

def extract_feat(self, img):
"""Extract features from images."""
x = self.backbone(img)
if self.with_neck:
x = self.neck(x)
return x

def encode_decode(self, img, img_metas):
"""Encode images with backbone and decode into a semantic segmentation
map of the same size as input."""
x = self.extract_feat(img)
out = self._decode_head_forward_test(x, img_metas)
out = seg_resize(
input=out,
size=img.shape[2:],
mode='bilinear',
align_corners=self.align_corners)
return out

def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg,
**kwargs):
"""Run forward function and calculate loss for decode head in
training."""
losses = dict()
loss_decode = self.decode_head.forward_train(x, img_metas,
gt_semantic_seg, **kwargs)

losses.update(add_prefix(loss_decode, 'decode'))
return losses

def _decode_head_forward_test(self, x, img_metas):
"""Run forward function and calculate loss for decode head in
inference."""
seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg)
return seg_logits

def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg):
"""Run forward function and calculate loss for auxiliary head in
training."""
losses = dict()
if isinstance(self.auxiliary_head, nn.ModuleList):
for idx, aux_head in enumerate(self.auxiliary_head):
loss_aux = aux_head.forward_train(x, img_metas,
gt_semantic_seg,
self.train_cfg)
losses.update(add_prefix(loss_aux, f'aux_{idx}'))
else:
loss_aux = self.auxiliary_head.forward_train(
x, img_metas, gt_semantic_seg, self.train_cfg)
losses.update(add_prefix(loss_aux, 'aux'))

return losses

def forward_dummy(self, img):
"""Dummy forward function."""
seg_logit = self.encode_decode(img, None)

return seg_logit

def forward_train(self, img, img_metas, gt_semantic_seg, **kwargs):
"""Forward function for training.

Args:
img (Tensor): Input images.
img_metas (list[dict]): List of image info dict where each dict
has: 'img_shape', 'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
`mmseg/datasets/pipelines/formatting.py:Collect`.
gt_semantic_seg (Tensor): Semantic segmentation masks
used if the architecture supports semantic segmentation task.

Returns:
dict[str, Tensor]: a dictionary of loss components
"""

x = self.extract_feat(img)

losses = dict()

loss_decode = self._decode_head_forward_train(x, img_metas,
gt_semantic_seg,
**kwargs)
losses.update(loss_decode)

if self.with_auxiliary_head:
loss_aux = self._auxiliary_head_forward_train(
x, img_metas, gt_semantic_seg)
losses.update(loss_aux)

return losses

# TODO refactor
def slide_inference(self, img, img_meta, rescale):
"""Inference by sliding-window with overlap.

If h_crop > h_img or w_crop > w_img, the small patch will be used to
decode without padding.
"""

h_stride, w_stride = self.test_cfg.stride
h_crop, w_crop = self.test_cfg.crop_size
batch_size, _, h_img, w_img = img.size()
num_classes = self.num_classes
h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
preds = img.new_zeros((batch_size, num_classes, h_img, w_img))
count_mat = img.new_zeros((batch_size, 1, h_img, w_img))
for h_idx in range(h_grids):
for w_idx in range(w_grids):
y1 = h_idx * h_stride
x1 = w_idx * w_stride
y2 = min(y1 + h_crop, h_img)
x2 = min(x1 + w_crop, w_img)
y1 = max(y2 - h_crop, 0)
x1 = max(x2 - w_crop, 0)
crop_img = img[:, :, y1:y2, x1:x2]
crop_seg_logit = self.encode_decode(crop_img, img_meta)
preds += F.pad(crop_seg_logit,
(int(x1), int(preds.shape[3] - x2), int(y1),
int(preds.shape[2] - y2)))

count_mat[:, :, y1:y2, x1:x2] += 1
assert (count_mat == 0).sum() == 0
if torch.onnx.is_in_onnx_export():
# cast count_mat to constant while exporting to ONNX
count_mat = torch.from_numpy(
count_mat.cpu().detach().numpy()).to(device=img.device)
preds = preds / count_mat

def tensor_to_tuple(input_tensor):
return tuple(input_tensor.cpu().numpy())

if rescale:
preds = seg_resize(
preds,
size=tensor_to_tuple(img_meta[0]['ori_shape'])[:2]
if isinstance(img_meta[0]['ori_shape'], torch.Tensor) else
img_meta[0]['ori_shape'],
mode='bilinear',
align_corners=self.align_corners,
warning=False)
return preds

def whole_inference(self, img, img_meta, rescale):
"""Inference with full image."""

seg_logit = self.encode_decode(img, img_meta)
if rescale:
# support dynamic shape for onnx
if torch.onnx.is_in_onnx_export():
size = img.shape[2:]
else:
size = img_meta[0]['ori_shape'][:2]
seg_logit = seg_resize(
seg_logit,
size=size,
mode='bilinear',
align_corners=self.align_corners,
warning=False)

return seg_logit

def inference(self, img, img_meta, rescale):
"""Inference with slide/whole style.

Args:
img (Tensor): The input image of shape (N, 3, H, W).
img_meta (dict): Image info dict where each dict has: 'img_shape',
'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
`mmseg/datasets/pipelines/formatting.py:Collect`.
rescale (bool): Whether rescale back to original shape.

Returns:
Tensor: The output segmentation map.
"""

assert self.test_cfg.mode in ['slide', 'whole']
ori_shape = img_meta[0]['ori_shape']

def tensor_to_tuple(input_tensor):
return tuple(input_tensor.cpu().numpy())

if isinstance(ori_shape, torch.Tensor):
assert all(
tensor_to_tuple(_['ori_shape']) == tensor_to_tuple(ori_shape)
for _ in img_meta)
else:
assert all(_['ori_shape'] == ori_shape for _ in img_meta)
if self.test_cfg.mode == 'slide':
seg_logit = self.slide_inference(img, img_meta, rescale)
else:
seg_logit = self.whole_inference(img, img_meta, rescale)
output = F.softmax(seg_logit, dim=1)
flip = img_meta[0]['flip']
if flip:
flip_direction = img_meta[0]['flip_direction']
assert flip_direction in ['horizontal', 'vertical']
if flip_direction == 'horizontal':
output = output.flip(dims=(3, ))
elif flip_direction == 'vertical':
output = output.flip(dims=(2, ))

return output

def simple_test(self, img, img_meta, rescale=True):
"""Simple test with single image."""
seg_logit = self.inference(img, img_meta, rescale)
seg_pred = seg_logit.argmax(dim=1)
if torch.onnx.is_in_onnx_export():
# our inference backend only support 4D output
seg_pred = seg_pred.unsqueeze(0)
return seg_pred
seg_pred = seg_pred.cpu().numpy()
# unravel batch dim
seg_pred = list(seg_pred)
return seg_pred

def aug_test(self, imgs, img_metas, rescale=True):
"""Test with augmentations.

Only rescale=True is supported.
"""
# aug_test rescale all imgs back to ori_shape for now
assert rescale
# to save memory, we get augmented seg logit inplace
seg_logit = self.inference(imgs[0], img_metas[0], rescale)
for i in range(1, len(imgs)):
cur_seg_logit = self.inference(imgs[i], img_metas[i], rescale)
seg_logit += cur_seg_logit
seg_logit /= len(imgs)
seg_pred = seg_logit.argmax(dim=1)
seg_pred = seg_pred.cpu().numpy()
# unravel batch dim
seg_pred = list(seg_pred)
return seg_pred
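
To make the sliding-window arithmetic in `slide_inference` concrete, a small standalone sketch of the crop layout; the image, crop and stride sizes are made up for illustration:

# 1024x1024 input, 512x512 crops, 341-pixel stride -> a 3x3 grid of crops
h_img = w_img = 1024
h_crop = w_crop = 512
h_stride = w_stride = 341

h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1  # 3
w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1  # 3

for h_idx in range(h_grids):
    for w_idx in range(w_grids):
        y1, x1 = h_idx * h_stride, w_idx * w_stride
        y2, x2 = min(y1 + h_crop, h_img), min(x1 + w_crop, w_img)
        y1, x1 = max(y2 - h_crop, 0), max(x2 - w_crop, 0)
        print((y1, y2, x1, x2))
# The crops overlap; count_mat records how many crops touch each pixel so the
# summed logits can be averaged before the optional rescale to ori_shape.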

+ 7
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py View File

@@ -0,0 +1,7 @@
from .builder import build_pixel_sampler
from .data_process_func import ResizeToMultiple
from .seg_func import add_prefix, seg_resize

__all__ = [
'seg_resize', 'add_prefix', 'build_pixel_sampler', 'ResizeToMultiple'
]

+ 11
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py View File

@@ -0,0 +1,11 @@
# The implementation refers to the VitAdapter
# available at
# https://github.com/czczup/ViT-Adapter.git
from mmcv.utils import Registry, build_from_cfg

PIXEL_SAMPLERS = Registry('pixel sampler')


def build_pixel_sampler(cfg, **default_args):
"""Build pixel sampler for segmentation map."""
return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args)
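
A minimal sketch of the registry round-trip, assuming mmcv is installed; `UniformPixelSampler` is a hypothetical class used only to illustrate registration and is not part of this diff:

from modelscope.models.cv.image_semantic_segmentation.vit_adapter.utils.builder import PIXEL_SAMPLERS, build_pixel_sampler


@PIXEL_SAMPLERS.register_module()
class UniformPixelSampler:
    """Hypothetical sampler, only to demonstrate the registry."""

    def __init__(self, num_samples=1024):
        self.num_samples = num_samples


sampler = build_pixel_sampler(dict(type='UniformPixelSampler', num_samples=512))
print(type(sampler).__name__, sampler.num_samples)  # UniformPixelSampler 512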

+ 60
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py View File

@@ -0,0 +1,60 @@
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
from mmdet.datasets.builder import PIPELINES


@PIPELINES.register_module()
class ResizeToMultiple(object):
"""Resize images & seg to multiple of divisor.

Args:
size_divisor (int): images and gt seg maps are resized to a multiple
of size_divisor. Default: 32.
interpolation (str, optional): The interpolation mode of image resize.
Default: None
"""

def __init__(self, size_divisor=32, interpolation=None):
self.size_divisor = size_divisor
self.interpolation = interpolation

def __call__(self, results):
"""Call function to resize images, semantic segmentation map to
multiple of size divisor.

Args:
results (dict): Result dict from loading pipeline.

Returns:
dict: Resized results, 'img_shape', 'pad_shape' keys are updated.
"""
# Align image to multiple of size divisor.
img = results['img']
img = mmcv.imresize_to_multiple(
img,
self.size_divisor,
scale_factor=1,
interpolation=self.interpolation
if self.interpolation else 'bilinear')

results['img'] = img
results['img_shape'] = img.shape
results['pad_shape'] = img.shape

# Align segmentation map to multiple of size divisor.
for key in results.get('seg_fields', []):
gt_seg = results[key]
gt_seg = mmcv.imresize_to_multiple(
gt_seg,
self.size_divisor,
scale_factor=1,
interpolation='nearest')
results[key] = gt_seg

return results

def __repr__(self):
repr_str = self.__class__.__name__
repr_str += (f'(size_divisor={self.size_divisor}, '
f'interpolation={self.interpolation})')
return repr_str
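
A minimal sketch of the transform on a dummy pipeline dict, assuming mmcv/mmdet are installed; the image size is made up:

import numpy as np

from modelscope.models.cv.image_semantic_segmentation.vit_adapter.utils.data_process_func import ResizeToMultiple

results = {
    'img': np.zeros((500, 333, 3), dtype=np.uint8),
    'gt_semantic_seg': np.zeros((500, 333), dtype=np.uint8),
    'seg_fields': ['gt_semantic_seg'],
}

results = ResizeToMultiple(size_divisor=32)(results)
print(results['img_shape'])              # (512, 352, 3): both sides rounded up to a multiple of 32
print(results['gt_semantic_seg'].shape)  # (512, 352), resized with nearest interpolation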

+ 48
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py View File

@@ -0,0 +1,48 @@
# The implementation refers to the VitAdapter
# available at
# https://github.com/czczup/ViT-Adapter.git

import warnings

import torch.nn.functional as F


def seg_resize(input,
size=None,
scale_factor=None,
mode='nearest',
align_corners=None,
warning=True):
if warning:
if size is not None and align_corners:
input_h, input_w = tuple(int(x) for x in input.shape[2:])
output_h, output_w = tuple(int(x) for x in size)
if output_h > input_h or output_w > input_w:
if ((output_h > 1 and output_w > 1 and input_h > 1
and input_w > 1) and (output_h - 1) % (input_h - 1)
and (output_w - 1) % (input_w - 1)):
warnings.warn(
f'When align_corners={align_corners}, '
'the output would be more aligned if '
f'input size {(input_h, input_w)} is `x+1` and '
f'out size {(output_h, output_w)} is `nx+1`')
return F.interpolate(input, size, scale_factor, mode, align_corners)


def add_prefix(inputs, prefix):
"""Add prefix for dict.

Args:
inputs (dict): The input dict with str keys.
prefix (str): The prefix to add.

Returns:

dict: The dict with keys updated with ``prefix``.
"""

outputs = dict()
for name, value in inputs.items():
outputs[f'{prefix}.{name}'] = value

return outputs
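
A minimal sketch of the two helpers, assuming torch and this module are importable; the shapes are made up:

import torch

from modelscope.models.cv.image_semantic_segmentation.vit_adapter.utils.seg_func import add_prefix, seg_resize

logits = torch.randn(1, 19, 128, 128)   # made-up decoder output
up = seg_resize(logits, size=(512, 512), mode='bilinear', align_corners=False)
print(up.shape)                          # torch.Size([1, 19, 512, 512])

print(add_prefix({'loss_ce': 0.7, 'acc_seg': 0.9}, 'decode'))
# {'decode.loss_ce': 0.7, 'decode.acc_seg': 0.9}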

+ 25
- 0
modelscope/models/cv/movie_scene_segmentation/__init__.py View File

@@ -0,0 +1,25 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:

from .model import MovieSceneSegmentationModel
from .datasets import MovieSceneSegmentationDataset

else:
_import_structure = {
'model': ['MovieSceneSegmentationModel'],
'datasets': ['MovieSceneSegmentationDataset'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 45
- 0
modelscope/models/cv/movie_scene_segmentation/get_model.py View File

@@ -0,0 +1,45 @@
# ------------------------------------------------------------------------------------
# BaSSL
# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# Github: https://github.com/kakaobrain/bassl
# ------------------------------------------------------------------------------------

from .utils.shot_encoder import resnet50
from .utils.trn import TransformerCRN


def get_shot_encoder(cfg):
name = cfg['model']['shot_encoder']['name']
shot_encoder_args = cfg['model']['shot_encoder'][name]
if name == 'resnet':
depth = shot_encoder_args['depth']
if depth == 50:
shot_encoder = resnet50(**shot_encoder_args['params'], )
else:
raise NotImplementedError
else:
raise NotImplementedError

return shot_encoder


def get_contextual_relation_network(cfg):
crn = None

if cfg['model']['contextual_relation_network']['enabled']:
name = cfg['model']['contextual_relation_network']['name']
crn_args = cfg['model']['contextual_relation_network']['params'][name]
if name == 'trn':
sampling_name = cfg['model']['loss']['sampling_method']['name']
crn_args['neighbor_size'] = (
2 * cfg['model']['loss']['sampling_method']['params']
[sampling_name]['neighbor_size'])
crn = TransformerCRN(crn_args)
else:
raise NotImplementedError

return crn


__all__ = ['get_shot_encoder', 'get_contextual_relation_network']
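
A minimal sketch of the config layout these builders expect, assuming the movie-scene-segmentation dependencies (cv2, transformers) are installed; the values are made up and in practice come from the model's configuration.json:

from modelscope.models.cv.movie_scene_segmentation.get_model import get_contextual_relation_network, get_shot_encoder

cfg = {
    'model': {
        'shot_encoder': {
            'name': 'resnet',
            'resnet': {'depth': 50, 'params': {}},
        },
        'contextual_relation_network': {'enabled': False},
    }
}

shot_encoder = get_shot_encoder(cfg)        # torchvision-style ResNet-50 backbone
crn = get_contextual_relation_network(cfg)  # None here because the CRN is disabled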

+ 192
- 0
modelscope/models/cv/movie_scene_segmentation/model.py View File

@@ -0,0 +1,192 @@
import os
import os.path as osp
from typing import Any, Dict

import einops
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as TF
from PIL import Image
from shotdetect_scenedetect_lgss import shot_detect

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .get_model import get_contextual_relation_network, get_shot_encoder
from .utils.save_op import get_pred_boundary, pred2scene, scene2video

logger = get_logger()


@MODELS.register_module(
Tasks.movie_scene_segmentation, module_name=Models.resnet50_bert)
class MovieSceneSegmentationModel(TorchModel):

def __init__(self, model_dir: str, *args, **kwargs):
"""str -- model file root."""
super().__init__(model_dir, *args, **kwargs)

model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
params = torch.load(model_path, map_location='cpu')

config_path = osp.join(model_dir, ModelFile.CONFIGURATION)
self.cfg = Config.from_file(config_path)

def load_param_with_prefix(prefix, model, src_params):
own_state = model.state_dict()
for name, param in own_state.items():
src_name = prefix + '.' + name
own_state[name] = src_params[src_name]

model.load_state_dict(own_state)

self.shot_encoder = get_shot_encoder(self.cfg)
load_param_with_prefix('shot_encoder', self.shot_encoder, params)
self.crn = get_contextual_relation_network(self.cfg)
load_param_with_prefix('crn', self.crn, params)

crn_name = self.cfg.model.contextual_relation_network.name
hdim = self.cfg.model.contextual_relation_network.params[crn_name][
'hidden_size']
self.head_sbd = nn.Linear(hdim, 2)
load_param_with_prefix('head_sbd', self.head_sbd, params)

self.test_transform = TF.Compose([
TF.Resize(size=256, interpolation=Image.BICUBIC),
TF.CenterCrop(224),
TF.ToTensor(),
TF.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

self.infer_result = {'vid': [], 'sid': [], 'pred': []}
sampling_method = self.cfg.dataset.sampling_method.name
self.neighbor_size = self.cfg.dataset.sampling_method.params[
sampling_method].neighbor_size

self.eps = 1e-5

def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]:
data = inputs['video']
labels = inputs['label']
outputs = self.shared_step(data)

loss = F.cross_entropy(
outputs.squeeze(), labels.squeeze(), reduction='none')
lpos = labels == 1
lneg = labels == 0

n_pos, n_neg = 1, 1  # renamed to avoid shadowing the torch.nn import
wp = (n_pos / float(n_pos + n_neg)) * lpos / (lpos.sum() + self.eps)
wn = (n_neg / float(n_pos + n_neg)) * lneg / (lneg.sum() + self.eps)
w = wp + wn
loss = (w * loss).sum()

probs = torch.argmax(outputs, dim=1)

re = dict(pred=probs, loss=loss)
return re

def inference(self, batch):
logger.info('Begin scene detect ......')
bs = self.cfg.pipeline.batch_size_per_gpu
sids = batch['sid']
inputs = batch['shot_feat']

shot_num = len(sids)
cnt = (shot_num + bs - 1) // bs  # ceil division avoids an empty trailing batch

for i in range(cnt):
start = i * bs
end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
input_ = inputs[start:end]
sid_ = sids[start:end]
input_ = torch.stack(input_)
outputs = self.shared_step(input_) # shape [b,2]
prob = F.softmax(outputs, dim=1)
self.infer_result['sid'].extend(sid_.cpu().detach().numpy())
self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy())
self.infer_result['pred'] = np.stack(self.infer_result['pred'])

assert len(self.infer_result['sid']) == len(sids)
assert len(self.infer_result['pred']) == len(inputs)
return self.infer_result

def shared_step(self, inputs):
with torch.no_grad():
# infer shot encoder
shot_repr = self.extract_shot_representation(inputs)
assert len(shot_repr.shape) == 3

# infer CRN
_, pooled = self.crn(shot_repr, mask=None)
# infer boundary score
pred = self.head_sbd(pooled)
return pred

def save_shot_feat(self, _repr):
feat = _repr.float().cpu().numpy()
pth = self.cfg.dataset.img_path + '/features'
os.makedirs(pth, exist_ok=True)

for idx in range(_repr.shape[0]):
name = f'shot_{str(idx).zfill(4)}.npy'
name = osp.join(pth, name)
np.save(name, feat[idx])

def extract_shot_representation(self,
inputs: torch.Tensor) -> torch.Tensor:
""" inputs [b s k c h w] -> output [b d] """
assert len(inputs.shape) == 6 # (B Shot Keyframe C H W)
b, s, k, c, h, w = inputs.shape
inputs = einops.rearrange(inputs, 'b s k c h w -> (b s) k c h w', s=s)
keyframe_repr = [self.shot_encoder(inputs[:, _k]) for _k in range(k)]
# [k (b s) d] -> [(b s) d]
shot_repr = torch.stack(keyframe_repr).mean(dim=0)

shot_repr = einops.rearrange(shot_repr, '(b s) d -> b s d', s=s)
return shot_repr

def postprocess(self, inputs: Dict[str, Any], **kwargs):
logger.info('Generate scene .......')

pred_dict = inputs['feat']
thres = self.cfg.pipeline.save_threshold

anno_dict = get_pred_boundary(pred_dict, thres)
scene_dict, scene_list = pred2scene(self.shot2keyf, anno_dict)
if self.cfg.pipeline.save_split_scene:
re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
print(f'Split scene video saved to {re_dir}')
return len(scene_list), scene_dict

def preprocess(self, inputs):
logger.info('Begin shot detect......')
shot_keyf_lst, anno, shot2keyf = shot_detect(
inputs, **self.cfg.preprocessor.shot_detect)
logger.info('Shot detect done!')

single_shot_feat, sid = [], []
for idx, one_shot in enumerate(shot_keyf_lst):
one_shot = [
self.test_transform(one_frame) for one_frame in one_shot
]
one_shot = torch.stack(one_shot, dim=0)
single_shot_feat.append(one_shot)
sid.append(idx)
single_shot_feat = torch.stack(single_shot_feat, dim=0)
shot_feat = []
for idx, one_shot in enumerate(anno):
shot_idx = int(one_shot['shot_id']) + np.arange(
-self.neighbor_size, self.neighbor_size + 1)
shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'])
_one_shot = single_shot_feat[shot_idx]
shot_feat.append(_one_shot)
self.shot2keyf = shot2keyf
self.anno = anno
return shot_feat, sid
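
The class-balanced weighting in `forward` can be illustrated with a small standalone sketch (made-up labels; boundary and non-boundary shots each contribute half of the total weight):

import torch
import torch.nn.functional as F

labels = torch.tensor([1, 0, 0, 0])   # one boundary shot, three non-boundary shots
logits = torch.randn(4, 2)
eps = 1e-5

loss = F.cross_entropy(logits, labels, reduction='none')
lpos, lneg = labels == 1, labels == 0
wp = 0.5 * lpos / (lpos.sum() + eps)  # the single positive carries ~0.5 of the weight
wn = 0.5 * lneg / (lneg.sum() + eps)  # the three negatives share the other ~0.5
weighted_loss = ((wp + wn) * loss).sum()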

+ 3
- 0
modelscope/models/cv/movie_scene_segmentation/utils/__init__.py View File

@@ -0,0 +1,3 @@
from .save_op import get_pred_boundary, pred2scene, scene2video
from .shot_encoder import resnet50
from .trn import TransformerCRN

+ 29
- 0
modelscope/models/cv/movie_scene_segmentation/utils/head.py View File

@@ -0,0 +1,29 @@
# ------------------------------------------------------------------------------------
# BaSSL
# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# Github: https://github.com/kakaobrain/bassl
# ------------------------------------------------------------------------------------

import torch.nn as nn
import torch.nn.functional as F


class MlpHead(nn.Module):

def __init__(self, input_dim=2048, hidden_dim=2048, output_dim=128):
super().__init__()
self.output_dim = output_dim
self.input_dim = input_dim
self.hidden_dim = hidden_dim

self.model = nn.Sequential(
nn.Linear(self.input_dim, self.hidden_dim, bias=True),
nn.ReLU(),
nn.Linear(self.hidden_dim, self.output_dim, bias=True),
)

def forward(self, x):
# x shape: [b t d] where t means the number of views
x = self.model(x)
return F.normalize(x, dim=-1)
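
A minimal sketch of the projection head on random per-view features, assuming the movie-scene-segmentation dependencies are installed; the shapes are made up:

import torch

from modelscope.models.cv.movie_scene_segmentation.utils.head import MlpHead

head = MlpHead(input_dim=2048, hidden_dim=2048, output_dim=128)
x = torch.randn(4, 3, 2048)   # [batch, views, feature dim]
z = head(x)
print(z.shape)                # torch.Size([4, 3, 128])
print(z.norm(dim=-1))         # ~1 everywhere: the outputs are L2-normalized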

+ 118
- 0
modelscope/models/cv/movie_scene_segmentation/utils/save_op.py View File

@@ -0,0 +1,118 @@
# ----------------------------------------------------------------------------------
# The codes below partially refer to the SceneSeg LGSS.
# Github: https://github.com/AnyiRao/SceneSeg
# ----------------------------------------------------------------------------------
import os
import os.path as osp
import subprocess

import cv2
import numpy as np
from tqdm import tqdm


def get_pred_boundary(pred_dict, threshold=0.5):
pred = pred_dict['pred']
tmp = (pred > threshold).astype(np.int32)
anno_dict = {}
for idx in range(len(tmp)):
anno_dict.update({str(pred_dict['sid'][idx]).zfill(4): int(tmp[idx])})
return anno_dict


def pred2scene(shot2keyf, anno_dict):
scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict)

scene_dict = {}
assert len(scene_list) == len(pair_list)
for scene_ind, scene_item in enumerate(scene_list):
scene_dict.update(
{scene_ind: {
'shot': pair_list[scene_ind],
'frame': scene_item
}})

return scene_dict, scene_list


def scene2video(source_movie_fn, scene_list, thres):

vcap = cv2.VideoCapture(source_movie_fn)
fps = vcap.get(cv2.CAP_PROP_FPS) # video.fps
out_video_dir_fn = os.path.join(os.getcwd(),
f'pred_result/scene_video_{thres}')
os.makedirs(out_video_dir_fn, exist_ok=True)

for scene_ind, scene_item in tqdm(enumerate(scene_list)):
scene = str(scene_ind).zfill(4)
start_frame = int(scene_item[0])
end_frame = int(scene_item[1])
start_time, end_time = start_frame / fps, end_frame / fps
duration_time = end_time - start_time
out_video_fn = os.path.join(out_video_dir_fn,
'scene_{}.mp4'.format(scene))
if os.path.exists(out_video_fn):
continue
call_list = ['ffmpeg']
call_list += ['-v', 'quiet']
call_list += [
'-y', '-ss',
str(start_time), '-t',
str(duration_time), '-i', source_movie_fn
]
call_list += ['-map_chapters', '-1']
call_list += [out_video_fn]
subprocess.call(call_list)
return osp.join(os.getcwd(), 'pred_result')


def get_demo_scene_list(shot2keyf, anno_dict):
pair_list = get_pair_list(anno_dict)

scene_list = []
for pair in pair_list:
start_shot, end_shot = int(pair[0]), int(pair[-1])
start_frame = shot2keyf[start_shot].split(' ')[0]
end_frame = shot2keyf[end_shot].split(' ')[1]
scene_list.append((start_frame, end_frame))
return scene_list, pair_list


def get_pair_list(anno_dict):
sort_anno_dict_key = sorted(anno_dict.keys())
tmp = 0
tmp_list = []
tmp_label_list = []
anno_list = []
anno_label_list = []
for key in sort_anno_dict_key:
value = anno_dict.get(key)
tmp += value
tmp_list.append(key)
tmp_label_list.append(value)
if tmp == 1:
anno_list.append(tmp_list)
anno_label_list.append(tmp_label_list)
tmp = 0
tmp_list = []
tmp_label_list = []
continue
if key == sort_anno_dict_key[-1]:
if len(tmp_list) > 0:
anno_list.append(tmp_list)
anno_label_list.append(tmp_label_list)
if len(anno_list) == 0:
return None
while [] in anno_list:
anno_list.remove([])
tmp_anno_list = [anno_list[0]]
pair_list = []
for ind in range(len(anno_list) - 1):
cont_count = int(anno_list[ind + 1][0]) - int(anno_list[ind][-1])
if cont_count > 1:
pair_list.extend(tmp_anno_list)
tmp_anno_list = [anno_list[ind + 1]]
continue
tmp_anno_list.append(anno_list[ind + 1])
pair_list.extend(tmp_anno_list)
return pair_list
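
A minimal sketch of how per-shot boundary probabilities turn into scene groupings, using made-up predictions and assuming the module's dependencies (cv2, tqdm) are installed; `get_pair_list` closes a scene at every predicted boundary:

import numpy as np

from modelscope.models.cv.movie_scene_segmentation.utils.save_op import get_pair_list, get_pred_boundary

pred_dict = {'sid': [0, 1, 2, 3, 4], 'pred': np.array([0.1, 0.9, 0.2, 0.8, 0.3])}
anno_dict = get_pred_boundary(pred_dict, threshold=0.5)
print(anno_dict)  # {'0000': 0, '0001': 1, '0002': 0, '0003': 1, '0004': 0}

print(get_pair_list(anno_dict))
# [['0000', '0001'], ['0002', '0003'], ['0004']] -- three scenes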

+ 331
- 0
modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py View File

@@ -0,0 +1,331 @@
"""
Modified from original implementation in torchvision
"""

from typing import Any, Callable, List, Optional, Type, Union

import torch
import torch.nn as nn
from torch import Tensor


def conv3x3(in_planes: int,
out_planes: int,
stride: int = 1,
groups: int = 1,
dilation: int = 1) -> nn.Conv2d:
"""3x3 convolution with padding"""
return nn.Conv2d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=dilation,
groups=groups,
bias=False,
dilation=dilation,
)


def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
"""1x1 convolution"""
return nn.Conv2d(
in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock(nn.Module):
expansion: int = 1

def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None,
) -> None:
super(BasicBlock, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError(
'BasicBlock only supports groups=1 and base_width=64')
if dilation > 1:
raise NotImplementedError(
'Dilation > 1 not supported in BasicBlock')
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride

def forward(self, x: Tensor) -> Tensor:
identity = x

out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)

out = self.conv2(out)
out = self.bn2(out)

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu(out)

return out


class Bottleneck(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
# This variant is also known as ResNet V1.5 and improves accuracy according to
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

expansion: int = 4

def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None,
) -> None:
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.0)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride

def forward(self, x: Tensor) -> Tensor:
identity = x

out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)

out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)

out = self.conv3(out)
out = self.bn3(out)

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu(out)

return out


class ResNet(nn.Module):

def __init__(
self,
block: Type[Union[BasicBlock, Bottleneck]],
layers: List[int],
in_channel_dim: int = 3,
zero_init_residual: bool = False,
use_last_block_grid: bool = False,
groups: int = 1,
width_per_group: int = 64,
replace_stride_with_dilation: Optional[List[bool]] = None,
norm_layer: Optional[Callable[..., nn.Module]] = None,
) -> None:
super(ResNet, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer

self.use_last_block_grid = use_last_block_grid
self.inplanes = 64
self.dilation = 1
if replace_stride_with_dilation is None:
# each element in the tuple indicates if we should replace
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError('replace_stride_with_dilation should be None '
'or a 3-element tuple, got {}'.format(
replace_stride_with_dilation))
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(
in_channel_dim,
self.inplanes,
kernel_size=7,
stride=2,
padding=3,
bias=False,
)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(
block,
128,
layers[1],
stride=2,
dilate=replace_stride_with_dilation[0])
self.layer3 = self._make_layer(
block,
256,
layers[2],
stride=2,
dilate=replace_stride_with_dilation[1])
self.layer4 = self._make_layer(
block,
512,
layers[3],
stride=2,
dilate=replace_stride_with_dilation[2])
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)

# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if zero_init_residual:
for m in self.modules():
if isinstance(m, Bottleneck):
nn.init.constant_(m.bn3.weight,
0) # type: ignore[arg-type]
elif isinstance(m, BasicBlock):
nn.init.constant_(m.bn2.weight,
0) # type: ignore[arg-type]

def _make_layer(
self,
block: Type[Union[BasicBlock, Bottleneck]],
planes: int,
blocks: int,
stride: int = 1,
dilate: bool = False,
) -> nn.Sequential:
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
norm_layer(planes * block.expansion),
)

layers = []
layers.append(
block(
self.inplanes,
planes,
stride,
downsample,
self.groups,
self.base_width,
previous_dilation,
norm_layer,
))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(
block(
self.inplanes,
planes,
groups=self.groups,
base_width=self.base_width,
dilation=self.dilation,
norm_layer=norm_layer,
))

return nn.Sequential(*layers)

def _forward_impl(self, x: Tensor, grid: bool, level: List, both: bool,
grid_only: bool) -> Tensor:
# See note [TorchScript super()]
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)

x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)

if grid:
x_grid = []

if 3 in level:
x_grid.append(x.detach().clone())
if not both and len(level) == 1:
return x_grid

x = self.layer4(x)

if 4 in level:
x_grid.append(x.detach().clone())
if not both and len(level) == 1:
return x_grid

x = self.avgpool(x)
x = torch.flatten(x, 1)

if not grid or len(level) == 0:
return x

if grid_only:
return x_grid

if both:
return x, x_grid

return x

def forward(
self,
x: Tensor,
grid: bool = False,
level: List = [],
both: bool = False,
grid_only: bool = False,
) -> Tensor:
return self._forward_impl(x, grid, level, both, grid_only)


def resnet50(**kwargs: Any) -> ResNet:
r"""ResNet-50 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
"""
return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
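
A minimal sketch of the shot encoder on random keyframes, assuming the module's dependencies are installed; with grid=True and level=[4] it returns the layer4 feature map instead of the pooled vector:

import torch

from modelscope.models.cv.movie_scene_segmentation.utils.shot_encoder import resnet50

encoder = resnet50()
frames = torch.randn(2, 3, 224, 224)   # made-up keyframe batch

feat = encoder(frames)
print(feat.shape)                      # torch.Size([2, 2048]): pooled shot feature

grid = encoder(frames, grid=True, level=[4], grid_only=True)
print(grid[0].shape)                   # torch.Size([2, 2048, 7, 7]): layer4 feature map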

+ 132
- 0
modelscope/models/cv/movie_scene_segmentation/utils/trn.py View File

@@ -0,0 +1,132 @@
# ------------------------------------------------------------------------------------
# BaSSL
# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# Github: https://github.com/kakaobrain/bassl
# ------------------------------------------------------------------------------------

import torch
import torch.nn as nn
from transformers.models.bert.modeling_bert import BertEncoder


class ShotEmbedding(nn.Module):

def __init__(self, cfg):
super().__init__()

nn_size = cfg.neighbor_size + 2 # +1 for center shot, +1 for cls
self.shot_embedding = nn.Linear(cfg.input_dim, cfg.hidden_size)
self.position_embedding = nn.Embedding(nn_size, cfg.hidden_size)
self.mask_embedding = nn.Embedding(2, cfg.input_dim, padding_idx=0)

# tf naming convention for layer norm
self.LayerNorm = nn.LayerNorm(cfg.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(cfg.hidden_dropout_prob)

self.register_buffer('pos_ids',
torch.arange(nn_size, dtype=torch.long))

def forward(
self,
shot_emb: torch.Tensor,
mask: torch.Tensor = None,
pos_ids: torch.Tensor = None,
) -> torch.Tensor:

assert len(shot_emb.size()) == 3

if pos_ids is None:
pos_ids = self.pos_ids

# this for mask embedding (un-masked ones remain unchanged)
if mask is not None:
self.mask_embedding.weight.data[0, :].fill_(0)
mask_emb = self.mask_embedding(mask.long())
shot_emb = (shot_emb * (1 - mask).float()[:, :, None]) + mask_emb

# we set [CLS] token to averaged feature
cls_emb = shot_emb.mean(dim=1)

# embedding shots
shot_emb = torch.cat([cls_emb[:, None, :], shot_emb], dim=1)
shot_emb = self.shot_embedding(shot_emb)
pos_emb = self.position_embedding(pos_ids)
embeddings = shot_emb + pos_emb[None, :]
embeddings = self.dropout(self.LayerNorm(embeddings))
return embeddings


class TransformerCRN(nn.Module):

def __init__(self, cfg):
super().__init__()

self.pooling_method = cfg.pooling_method
self.shot_embedding = ShotEmbedding(cfg)
self.encoder = BertEncoder(cfg)

nn_size = cfg.neighbor_size + 2 # +1 for center shot, +1 for cls
self.register_buffer(
'attention_mask',
self._get_extended_attention_mask(
torch.ones((1, nn_size)).float()),
)

def forward(
self,
shot: torch.Tensor,
mask: torch.Tensor = None,
pos_ids: torch.Tensor = None,
pooling_method: str = None,
):
if self.attention_mask.shape[1] != (shot.shape[1] + 1):
n_shot = shot.shape[1] + 1 # +1 for CLS token
attention_mask = self._get_extended_attention_mask(
torch.ones((1, n_shot), dtype=torch.float, device=shot.device))
else:
attention_mask = self.attention_mask

shot_emb = self.shot_embedding(shot, mask=mask, pos_ids=pos_ids)
encoded_emb = self.encoder(
shot_emb, attention_mask=attention_mask).last_hidden_state

return encoded_emb, self.pooler(
encoded_emb, pooling_method=pooling_method)

def pooler(self, sequence_output, pooling_method=None):
if pooling_method is None:
pooling_method = self.pooling_method

if pooling_method == 'cls':
return sequence_output[:, 0, :]
elif pooling_method == 'avg':
return sequence_output[:, 1:].mean(dim=1)
elif pooling_method == 'max':
return sequence_output[:, 1:].max(dim=1)[0]
elif pooling_method == 'center':
cidx = sequence_output.shape[1] // 2
return sequence_output[:, cidx, :]
else:
raise ValueError

def _get_extended_attention_mask(self, attention_mask):

# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif attention_mask.dim() == 2:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError(
f'Wrong shape for attention_mask (shape {attention_mask.shape})'
)

# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
return extended_attention_mask
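
A minimal sketch of running the CRN on random shot features, assuming a recent transformers release; the config values are made up (a real run reads them from configuration.json), and with neighbor_size=8 the sequence holds 9 shots plus the prepended CLS-style token:

import torch
from transformers import BertConfig

from modelscope.models.cv.movie_scene_segmentation.utils.trn import TransformerCRN

cfg = BertConfig(
    hidden_size=256, num_hidden_layers=2, num_attention_heads=4,
    intermediate_size=512, input_dim=2048, neighbor_size=8,
    pooling_method='center')

crn = TransformerCRN(cfg)
shots = torch.randn(2, cfg.neighbor_size + 1, cfg.input_dim)  # [batch, shots, feature dim]
encoded, pooled = crn(shots)
print(encoded.shape)  # torch.Size([2, 10, 256]): 9 shots + 1 aggregated token
print(pooled.shape)   # torch.Size([2, 256]): the center-shot representation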

+ 2
- 0
modelscope/models/cv/object_detection/__init__.py View File

@@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .mmdet_model import DetectionModel
from .yolox_pai import YOLOX

else:
_import_structure = {
'mmdet_model': ['DetectionModel'],
'yolox_pai': ['YOLOX']
}

import sys


+ 16
- 0
modelscope/models/cv/object_detection/yolox_pai.py View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.detection.detectors import YOLOX as _YOLOX

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks


@MODELS.register_module(
group_key=Tasks.image_object_detection, module_name=Models.yolox)
class YOLOX(EasyCVBaseModel, _YOLOX):

def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
_YOLOX.__init__(self, *args, **kwargs)

+ 2
- 3
modelscope/models/cv/product_retrieval_embedding/item_model.py View File

@@ -13,8 +13,8 @@ from modelscope.models.cv.product_retrieval_embedding.item_embedding import (
preprocess, resnet50_embed)
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.device import create_device
 from modelscope.utils.logger import get_logger
-from modelscope.utils.torch_utils import create_device

logger = get_logger()

@@ -48,9 +48,8 @@ class ProductRetrievalEmbedding(TorchModel):
filter_param(src_params, own_state)
model.load_state_dict(own_state)

-cpu_flag = device == 'cpu'
 self.device = create_device(
-cpu_flag)  # device.type == "cpu" or device.type == "cuda"
+device)  # device.type == "cpu" or device.type == "cuda"
self.use_gpu = self.device.type == 'cuda'

# config the model path


+ 21
- 0
modelscope/models/cv/realtime_object_detection/__init__.py View File

@@ -0,0 +1,21 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .realtime_detector import RealtimeDetector
else:
_import_structure = {
'realtime_detector': ['RealtimeDetector'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 85
- 0
modelscope/models/cv/realtime_object_detection/realtime_detector.py View File

@@ -0,0 +1,85 @@
import argparse
import logging as logger
import os
import os.path as osp
import time

import cv2
import json
import torch

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.preprocessors import LoadImage
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from .yolox.data.data_augment import ValTransform
from .yolox.exp import get_exp_by_name
from .yolox.utils import postprocess


@MODELS.register_module(
group_key=Tasks.image_object_detection,
module_name=Models.realtime_object_detection)
class RealtimeDetector(TorchModel):

def __init__(self, model_dir: str, *args, **kwargs):
super().__init__(model_dir, *args, **kwargs)
self.config = Config.from_file(
os.path.join(self.model_dir, ModelFile.CONFIGURATION))

# model type
self.exp = get_exp_by_name(self.config.model_type)

# build model
self.model = self.exp.get_model()
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
ckpt = torch.load(model_path, map_location='cpu')

# load the model state dict
self.model.load_state_dict(ckpt['model'])
self.model.eval()

# params setting
self.exp.num_classes = self.config.num_classes
self.confthre = self.config.conf_thr
self.num_classes = self.exp.num_classes
self.nmsthre = self.exp.nmsthre
self.test_size = self.exp.test_size
self.preproc = ValTransform(legacy=False)

def inference(self, img):
with torch.no_grad():
outputs = self.model(img)
return outputs

def forward(self, inputs):
return self.inference(inputs)

def preprocess(self, img):
img = LoadImage.convert_to_ndarray(img)
height, width = img.shape[:2]
self.ratio = min(self.test_size[0] / img.shape[0],
self.test_size[1] / img.shape[1])

img, _ = self.preproc(img, None, self.test_size)
img = torch.from_numpy(img).unsqueeze(0)
img = img.float()

return img

def postprocess(self, input):
outputs = postprocess(
input,
self.num_classes,
self.confthre,
self.nmsthre,
class_agnostic=True)

if len(outputs) == 1:
bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio
scores = outputs[0][:, 5].cpu().numpy()
labels = outputs[0][:, 6].cpu().int().numpy()

return bboxes, scores, labels
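
A standalone sketch of the resize bookkeeping: preprocess letterboxes the frame to test_size and stores the scale ratio, and postprocess divides the predicted boxes by that ratio to map them back to the original image (all sizes below are made up):

test_size = (640, 640)        # network input size
orig_h, orig_w = 480, 854     # source frame size

ratio = min(test_size[0] / orig_h, test_size[1] / orig_w)  # ~0.749
box_in_network_coords = [100.0, 150.0, 300.0, 400.0]
box_in_image_coords = [v / ratio for v in box_in_network_coords]
print([round(v, 1) for v in box_in_image_coords])  # ~[133.4, 200.2, 400.3, 533.8]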

+ 0
- 0
modelscope/models/cv/realtime_object_detection/yolox/__init__.py View File


+ 0
- 0
modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py View File


+ 69
- 0
modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py View File

@@ -0,0 +1,69 @@
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
"""
Data augmentation functionality. Passed as callable transformations to
Dataset classes.

The data augmentation procedures were interpreted from @weiliu89's SSD paper
http://arxiv.org/abs/1512.02325
"""

import math
import random

import cv2
import numpy as np

from ..utils import xyxy2cxcywh


def preproc(img, input_size, swap=(2, 0, 1)):
if len(img.shape) == 3:
padded_img = np.ones(
(input_size[0], input_size[1], 3), dtype=np.uint8) * 114
else:
padded_img = np.ones(input_size, dtype=np.uint8) * 114

r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
resized_img = cv2.resize(
img,
(int(img.shape[1] * r), int(img.shape[0] * r)),
interpolation=cv2.INTER_LINEAR,
).astype(np.uint8)
padded_img[:int(img.shape[0] * r), :int(img.shape[1] * r)] = resized_img

padded_img = padded_img.transpose(swap)
padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
return padded_img, r


class ValTransform:
"""
Defines the transformations that should be applied to a test image
(an OpenCV ndarray) for input into the network

dimension -> tensorize -> color adj

Arguments:
swap ((int,int,int)): final order of channels. Default: (2, 0, 1)
legacy (bool): if True, additionally apply the legacy ImageNet
mean/std normalization after scaling to [0, 1]. Default: False

Returns:
transform (transform) : callable transform to be applied to test/val
data
"""

def __init__(self, swap=(2, 0, 1), legacy=False):
self.swap = swap
self.legacy = legacy

# assume input is cv2 img for now
def __call__(self, img, res, input_size):
img, _ = preproc(img, input_size, self.swap)
if self.legacy:
img = img[::-1, :, :].copy()
img /= 255.0
img -= np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
img /= np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
return img, np.zeros((1, 5))
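
A minimal sketch of the preprocessing on a random frame, assuming the sibling yolox.utils module from this commit is importable; the frame size is made up:

import numpy as np

from modelscope.models.cv.realtime_object_detection.yolox.data.data_augment import ValTransform, preproc

img = np.random.randint(0, 255, (480, 854, 3), dtype=np.uint8)

padded, r = preproc(img, (640, 640))
print(padded.shape, round(r, 3))   # (3, 640, 640) 0.749 -- letterboxed with value 114

tensor_ready, _ = ValTransform(legacy=False)(img, None, (640, 640))
print(tensor_ready.shape, tensor_ready.dtype)  # (3, 640, 640) float32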

+ 5
- 0
modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py View File

@@ -0,0 +1,5 @@
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX

from .base_exp import BaseExp
from .build import get_exp_by_name
from .yolox_base import Exp

+ 12
- 0
modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py View File

@@ -0,0 +1,12 @@
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX

from abc import ABCMeta, abstractmethod

from torch.nn import Module


class BaseExp(metaclass=ABCMeta):

@abstractmethod
def get_model(self) -> Module:
pass

+ 18
- 0
modelscope/models/cv/realtime_object_detection/yolox/exp/build.py View File

@@ -0,0 +1,18 @@
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX

import os
import sys


def get_exp_by_name(exp_name):
exp = exp_name.replace('-',
'_') # convert string like "yolox-s" to "yolox_s"
if exp == 'yolox_s':
from .default import YoloXSExp as YoloXExp
elif exp == 'yolox_nano':
from .default import YoloXNanoExp as YoloXExp
elif exp == 'yolox_tiny':
from .default import YoloXTinyExp as YoloXExp
else:
raise NotImplementedError(f'Unsupported exp name: {exp_name}')
return YoloXExp()
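
A minimal usage sketch, assuming the exp/default module referenced above (not shown in this truncated diff) is present in the package:

from modelscope.models.cv.realtime_object_detection.yolox.exp import get_exp_by_name

exp = get_exp_by_name('yolox-s')   # dashes are normalized to underscores
model = exp.get_model()            # an untrained YOLOX-S torch.nn.Module
print(type(exp).__name__)          # YoloXSExp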

Some files were not shown because too many files changed in this diff
