
Merge remote-tracking branch 'origin/master' into ofa/finetune

# Conflicts:
#	modelscope/preprocessors/multi_modal.py
master · 行嗔 · 3 years ago · commit c546f2a8b9
100 changed files with 6490 additions and 82 deletions
  1. +169 -0  .dev_scripts/build_image.sh
  2. +10 -1  .dev_scripts/ci_container_test.sh
  3. +5 -2  .dev_scripts/dockerci.sh
  4. +11 -0  .dockerignore
  5. +2 -0  .gitattributes
  6. +1 -1  configs/cv/configuration.json
  7. +3 -0  data/test/images/image-text-retrieval.jpg
  8. +3 -0  data/test/images/image_panoptic_segmentation.jpg
  9. +3 -0  data/test/images/image_reid_person.jpg
  10. +3 -0  data/test/images/image_segmentation.jpg
  11. +3 -0  data/test/images/image_semantic_segmentation.jpg
  12. +3 -0  data/test/regression/fill_mask_bert_zh.bin
  13. +3 -0  data/test/regression/fill_mask_sbert_en.bin
  14. +3 -0  data/test/regression/fill_mask_sbert_zh.bin
  15. +3 -0  data/test/regression/fill_mask_veco_en.bin
  16. +3 -0  data/test/regression/fill_mask_veco_zh.bin
  17. +3 -0  data/test/regression/sbert_nli.bin
  18. +3 -0  data/test/regression/sbert_sen_sim.bin
  19. +3 -0  data/test/regression/sbert_ws_en.bin
  20. +3 -0  data/test/regression/sbert_ws_zh.bin
  21. +3 -0  data/test/regression/sbert_zero_shot.bin
  22. +3 -0  data/test/videos/Walking.54138969.mp4
  23. +3 -0  data/test/videos/movie_scene_segmentation_test_video.mp4
  24. +84 -0  docker/Dockerfile.ubuntu
  25. +15 -0  docker/rcfiles/conda.tuna
  26. +13 -0  docker/rcfiles/ubuntu20.04_sources.tuna
  27. +1 -1  docs/source/quick_start.md
  28. +1 -1  modelscope/fileio/__init__.py
  29. +1 -1  modelscope/fileio/file.py
  30. +6 -5  modelscope/fileio/format/json.py
  31. +29 -5  modelscope/hub/api.py
  32. +3 -1  modelscope/hub/constants.py
  33. +2 -2  modelscope/hub/errors.py
  34. +111 -9  modelscope/hub/repository.py
  35. +13 -4  modelscope/hub/utils/utils.py
  36. +56 -0  modelscope/metainfo.py
  37. +6 -0  modelscope/metrics/__init__.py
  38. +38 -0  modelscope/metrics/audio_noise_metric.py
  39. +14 -4  modelscope/metrics/builder.py
  40. +52 -0  modelscope/metrics/movie_scene_segmentation_metric.py
  41. +78 -0  modelscope/metrics/video_summarization_metric.py
  42. +32 -21  modelscope/models/audio/ans/frcrn.py
  43. +1 -0  modelscope/models/audio/kws/farfield/model.py
  44. +40 -9  modelscope/models/base/base_model.py
  45. +15 -7  modelscope/models/cv/__init__.py
  46. +43 -2  modelscope/models/cv/action_recognition/models.py
  47. +301 -0  modelscope/models/cv/action_recognition/s3dg.py
  48. +23 -0  modelscope/models/cv/body_3d_keypoints/__init__.py
  49. +246 -0  modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
  50. +233 -0  modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
  51. +2 -2  modelscope/models/cv/crowd_counting/cc_model.py
  52. +25 -0  modelscope/models/cv/easycv_base.py
  53. +1 -1  modelscope/models/cv/image_classification/mmcls_model.py
  54. +22 -0  modelscope/models/cv/image_panoptic_segmentation/__init__.py
  55. +54 -0  modelscope/models/cv/image_panoptic_segmentation/panseg_model.py
  56. +22 -0  modelscope/models/cv/image_reid_person/__init__.py
  57. +136 -0  modelscope/models/cv/image_reid_person/pass_model.py
  58. +418 -0  modelscope/models/cv/image_reid_person/transreid_model.py
  59. +24 -0  modelscope/models/cv/image_semantic_segmentation/__init__.py
  60. +1 -0  modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py
  61. +47 -0  modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py
  62. +57 -0  modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py
  63. +16 -0  modelscope/models/cv/image_semantic_segmentation/segformer.py
  64. +76 -0  modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py
  65. +3 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py
  66. +3 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py
  67. +4 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py
  68. +523 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py
  69. +3 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py
  70. +476 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py
  71. +169 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py
  72. +3 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py
  73. +267 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py
  74. +581 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py
  75. +3 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py
  76. +314 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py
  77. +303 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py
  78. +7 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py
  79. +11 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py
  80. +60 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py
  81. +48 -0  modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py
  82. +25 -0  modelscope/models/cv/movie_scene_segmentation/__init__.py
  83. +45 -0  modelscope/models/cv/movie_scene_segmentation/get_model.py
  84. +192 -0  modelscope/models/cv/movie_scene_segmentation/model.py
  85. +3 -0  modelscope/models/cv/movie_scene_segmentation/utils/__init__.py
  86. +29 -0  modelscope/models/cv/movie_scene_segmentation/utils/head.py
  87. +118 -0  modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
  88. +331 -0  modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py
  89. +132 -0  modelscope/models/cv/movie_scene_segmentation/utils/trn.py
  90. +2 -0  modelscope/models/cv/object_detection/__init__.py
  91. +16 -0  modelscope/models/cv/object_detection/yolox_pai.py
  92. +2 -3  modelscope/models/cv/product_retrieval_embedding/item_model.py
  93. +21 -0  modelscope/models/cv/realtime_object_detection/__init__.py
  94. +85 -0  modelscope/models/cv/realtime_object_detection/realtime_detector.py
  95. +0 -0  modelscope/models/cv/realtime_object_detection/yolox/__init__.py
  96. +0 -0  modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py
  97. +69 -0  modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py
  98. +5 -0  modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py
  99. +12 -0  modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py
  100. +18 -0  modelscope/models/cv/realtime_object_detection/yolox/exp/build.py

.dev_scripts/build_image.sh  (+169, -0)

@@ -0,0 +1,169 @@
#!/bin/bash
# default values.
BASE_CPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04
BASE_GPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel
MODELSCOPE_REPO_ADDRESS=reg.docker.alibaba-inc.com/modelscope/modelscope
python_version=3.7.13
torch_version=1.11.0
cudatoolkit_version=11.3
tensorflow_version=1.15.5
modelscope_version=None
is_ci_test=False
is_dsw=False
is_cpu=False
run_ci_test=False
function usage(){
echo "usage: build.sh "
echo " --python=python_version set python version, default: $python_version"
echo " --torch=torch_version set pytorch version, default: $torch_version"
echo " --cudatoolkit=cudatoolkit_version set cudatoolkit version used for pytorch, default: $cudatoolkit_version"
echo " --tensorflow=tensorflow_version set tensorflow version, default: $tensorflow_version"
echo " --modelscope=modelscope_version set modelscope version, default: $modelscope_version"
echo " --test option for run test before push image, only push on ci test pass"
echo " --cpu option for build cpu version"
echo " --dsw option for build dsw version"
echo " --ci option for build ci version"
echo " --push option for push image to remote repo"
}
for i in "$@"; do
case $i in
--python=*)
python_version="${i#*=}"
shift
;;
--torch=*)
torch_version="${i#*=}"
shift # pytorch version
;;
--tensorflow=*)
tensorflow_version="${i#*=}"
shift # tensorflow version
;;
--cudatoolkit=*)
cudatoolkit_version="${i#*=}"
shift # cudatoolkit for pytorch
;;
--modelscope=*)
modelscope_version="${i#*=}"
shift # modelscope version
;;
--test)
run_ci_test=True
shift # will run ci test
;;
--cpu)
is_cpu=True
shift # is cpu image
;;
--ci)
is_ci_test=True
shift # is ci, will not install modelscope
;;
--dsw)
is_dsw=True
shift # is dsw, will set dsw cache location
;;
--push)
is_push=True
shift # will push image to remote repo
;;
--help)
usage
exit 0
;;
-*|--*)
echo "Unknown option $i"
usage
exit 1
;;
*)
;;
esac
done

if [ "$modelscope_version" == "None" ]; then
echo "ModelScope version must be specified!"
exit 1
fi
if [ "$is_cpu" == "True" ]; then
export BASE_IMAGE=$BASE_CPU_IMAGE
base_tag=ubuntu20.04
export USE_GPU=False
else
export BASE_IMAGE=$BASE_GPU_IMAGE
base_tag=ubuntu20.04-cuda11.3.0
export USE_GPU=True
fi
if [[ $python_version == 3.7* ]]; then
base_tag=$base_tag-py37
elif [[ $python_version == 3.8* ]]; then
base_tag=$base_tag-py38
elif [[ $python_version == 3.9* ]]; then
base_tag=$base_tag-py39
else
echo "Unsupported python version: $python_version"
exit 1
fi

target_image_tag=$base_tag-torch$torch_version-tf$tensorflow_version
if [ "$is_ci_test" == "True" ]; then
target_image_tag=$target_image_tag-$modelscope_version-ci
else
target_image_tag=$target_image_tag-$modelscope_version-test
fi
export IMAGE_TO_BUILD=$MODELSCOPE_REPO_ADDRESS:$target_image_tag
export PYTHON_VERSION=$python_version
export TORCH_VERSION=$torch_version
export CUDATOOLKIT_VERSION=$cudatoolkit_version
export TENSORFLOW_VERSION=$tensorflow_version
echo -e "Building image with:\npython$python_version\npytorch$torch_version\ntensorflow:$tensorflow_version\ncudatoolkit:$cudatoolkit_version\ncpu:$is_cpu\nis_ci:$is_ci_test\nis_dsw:$is_dsw\n"
docker_file_content=`cat docker/Dockerfile.ubuntu`
if [ "$is_ci_test" != "True" ]; then
echo "Building ModelScope lib, will install ModelScope lib to image"
docker_file_content="${docker_file_content} \nRUN pip install --no-cache-dir modelscope==$modelscope_version -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html"
fi
echo "$is_dsw"
if [ "$is_dsw" == "False" ]; then
echo "Not DSW image"
else
echo "Building dsw image, will need to set the ModelScope lib cache location."
docker_file_content="${docker_file_content} \nENV MODELSCOPE_CACHE=/mnt/workspace/.cache/modelscope"
fi
printf "$docker_file_content" > Dockerfile
docker build -t $IMAGE_TO_BUILD \
--build-arg USE_GPU \
--build-arg BASE_IMAGE \
--build-arg PYTHON_VERSION \
--build-arg TORCH_VERSION \
--build-arg CUDATOOLKIT_VERSION \
--build-arg TENSORFLOW_VERSION \
-f Dockerfile .

if [ $? -ne 0 ]; then
echo "Running docker build command failed, please check the log!"
exit -1
fi
if [ "$run_ci_test" == "True" ]; then
echo "Running ci case."
export MODELSCOPE_CACHE=/home/mulin.lyh/model_scope_cache
export MODELSCOPE_HOME_CACHE=/home/mulin.lyh/ci_case_home # for credential
export IMAGE_NAME=$MODELSCOPE_REPO_ADDRESS
export IMAGE_VERSION=$target_image_tag
export MODELSCOPE_DOMAIN=www.modelscope.cn
export HUB_DATASET_ENDPOINT=http://www.modelscope.cn
export CI_TEST=True
export TEST_LEVEL=1
if [ "$is_ci_test" != "True" ]; then
echo "Testing for dsw image or MaaS-lib image"
export CI_COMMAND="python tests/run.py"
fi
bash .dev_scripts/dockerci.sh
if [ $? -ne 0 ]; then
echo "Running unittest failed, please check the log!"
exit -1
fi
fi
if [ "$is_push" == "True" ]; then
echo "Pushing image: $IMAGE_TO_BUILD"
docker push $IMAGE_TO_BUILD
fi

.dev_scripts/ci_container_test.sh  (+10, -1)

@@ -16,5 +16,14 @@ if [ $? -ne 0 ]; then
echo "linter test failed, please run 'pre-commit run --all-files' to check"
exit -1
fi
# test with install
python setup.py install

PYTHONPATH=. python tests/run.py
if [ $# -eq 0 ]; then
ci_command="python tests/run.py --subprocess"
else
ci_command="$@"
fi
echo "Running case with command: $ci_command"
$ci_command
#python tests/run.py --isolated_cases test_text_to_speech.py test_multi_modal_embedding.py test_ofa_tasks.py test_video_summarization.py

.dev_scripts/dockerci.sh  (+5, -2)

@@ -1,5 +1,4 @@
#!/bin/bash
IMAGE_NAME=reg.docker.alibaba-inc.com/dinger/modelscope
MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache
CODE_DIR=$PWD
CODE_DIR_IN_CONTAINER=/Maas-lib
@@ -8,6 +7,8 @@ gpus='7 6 5 4 3 2 1 0'
cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58'
cpu_sets_arr=($cpu_sets)
is_get_file_lock=false
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_COMMAND}
echo "ci command: $CI_COMMAND"
for gpu in $gpus
do
exec {lock_fd}>"/tmp/gpu$gpu" || exit 1
@@ -31,10 +32,12 @@ do
-e HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT \
-e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \
-e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \
-e TEST_LEVEL=$TEST_LEVEL \
-e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \
--workdir=$CODE_DIR_IN_CONTAINER \
--net host \
${IMAGE_NAME}:${IMAGE_VERSION} \
bash .dev_scripts/ci_container_test.sh
$CI_COMMAND
if [ $? -ne 0 ]; then
echo "Running test case failed, please check the log!"
exit -1


.dockerignore  (+11, -0)

@@ -0,0 +1,11 @@
.gitignore
tests
data
.dev_scripts
.dockerignore
.git
.gitattributes
.pre-commit-config.yaml
.pre-commit-config_local.yaml
.readthedocs.yaml
Dockerfile

.gitattributes  (+2, -0)

@@ -4,4 +4,6 @@
*.wav filter=lfs diff=lfs merge=lfs -text
*.JPEG filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.avi filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text

configs/cv/configuration.json  (+1, -1)

@@ -2,7 +2,6 @@
"framework": "pytorch",

"task": "image_classification",
"work_dir": "./work_dir",

"model": {
"type": "classification",
@@ -119,6 +118,7 @@
},

"train": {
"work_dir": "./work_dir",
"dataloader": {
"batch_size_per_gpu": 2,
"workers_per_gpu": 1


data/test/images/image-text-retrieval.jpg  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b012c7e966f6550874ccb85ef9602d483aa89b8623dff9ffcdb0faab8f2ca9ab
size 218143

data/test/images/image_panoptic_segmentation.jpg  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:59b1da30af12f76b691990363e0d221050a59cf53fc4a97e776bcb00228c6c2a
size 245864

data/test/images/image_reid_person.jpg  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4c9a7e42edc7065c16972ff56267aad63f5233e36aa5a699b84939f5bad73276
size 2451

data/test/images/image_segmentation.jpg  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af6fa61274e497ecc170de5adc4b8e7ac89eba2bc22a6aa119b08ec7adbe9459
size 146140

data/test/images/image_semantic_segmentation.jpg  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:59b1da30af12f76b691990363e0d221050a59cf53fc4a97e776bcb00228c6c2a
size 245864

data/test/regression/fill_mask_bert_zh.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:541183383bb06aa3ca2c44a68cd51c1be5e3e984a1dee2c58092b9552660f3ce
size 61883

data/test/regression/fill_mask_sbert_en.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8f0afcd9d2aa5ac9569114203bd9db4f1a520c903a88fd4854370cdde0e7eab7
size 119940

data/test/regression/fill_mask_sbert_zh.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4fd6fa6b23c2fdaf876606a767d9b64b1924e1acddfc06ac42db73ba86083280
size 119940

data/test/regression/fill_mask_veco_en.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4d37672a0e299a08d2daf5c7fc29bfce96bb15701fe5e5e68f068861ac2ee705
size 119619

data/test/regression/fill_mask_veco_zh.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c692e0753cfe349e520511427727a8252f141fa10e85f9a61562845e8d731f9a
size 119619

data/test/regression/sbert_nli.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:44e3925c15d86d8596baeb6bd1d153d86f57b7489798b2cf988a1248e110fd62
size 62231

data/test/regression/sbert_sen_sim.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1ff17a0272752de4c88d4254b2e881f97f8ef022f03609d03ee1de0ae964368a
size 62235

data/test/regression/sbert_ws_en.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572
size 60801

data/test/regression/sbert_ws_zh.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c
size 60801

data/test/regression/sbert_zero_shot.bin  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9e3ecc2c30d382641d561f84849b199c12bb1a9418e8099a191153f6f5275a85
size 61589

data/test/videos/Walking.54138969.mp4  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7b8f50a0537bfe7e082c5ad91b2b7ece61a0adbeb7489988e553909276bf920c
size 44217644

data/test/videos/movie_scene_segmentation_test_video.mp4  (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:59fa397b01dc4c9b67a19ca42f149287b9c4e7b2158aba5d07d2db88af87b23f
size 126815483

docker/Dockerfile.ubuntu  (+84, -0)

@@ -0,0 +1,84 @@
ARG BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel
FROM $BASE_IMAGE
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Shanghai
ENV CONDA_DIR /opt/conda
ENV PATH="${CONDA_DIR}/bin:${PATH}"
ENV arch=x86_64
SHELL ["/bin/bash", "-c"]
COPY docker/rcfiles /tmp/resources
RUN apt-get update && apt-get install -y --reinstall ca-certificates && \
cp /tmp/resources/ubuntu20.04_sources.tuna /etc/apt/sources.list && \
apt-get update && \
apt-get install -y locales wget git vim ffmpeg libsm6 tzdata language-pack-zh-hans ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build && \
wget https://packagecloud.io/github/git-lfs/packages/debian/bullseye/git-lfs_3.2.0_amd64.deb/download -O ./git-lfs_3.2.0_amd64.deb && \
dpkg -i ./git-lfs_3.2.0_amd64.deb && \
rm -f ./git-lfs_3.2.0_amd64.deb && \
locale-gen zh_CN && \
locale-gen zh_CN.utf8 && \
update-locale LANG=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 && \
ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
dpkg-reconfigure --frontend noninteractive tzdata && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ENV LANG=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8

#install and config python
ARG PYTHON_VERSION=3.7.13
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${arch}.sh -O ./miniconda.sh && \
/bin/bash miniconda.sh -b -p /opt/conda && \
rm -f miniconda.sh && \
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
cp /tmp/resources/conda.tuna ~/.condarc && \
source /root/.bashrc && \
conda install --yes python==${PYTHON_VERSION} && \
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

ARG USE_GPU=True

# install pytorch
ARG TORCH_VERSION=1.12.0
ARG CUDATOOLKIT_VERSION=11.3
RUN if [ "$USE_GPU" = "True" ] ; then \
conda install --yes pytorch==$TORCH_VERSION torchvision torchaudio cudatoolkit=$CUDATOOLKIT_VERSION -c pytorch && conda clean --yes --all; \
else \
conda install pytorch==$TORCH_VERSION torchvision torchaudio cpuonly -c pytorch; \
fi

# install tensorflow
ARG TENSORFLOW_VERSION=1.15.5
RUN if [ "$USE_GPU" = "True" ] ; then \
pip install --no-cache-dir --use-deprecated=legacy-resolver tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \
else \
pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \
fi

RUN if [ "$USE_GPU" = "True" ] ; then \
CUDA_HOME=/usr/local/cuda TORCH_CUDA_ARCH_LIST="5.0 5.2 6.0 6.1 7.0 7.5 8.0 8.6" MMCV_WITH_OPS=1 MAX_JOBS=8 FORCE_CUDA=1 pip install --no-cache-dir mmcv-full && pip cache purge; \
else \
MMCV_WITH_OPS=1 MAX_JOBS=8 pip install --no-cache-dir mmcv-full && pip cache purge; \
fi

# install modelscope
COPY requirements /var/modelscope
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /var/modelscope/runtime.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip cache purge

# default shell bash
ENV SHELL=/bin/bash

# install special package
RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 numpy==1.18.5 datasets==2.1.0

RUN if [ "$USE_GPU" = "True" ] ; then \
pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \
else \
pip install --no-cache-dir dgl dglgo -f https://data.dgl.ai/wheels/repo.html; \
fi

docker/rcfiles/conda.tuna  (+15, -0)

@@ -0,0 +1,15 @@
channels:
- defaults
show_channel_urls: true
default_channels:
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2
custom_channels:
conda-forge: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
msys2: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
bioconda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
menpo: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
pytorch: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
pytorch-lts: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
simpleitk: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud

docker/rcfiles/ubuntu20.04_sources.tuna  (+13, -0)

@@ -0,0 +1,13 @@
# Source-code mirrors are commented out by default to speed up apt update; uncomment them if needed
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse

# Pre-release sources; enabling them is not recommended
# deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse

docs/source/quick_start.md  (+1, -1)

@@ -108,7 +108,7 @@ pip install -e ".[nlp]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releas
```shell
pip install -e ".[multi-modal]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
```
###
### Installation verification

After installation succeeds, run the following command to verify that the installation is correct:


modelscope/fileio/__init__.py  (+1, -1)

@@ -1,2 +1,2 @@
from .file import File
from .file import File, LocalStorage
from .io import dump, dumps, load

modelscope/fileio/file.py  (+1, -1)

@@ -240,7 +240,7 @@ class File(object):
@staticmethod
def _get_storage(uri):
assert isinstance(uri,
str), f'uri should be str type, buf got {type(uri)}'
str), f'uri should be str type, but got {type(uri)}'

if '://' not in uri:
# local path


modelscope/fileio/format/json.py  (+6, -5)

@@ -1,5 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import json
import numpy as np

from .base import FormatHandler
@@ -22,14 +21,16 @@ def set_default(obj):


class JsonHandler(FormatHandler):
"""Use jsonplus, serialization of Python types to JSON that "just works"."""

def load(self, file):
return json.load(file)
import jsonplus
return jsonplus.loads(file.read())

def dump(self, obj, file, **kwargs):
kwargs.setdefault('default', set_default)
json.dump(obj, file, **kwargs)
file.write(self.dumps(obj, **kwargs))

def dumps(self, obj, **kwargs):
import jsonplus
kwargs.setdefault('default', set_default)
return json.dumps(obj, **kwargs)
return jsonplus.dumps(obj, **kwargs)
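
For context, a minimal round-trip sketch of the updated handler; this assumes `jsonplus` is installed and that the import path mirrors the file shown above, so treat it as an illustration rather than documented API.

```python
# Hedged sketch: basic round-trip through the jsonplus-backed JsonHandler.
import io

from modelscope.fileio.format.json import JsonHandler

handler = JsonHandler()
payload = {'name': 'demo', 'scores': [0.1, 0.2, 0.3]}

buf = io.StringIO()
handler.dump(payload, buf)    # dump() now writes dumps()'s jsonplus output
buf.seek(0)
print(handler.load(buf))      # load() now parses with jsonplus.loads
```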

modelscope/hub/api.py  (+29, -5)

@@ -1,7 +1,6 @@
import os
import pickle
import shutil
import subprocess
from collections import defaultdict
from http import HTTPStatus
from http.cookiejar import CookieJar
@@ -16,8 +15,7 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,
API_RESPONSE_FIELD_MESSAGE,
API_RESPONSE_FIELD_USERNAME,
DEFAULT_CREDENTIALS_PATH)
from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
HUB_DATASET_ENDPOINT)
from modelscope.utils.config_ds import DOWNLOADED_DATASETS_PATH
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
DEFAULT_MODEL_REVISION,
DatasetFormations, DatasetMetaFormats,
@@ -26,7 +24,8 @@ from modelscope.utils.logger import get_logger
from .errors import (InvalidParameter, NotExistError, RequestError,
datahub_raise_on_error, handle_http_response, is_ok,
raise_on_error)
from .utils.utils import get_endpoint, model_id_to_group_owner_name
from .utils.utils import (get_dataset_hub_endpoint, get_endpoint,
model_id_to_group_owner_name)

logger = get_logger()

@@ -35,7 +34,8 @@ class HubApi:

def __init__(self, endpoint=None, dataset_endpoint=None):
self.endpoint = endpoint if endpoint is not None else get_endpoint()
self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else HUB_DATASET_ENDPOINT
self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint(
)

def login(
self,
@@ -376,6 +376,27 @@ class HubApi:
f'ststoken?Revision={revision}'
return self.datahub_remote_call(datahub_url)

def get_dataset_access_config_session(
self,
cookies: CookieJar,
dataset_name: str,
namespace: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION):

datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
f'ststoken?Revision={revision}'

cookies = requests.utils.dict_from_cookiejar(cookies)
r = requests.get(url=datahub_url, cookies=cookies)
resp = r.json()
datahub_raise_on_error(datahub_url, resp)
return resp['Data']

def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
r = requests.post(url)
r.raise_for_status()

@staticmethod
def datahub_remote_call(url):
r = requests.get(url)
@@ -383,6 +404,9 @@ class HubApi:
datahub_raise_on_error(url, resp)
return resp['Data']

def check_cookies_upload_data(self, use_cookies) -> CookieJar:
return self._check_cookie(use_cookies=use_cookies)


class ModelScopeConfig:
path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)


modelscope/hub/constants.py  (+3, -1)

@@ -1,3 +1,5 @@
from pathlib import Path

MODELSCOPE_URL_SCHEME = 'http://'
DEFAULT_MODELSCOPE_DOMAIN = 'www.modelscope.cn'
DEFAULT_MODELSCOPE_DATA_ENDPOINT = MODELSCOPE_URL_SCHEME + DEFAULT_MODELSCOPE_DOMAIN
@@ -6,7 +8,7 @@ DEFAULT_MODELSCOPE_GROUP = 'damo'
MODEL_ID_SEPARATOR = '/'
FILE_HASH = 'Sha256'
LOGGER_NAME = 'ModelScopeHub'
DEFAULT_CREDENTIALS_PATH = '~/.modelscope/credentials'
DEFAULT_CREDENTIALS_PATH = Path.home().joinpath('.modelscope', 'credentials')
API_RESPONSE_FIELD_DATA = 'Data'
API_RESPONSE_FIELD_GIT_ACCESS_TOKEN = 'AccessToken'
API_RESPONSE_FIELD_USERNAME = 'Username'


modelscope/hub/errors.py  (+2, -2)

@@ -49,8 +49,8 @@ def handle_http_response(response, logger, cookies, model_id):
except HTTPError:
if cookies is None: # code in [403] and
logger.error(
f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be private. \
Please login first.')
f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
private. Please login first.')
raise




modelscope/hub/repository.py  (+111, -9)

@@ -2,7 +2,8 @@ import os
from typing import Optional

from modelscope.hub.errors import GitError, InvalidParameter, NotLoginException
from modelscope.utils.constant import DEFAULT_MODEL_REVISION
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
DEFAULT_MODEL_REVISION)
from modelscope.utils.logger import get_logger
from .api import ModelScopeConfig
from .git import GitCommandWrapper
@@ -15,14 +16,12 @@ class Repository:
"""A local representation of the model git repository.
"""

def __init__(
self,
model_dir: str,
clone_from: str,
revision: Optional[str] = DEFAULT_MODEL_REVISION,
auth_token: Optional[str] = None,
git_path: Optional[str] = None,
):
def __init__(self,
model_dir: str,
clone_from: str,
revision: Optional[str] = DEFAULT_MODEL_REVISION,
auth_token: Optional[str] = None,
git_path: Optional[str] = None):
"""
Instantiate a Repository object by cloning the remote ModelScopeHub repo
Args:
@@ -86,6 +85,7 @@ class Repository:
branch: Optional[str] = DEFAULT_MODEL_REVISION,
force: bool = False):
"""Push local files to remote, this method will do.
git pull
git add
git commit
git push
@@ -117,3 +117,105 @@ class Repository:
url=url,
local_branch=branch,
remote_branch=branch)


class DatasetRepository:
"""A local representation of the dataset (metadata) git repository.
"""

def __init__(self,
repo_work_dir: str,
dataset_id: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION,
auth_token: Optional[str] = None,
git_path: Optional[str] = None):
"""
Instantiate a Dataset Repository object by cloning the remote ModelScope dataset repo
Args:
repo_work_dir(`str`):
The dataset repo root directory.
dataset_id:
dataset id in ModelScope from which git clone
revision(`Optional[str]`):
revision of the dataset you want to clone from. Can be any of a branch, tag or commit hash
auth_token(`Optional[str]`):
token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
as the token is already saved when you login the first time, if None, we will use saved token.
git_path:(`Optional[str]`):
The git command line path, if None, we use 'git'
"""
self.dataset_id = dataset_id
self.repo_work_dir = repo_work_dir
self.repo_base_dir = os.path.dirname(repo_work_dir)
self.repo_name = os.path.basename(repo_work_dir)
self.revision = revision
if auth_token:
self.auth_token = auth_token
else:
self.auth_token = ModelScopeConfig.get_token()

self.git_wrapper = GitCommandWrapper(git_path)
os.makedirs(self.repo_work_dir, exist_ok=True)
self.repo_url = self._get_repo_url(dataset_id=dataset_id)

def clone(self) -> str:
# check local repo dir, directory not empty.
if os.listdir(self.repo_work_dir):
remote_url = self._get_remote_url()
remote_url = self.git_wrapper.remove_token_from_url(remote_url)
# no need clone again
if remote_url and remote_url == self.repo_url:
return ''

logger.info('Cloning repo from {} '.format(self.repo_url))
self.git_wrapper.clone(self.repo_base_dir, self.auth_token,
self.repo_url, self.repo_name, self.revision)
return self.repo_work_dir

def push(self,
commit_message: str,
branch: Optional[str] = DEFAULT_DATASET_REVISION,
force: bool = False):
"""Push local files to remote, this method will do.
git pull
git add
git commit
git push
Args:
commit_message (str): commit message
branch (Optional[str], optional): which branch to push.
force (Optional[bool]): whether to use forced-push.
"""
if commit_message is None or not isinstance(commit_message, str):
msg = 'commit_message must be provided!'
raise InvalidParameter(msg)

if not isinstance(force, bool):
raise InvalidParameter('force must be bool')

if not self.auth_token:
raise NotLoginException('Must login to push, please login first.')

self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token)
self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name)

remote_url = self.git_wrapper.get_repo_remote_url(self.repo_work_dir)
self.git_wrapper.pull(self.repo_work_dir)
self.git_wrapper.add(self.repo_work_dir, all_files=True)
self.git_wrapper.commit(self.repo_work_dir, commit_message)
self.git_wrapper.push(
repo_dir=self.repo_work_dir,
token=self.auth_token,
url=remote_url,
local_branch=branch,
remote_branch=branch)

def _get_repo_url(self, dataset_id):
return f'{get_endpoint()}/datasets/{dataset_id}.git'

def _get_remote_url(self):
try:
remote = self.git_wrapper.get_repo_remote_url(self.repo_work_dir)
except GitError:
remote = None
return remote
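
A hedged usage sketch of the new `DatasetRepository`; the dataset id and work directory below are placeholders, and a prior `HubApi.login()` is assumed so a saved token is available for `push()`.

```python
# Hedged sketch: clone a dataset repo and push a metadata change.
from modelscope.hub.repository import DatasetRepository

repo = DatasetRepository(
    repo_work_dir='/tmp/example_dataset',          # placeholder repo root; created if missing
    dataset_id='some_namespace/example_dataset',   # placeholder dataset id
    revision='master')
work_dir = repo.clone()                # returns '' if the same remote is already cloned
repo.push('update dataset metadata')   # pull + add + commit + push, per the docstring
```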

modelscope/hub/utils/utils.py  (+13, -4)

@@ -1,7 +1,9 @@
import hashlib
import os
from typing import Optional

from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN,
from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
DEFAULT_MODELSCOPE_DOMAIN,
DEFAULT_MODELSCOPE_GROUP,
MODEL_ID_SEPARATOR,
MODELSCOPE_URL_SCHEME)
@@ -22,14 +24,16 @@ def model_id_to_group_owner_name(model_id):
return group_or_owner, name


def get_cache_dir():
def get_cache_dir(model_id: Optional[str] = None):
"""
cache dir precedence:
function parameter > environment > ~/.cache/modelscope/hub
"""
default_cache_dir = get_default_cache_dir()
return os.getenv('MODELSCOPE_CACHE', os.path.join(default_cache_dir,
'hub'))
base_path = os.getenv('MODELSCOPE_CACHE',
os.path.join(default_cache_dir, 'hub'))
return base_path if model_id is None else os.path.join(
base_path, model_id + '/')


def get_endpoint():
@@ -38,6 +42,11 @@ def get_endpoint():
return MODELSCOPE_URL_SCHEME + modelscope_domain


def get_dataset_hub_endpoint():
return os.environ.get('HUB_DATASET_ENDPOINT',
DEFAULT_MODELSCOPE_DATA_ENDPOINT)


def compute_hash(file_path):
BUFFER_SIZE = 1024 * 64 # 64k buffer size
sha256_hash = hashlib.sha256()
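
The hunks above change cache-dir resolution and add a dataset endpoint helper; a small sketch of the resulting behavior (the printed paths are indicative, and the model id is a placeholder):

```python
# Hedged sketch of the updated helpers in modelscope/hub/utils/utils.py.
import os

from modelscope.hub.utils.utils import get_cache_dir, get_dataset_hub_endpoint

print(get_cache_dir())                        # MODELSCOPE_CACHE or <default_cache_dir>/hub
print(get_cache_dir('damo/example-model'))    # per-model subdirectory under the base path
os.environ['HUB_DATASET_ENDPOINT'] = 'http://www.modelscope.cn'
print(get_dataset_hub_endpoint())             # the env var wins over the default endpoint
```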


modelscope/metainfo.py  (+56, -0)

@@ -11,6 +11,7 @@ class Models(object):
"""
# vision models
detection = 'detection'
realtime_object_detection = 'realtime-object-detection'
scrfd = 'scrfd'
classification_model = 'ClassificationModel'
nafnet = 'nafnet'
@@ -19,7 +20,18 @@ class Models(object):
gpen = 'gpen'
product_retrieval_embedding = 'product-retrieval-embedding'
body_2d_keypoints = 'body-2d-keypoints'
body_3d_keypoints = 'body-3d-keypoints'
crowd_counting = 'HRNetCrowdCounting'
panoptic_segmentation = 'swinL-panoptic-segmentation'
image_reid_person = 'passvitb'
video_summarization = 'pgl-video-summarization'
swinL_semantic_segmentation = 'swinL-semantic-segmentation'
vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
resnet50_bert = 'resnet50-bert'

# EasyCV models
yolox = 'YOLOX'
segformer = 'Segformer'

# nlp models
bert = 'bert'
@@ -32,8 +44,10 @@ class Models(object):
space_modeling = 'space-modeling'
star = 'star'
tcrf = 'transformer-crf'
lcrf = 'lstm-crf'
bart = 'bart'
gpt3 = 'gpt3'
bert_for_ds = 'bert-for-document-segmentation'

# audio models
sambert_hifigan = 'sambert-hifigan'
@@ -48,12 +62,14 @@ class Models(object):
gemm = 'gemm-generative-multi-modal'
mplug = 'mplug'
diffusion = 'diffusion-text-to-image-synthesis'
team = 'team-multi-modal-similarity'
video_clip = 'video-clip-multi-modal-embedding'


class TaskModels(object):
# nlp task
text_classification = 'text-classification'
information_extraction = 'information-extraction'


class Heads(object):
@@ -63,6 +79,7 @@ class Heads(object):
bert_mlm = 'bert-mlm'
# roberta mlm
roberta_mlm = 'roberta-mlm'
information_extraction = 'information-extraction'


class Pipelines(object):
@@ -84,9 +101,13 @@ class Pipelines(object):
animal_recognition = 'resnet101-animal-recognition'
general_recognition = 'resnet101-general-recognition'
cmdssl_video_embedding = 'cmdssl-r2p1d_video_embedding'
hicossl_video_embedding = 'hicossl-s3dg-video_embedding'
body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image'
body_3d_keypoints = 'canonical_body-3d-keypoints_video'
human_detection = 'resnet18-human-detection'
object_detection = 'vit-object-detection'
easycv_detection = 'easycv-detection'
easycv_segmentation = 'easycv-segmentation'
salient_detection = 'u2net-salient-detection'
image_classification = 'image-classification'
face_detection = 'resnet-face-detection-scrfd10gkps'
@@ -100,6 +121,7 @@ class Pipelines(object):
image_super_resolution = 'rrdb-image-super-resolution'
face_image_generation = 'gan-face-image-generation'
product_retrieval_embedding = 'resnet50-product-retrieval-embedding'
realtime_object_detection = 'cspnet_realtime-object-detection_yolox'
face_recognition = 'ir101-face-recognition-cfglint'
image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
image2image_translation = 'image-to-image-translation'
@@ -112,6 +134,11 @@ class Pipelines(object):
tinynas_classification = 'tinynas-classification'
crowd_counting = 'hrnet-crowd-counting'
video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking'
image_panoptic_segmentation = 'image-panoptic-segmentation'
video_summarization = 'googlenet_pgl_video_summarization'
image_semantic_segmentation = 'image-semantic-segmentation'
image_reid_person = 'passvitb-image-reid-person'
movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'

# nlp tasks
sentence_similarity = 'sentence-similarity'
@@ -129,7 +156,10 @@ class Pipelines(object):
dialog_state_tracking = 'dialog-state-tracking'
zero_shot_classification = 'zero-shot-classification'
text_error_correction = 'text-error-correction'
faq_question_answering = 'faq-question-answering'
conversational_text_to_sql = 'conversational-text-to-sql'
relation_extraction = 'relation-extraction'
document_segmentation = 'document-segmentation'

# audio tasks
sambert_hifigan_tts = 'sambert-hifigan-tts'
@@ -146,8 +176,10 @@ class Pipelines(object):
visual_question_answering = 'visual-question-answering'
visual_grounding = 'visual-grounding'
visual_entailment = 'visual-entailment'
multi_modal_similarity = 'multi-modal-similarity'
text_to_image_synthesis = 'text-to-image-synthesis'
video_multi_modal_embedding = 'video-multi-modal-embedding'
image_text_retrieval = 'image-text-retrieval'


class Trainers(object):
@@ -161,6 +193,7 @@ class Trainers(object):
"""

default = 'trainer'
easycv = 'easycv'

# multi-modal trainers
clip_multi_modal_embedding = 'clip-multi-modal-embedding'
@@ -169,12 +202,17 @@ class Trainers(object):
# cv trainers
image_instance_segmentation = 'image-instance-segmentation'
image_portrait_enhancement = 'image-portrait-enhancement'
video_summarization = 'video-summarization'
movie_scene_segmentation = 'movie-scene-segmentation'

# nlp trainers
bert_sentiment_analysis = 'bert-sentiment-analysis'
nlp_base_trainer = 'nlp-base-trainer'
nlp_veco_trainer = 'nlp-veco-trainer'

# audio trainers
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'


class Preprocessors(object):
""" Names for different preprocessor.
@@ -193,6 +231,8 @@ class Preprocessors(object):
image_color_enhance_preprocessor = 'image-color-enhance-preprocessor'
image_instance_segmentation_preprocessor = 'image-instance-segmentation-preprocessor'
image_portrait_enhancement_preprocessor = 'image-portrait-enhancement-preprocessor'
video_summarization_preprocessor = 'video-summarization-preprocessor'
movie_scene_segmentation_preprocessor = 'movie-scene-segmentation-preprocessor'

# nlp preprocessor
sen_sim_tokenizer = 'sen-sim-tokenizer'
@@ -210,7 +250,10 @@ class Preprocessors(object):
text_error_correction = 'text-error-correction'
word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor'
fill_mask = 'fill-mask'
faq_question_answering_preprocessor = 'faq-question-answering-preprocessor'
conversational_text_to_sql = 'conversational-text-to-sql'
re_tokenizer = 're-tokenizer'
document_segmentation = 'document-segmentation'

# audio preprocessor
linear_aec_fbank = 'linear-aec-fbank'
@@ -229,6 +272,7 @@ class Metrics(object):

# accuracy
accuracy = 'accuracy'
audio_noise_metric = 'audio-noise-metric'

# metrics for image denoise task
image_denoise_metric = 'image-denoise-metric'
@@ -245,6 +289,9 @@ class Metrics(object):
image_color_enhance_metric = 'image-color-enhance-metric'
# metrics for image-portrait-enhancement task
image_portrait_enhancement_metric = 'image-portrait-enhancement-metric'
video_summarization_metric = 'video-summarization-metric'
# metric for movie-scene-segmentation task
movie_scene_segmentation_metric = 'movie-scene-segmentation-metric'


class Optimizers(object):
@@ -294,3 +341,12 @@ class LR_Schedulers(object):
LinearWarmup = 'LinearWarmup'
ConstantWarmup = 'ConstantWarmup'
ExponentialWarmup = 'ExponentialWarmup'


class Datasets(object):
""" Names for different datasets.
"""
ClsDataset = 'ClsDataset'
SegDataset = 'SegDataset'
DetDataset = 'DetDataset'
DetImagesMixDataset = 'DetImagesMixDataset'
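
These registry names are plain string constants; a short sketch of how the new entries are referenced (the printed values echo the assignments above):

```python
# Hedged sketch: the new metainfo constants are just strings used as registry keys.
from modelscope.metainfo import Metrics, Pipelines, Trainers

print(Pipelines.movie_scene_segmentation)     # 'resnet50-bert-movie-scene-segmentation'
print(Metrics.video_summarization_metric)     # 'video-summarization-metric'
print(Trainers.speech_frcrn_ans_cirm_16k)     # 'speech_frcrn_ans_cirm_16k'
```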

modelscope/metrics/__init__.py  (+6, -0)

@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .audio_noise_metric import AudioNoiseMetric
from .base import Metric
from .builder import METRICS, build_metric, task_default_metrics
from .image_color_enhance_metric import ImageColorEnhanceMetric
@@ -14,9 +15,12 @@ if TYPE_CHECKING:
from .sequence_classification_metric import SequenceClassificationMetric
from .text_generation_metric import TextGenerationMetric
from .token_classification_metric import TokenClassificationMetric
from .video_summarization_metric import VideoSummarizationMetric
from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric

else:
_import_structure = {
'audio_noise_metric': ['AudioNoiseMetric'],
'base': ['Metric'],
'builder': ['METRICS', 'build_metric', 'task_default_metrics'],
'image_color_enhance_metric': ['ImageColorEnhanceMetric'],
@@ -28,6 +32,8 @@ else:
'sequence_classification_metric': ['SequenceClassificationMetric'],
'text_generation_metric': ['TextGenerationMetric'],
'token_classification_metric': ['TokenClassificationMetric'],
'video_summarization_metric': ['VideoSummarizationMetric'],
'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'],
}

import sys


modelscope/metrics/audio_noise_metric.py  (+38, -0)

@@ -0,0 +1,38 @@
from typing import Dict

from modelscope.metainfo import Metrics
from modelscope.metrics.base import Metric
from modelscope.metrics.builder import METRICS, MetricKeys
from modelscope.utils.registry import default_group


@METRICS.register_module(
group_key=default_group, module_name=Metrics.audio_noise_metric)
class AudioNoiseMetric(Metric):
"""
The metric computation class for acoustic noise suppression task.
"""

def __init__(self):
self.loss = []
self.amp_loss = []
self.phase_loss = []
self.sisnr = []

def add(self, outputs: Dict, inputs: Dict):
self.loss.append(outputs['loss'].data.cpu())
self.amp_loss.append(outputs['amp_loss'].data.cpu())
self.phase_loss.append(outputs['phase_loss'].data.cpu())
self.sisnr.append(outputs['sisnr'].data.cpu())

def evaluate(self):
avg_loss = sum(self.loss) / len(self.loss)
avg_sisnr = sum(self.sisnr) / len(self.sisnr)
avg_amp = sum(self.amp_loss) / len(self.amp_loss)
avg_phase = sum(self.phase_loss) / len(self.phase_loss)
total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr
return {
'total_loss': total_loss.item(),
'avg_sisnr': avg_sisnr.item(),
MetricKeys.AVERAGE_LOSS: avg_loss.item()
}
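
A hedged sketch of the add/evaluate cycle for the metric above; the tensor values are illustrative only.

```python
# Hedged sketch: feed one batch of illustrative loss tensors and evaluate.
import torch

from modelscope.metrics.audio_noise_metric import AudioNoiseMetric

metric = AudioNoiseMetric()
outputs = {'loss': torch.tensor(0.8), 'amp_loss': torch.tensor(0.3),
           'phase_loss': torch.tensor(0.2), 'sisnr': torch.tensor(-9.5)}
metric.add(outputs, inputs={})
print(metric.evaluate())   # total_loss, avg_sisnr and avg_loss averaged over all batches
```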

modelscope/metrics/builder.py  (+14, -4)

@@ -1,4 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Dict, Mapping, Union

from modelscope.metainfo import Metrics
from modelscope.utils.config import ConfigDict
@@ -15,6 +16,8 @@ class MetricKeys(object):
RECALL = 'recall'
PSNR = 'psnr'
SSIM = 'ssim'
AVERAGE_LOSS = 'avg_loss'
FScore = 'fscore'


task_default_metrics = {
@@ -28,19 +31,26 @@ task_default_metrics = {
Tasks.image_color_enhancement: [Metrics.image_color_enhance_metric],
Tasks.image_portrait_enhancement:
[Metrics.image_portrait_enhancement_metric],
Tasks.video_summarization: [Metrics.video_summarization_metric],
Tasks.image_captioning: [Metrics.text_gen_metric],
Tasks.visual_question_answering: [Metrics.text_gen_metric],
Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric],
}


def build_metric(metric_name: str,
def build_metric(metric_cfg: Union[str, Dict],
field: str = default_group,
default_args: dict = None):
""" Build metric given metric_name and field.

Args:
metric_name (:obj:`str`): The metric name.
metric_cfg (str | dict): The metric name or a metric config dict containing a 'type' key.
field (str, optional): The field of this metric, default value: 'default' for all fields.
default_args (dict, optional): Default initialization arguments.
"""
cfg = ConfigDict({'type': metric_name})
if isinstance(metric_cfg, Mapping):
assert 'type' in metric_cfg
else:
metric_cfg = ConfigDict({'type': metric_cfg})
return build_from_cfg(
cfg, METRICS, group_key=field, default_args=default_args)
metric_cfg, METRICS, group_key=field, default_args=default_args)
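
With this change, `build_metric` accepts either a registered metric name or a config dict carrying a `type` key; a quick sketch of both call forms:

```python
# Hedged sketch: the two call forms supported by the updated build_metric.
from modelscope.metrics.builder import build_metric

metric_by_name = build_metric('accuracy')            # plain metric name, as before
metric_by_cfg = build_metric({'type': 'accuracy'})   # config dict with a 'type' key
```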

modelscope/metrics/movie_scene_segmentation_metric.py  (+52, -0)

@@ -0,0 +1,52 @@
from typing import Dict

import numpy as np

from modelscope.metainfo import Metrics
from modelscope.utils.registry import default_group
from modelscope.utils.tensor_utils import (torch_nested_detach,
torch_nested_numpify)
from .base import Metric
from .builder import METRICS, MetricKeys


@METRICS.register_module(
group_key=default_group,
module_name=Metrics.movie_scene_segmentation_metric)
class MovieSceneSegmentationMetric(Metric):
"""The metric computation class for movie scene segmentation classes.
"""

def __init__(self):
self.preds = []
self.labels = []
self.eps = 1e-5

def add(self, outputs: Dict, inputs: Dict):
preds = outputs['pred']
labels = inputs['label']
self.preds.extend(preds)
self.labels.extend(labels)

def evaluate(self):
gts = np.array(torch_nested_numpify(torch_nested_detach(self.labels)))
prob = np.array(torch_nested_numpify(torch_nested_detach(self.preds)))

gt_one = gts == 1
gt_zero = gts == 0
pred_one = prob == 1
pred_zero = prob == 0

tp = (gt_one * pred_one).sum()
fp = (gt_zero * pred_one).sum()
fn = (gt_one * pred_zero).sum()

precision = 100.0 * tp / (tp + fp + self.eps)
recall = 100.0 * tp / (tp + fn + self.eps)
f1 = 2 * precision * recall / (precision + recall)

return {
MetricKeys.F1: f1,
MetricKeys.RECALL: recall,
MetricKeys.PRECISION: precision
}
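
A hedged sketch of the precision/recall/F1 computation above on a toy batch; the tensors are illustrative only.

```python
# Hedged sketch: one toy batch of shot-boundary predictions vs. labels.
import torch

from modelscope.metrics.movie_scene_segmentation_metric import MovieSceneSegmentationMetric

metric = MovieSceneSegmentationMetric()
metric.add(outputs={'pred': torch.tensor([1, 0, 1, 0])},
           inputs={'label': torch.tensor([1, 1, 1, 0])})
print(metric.evaluate())   # f1, recall and precision, in percent
```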

modelscope/metrics/video_summarization_metric.py  (+78, -0)

@@ -0,0 +1,78 @@
from typing import Dict

import numpy as np

from modelscope.metainfo import Metrics
from modelscope.models.cv.video_summarization.summarizer import \
generate_summary
from modelscope.utils.registry import default_group
from .base import Metric
from .builder import METRICS, MetricKeys


def evaluate_summary(predicted_summary, user_summary, eval_method):
""" Compare the predicted summary with the user defined one(s).

:param ndarray predicted_summary: The generated summary from our model.
:param ndarray user_summary: The user defined ground truth summaries (or summary).
:param str eval_method: The proposed evaluation method; either 'max' (SumMe) or 'avg' (TVSum).
:return: The reduced fscore based on the eval_method
"""
max_len = max(len(predicted_summary), user_summary.shape[1])
S = np.zeros(max_len, dtype=int)
G = np.zeros(max_len, dtype=int)
S[:len(predicted_summary)] = predicted_summary

f_scores = []
for user in range(user_summary.shape[0]):
G[:user_summary.shape[1]] = user_summary[user]
overlapped = S & G

# Compute precision, recall, f-score
precision = sum(overlapped) / sum(S)
recall = sum(overlapped) / sum(G)
if precision + recall == 0:
f_scores.append(0)
else:
f_score = 2 * precision * recall * 100 / (precision + recall)
f_scores.append(f_score)

if eval_method == 'max':
return max(f_scores)
else:
return sum(f_scores) / len(f_scores)


def calculate_f_score(outputs: Dict, inputs: Dict):
scores = outputs['scores']
scores = scores.squeeze(0).cpu().numpy().tolist()
user_summary = inputs['user_summary'].cpu().numpy()[0]
sb = inputs['change_points'].cpu().numpy()[0]
n_frames = inputs['n_frames'].cpu().numpy()[0]
positions = inputs['positions'].cpu().numpy()[0]
summary = generate_summary([sb], [scores], [n_frames], [positions])[0]
f_score = evaluate_summary(summary, user_summary, 'avg')
return f_score


@METRICS.register_module(
group_key=default_group, module_name=Metrics.video_summarization_metric)
class VideoSummarizationMetric(Metric):
"""The metric for video summarization task.
"""

def __init__(self):
self.inputs = []
self.outputs = []

def add(self, outputs: Dict, inputs: Dict):
self.outputs.append(outputs)
self.inputs.append(inputs)

def evaluate(self):
f_scores = [
calculate_f_score(output, input)
for output, input in zip(self.outputs, self.inputs)
]

return {MetricKeys.FScore: sum(f_scores) / len(f_scores)}
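
Since `evaluate_summary` carries the core of the metric, here is a small worked example under the 'avg' reduction; the arrays are illustrative only.

```python
# Hedged worked example for evaluate_summary as defined above.
import numpy as np

from modelscope.metrics.video_summarization_metric import evaluate_summary

predicted = np.array([1, 1, 0, 0, 1])
user_summaries = np.array([[1, 0, 0, 0, 1],    # overlap 2 -> f-score 80.0
                           [1, 1, 1, 0, 0]])   # overlap 2 -> f-score ~66.7
print(evaluate_summary(predicted, user_summaries, 'avg'))   # ~73.3
```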

modelscope/models/audio/ans/frcrn.py  (+32, -21)

@@ -75,27 +75,37 @@ class FRCRNModel(TorchModel):
model_bin_file = os.path.join(model_dir,
ModelFile.TORCH_MODEL_BIN_FILE)
if os.path.exists(model_bin_file):
checkpoint = torch.load(model_bin_file)
self.model.load_state_dict(checkpoint, strict=False)
checkpoint = torch.load(
model_bin_file, map_location=torch.device('cpu'))
if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
self.model.load_state_dict(
checkpoint['state_dict'], strict=False)
else:
self.model.load_state_dict(checkpoint, strict=False)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
output = self.model.forward(input)
return {
'spec_l1': output[0],
'wav_l1': output[1],
'mask_l1': output[2],
'spec_l2': output[3],
'wav_l2': output[4],
'mask_l2': output[5]
result_list = self.model.forward(input['noisy'])
output = {
'spec_l1': result_list[0],
'wav_l1': result_list[1],
'mask_l1': result_list[2],
'spec_l2': result_list[3],
'wav_l2': result_list[4],
'mask_l2': result_list[5]
}

def to(self, *args, **kwargs):
self.model = self.model.to(*args, **kwargs)
return self

def eval(self):
self.model = self.model.train(False)
return self
if 'clean' in input:
mix_result = self.model.loss(
input['noisy'], input['clean'], result_list, mode='Mix')
output.update(mix_result)
sisnr_result = self.model.loss(
input['noisy'], input['clean'], result_list, mode='SiSNR')
output.update(sisnr_result)
# logger hooker will use items under 'log_vars'
output['log_vars'] = {k: mix_result[k].item() for k in mix_result}
output['log_vars'].update(
{k: sisnr_result[k].item()
for k in sisnr_result})
return output


class FRCRN(nn.Module):
@@ -110,7 +120,8 @@ class FRCRN(nn.Module):
win_len=400,
win_inc=100,
fft_len=512,
win_type='hanning'):
win_type='hanning',
**kwargs):
r"""
Args:
complex: Whether to use complex networks.
@@ -236,7 +247,7 @@ class FRCRN(nn.Module):
if count != 3:
loss = self.loss_1layer(noisy, est_spec, est_wav, labels,
est_mask, mode)
return loss
return dict(sisnr=loss)

elif mode == 'Mix':
count = 0
@@ -251,7 +262,7 @@ class FRCRN(nn.Module):
amp_loss, phase_loss, SiSNR_loss = self.loss_1layer(
noisy, est_spec, est_wav, labels, est_mask, mode)
loss = amp_loss + phase_loss + SiSNR_loss
return loss, amp_loss, phase_loss
return dict(loss=loss, amp_loss=amp_loss, phase_loss=phase_loss)

def loss_1layer(self, noisy, est, est_wav, labels, cmp_mask, mode='Mix'):
r""" Compute the loss by mode


modelscope/models/audio/kws/farfield/model.py  (+1, -0)

@@ -33,6 +33,7 @@ class FSMNSeleNetV2Decorator(TorchModel):
ModelFile.TORCH_MODEL_BIN_FILE)
self._model = None
if os.path.exists(model_bin_file):
kwargs.pop('device')
self._model = FSMNSeleNetV2(*args, **kwargs)
checkpoint = torch.load(model_bin_file)
self._model.load_state_dict(checkpoint, strict=False)


modelscope/models/base/base_model.py  (+40, -9)

@@ -1,15 +1,15 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import os.path as osp
from abc import ABC, abstractmethod
from typing import Dict, Optional, Union

import numpy as np
from typing import Callable, Dict, List, Optional, Union

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models.builder import build_model
from modelscope.utils.checkpoint import save_pretrained
from modelscope.utils.config import Config
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
from modelscope.utils.device import device_placement, verify_device
from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.hub import parse_label_mapping
from modelscope.utils.logger import get_logger
@@ -24,8 +24,7 @@ class Model(ABC):
def __init__(self, model_dir, *args, **kwargs):
self.model_dir = model_dir
device_name = kwargs.get('device', 'gpu')
assert device_name in ['gpu',
'cpu'], 'device should be either cpu or gpu.'
verify_device(device_name)
self._device_name = device_name

def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
@@ -72,6 +71,7 @@ class Model(ABC):
model_name_or_path: str,
revision: Optional[str] = DEFAULT_MODEL_REVISION,
cfg_dict: Config = None,
device: str = None,
*model_args,
**kwargs):
""" Instantiate a model from local directory or remote model repo. Note
@@ -97,7 +97,7 @@ class Model(ABC):
osp.join(local_model_dir, ModelFile.CONFIGURATION))
task_name = cfg.task
model_cfg = cfg.model
# TODO @wenmeng.zwm may should manually initialize model after model building
framework = cfg.framework

if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'):
model_cfg.type = model_cfg.model_type
@@ -105,10 +105,41 @@ class Model(ABC):
model_cfg.model_dir = local_model_dir
for k, v in kwargs.items():
model_cfg[k] = v
model = build_model(
model_cfg, task_name=task_name, default_args=kwargs)
if device is not None:
model_cfg.device = device
with device_placement(framework, device):
model = build_model(
model_cfg, task_name=task_name, default_args=kwargs)
else:
model = build_model(
model_cfg, task_name=task_name, default_args=kwargs)

# dynamically add pipeline info to model for pipeline inference
if hasattr(cfg, 'pipeline'):
model.pipeline = cfg.pipeline
return model

def save_pretrained(self,
target_folder: Union[str, os.PathLike],
save_checkpoint_names: Union[str, List[str]] = None,
save_function: Callable = None,
config: Optional[dict] = None,
**kwargs):
"""save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded

Args:
target_folder (Union[str, os.PathLike]):
Directory to which to save. Will be created if it doesn't exist.

save_checkpoint_names (Union[str, List[str]]):
The checkpoint names to be saved in the target_folder

save_function (Callable, optional):
The function to use to save the state dictionary.

config (Optional[dict], optional):
The config for the configuration.json, might not be identical with model.config

"""
save_pretrained(self, target_folder, save_checkpoint_names,
save_function, config, **kwargs)
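
A hedged sketch of the extended entry points above; the model id is a placeholder, and `Model` is assumed to be exported from `modelscope.models` as in the rest of the library.

```python
# Hedged sketch: pass a device through from_pretrained and export with save_pretrained.
from modelscope.models import Model

model = Model.from_pretrained('damo/example-model', device='cpu')   # placeholder model id
model.save_pretrained('./exported_model',
                      save_checkpoint_names='pytorch_model.bin')
```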

modelscope/models/cv/__init__.py  (+15, -7)

@@ -1,9 +1,17 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

# yapf: disable
from . import (action_recognition, animal_recognition, body_2d_keypoints,
cartoon, cmdssl_video_embedding, crowd_counting, face_detection,
face_generation, image_classification, image_color_enhance,
image_colorization, image_denoise, image_instance_segmentation,
image_portrait_enhancement, image_to_image_generation,
image_to_image_translation, object_detection,
product_retrieval_embedding, salient_detection,
super_resolution, video_single_object_tracking, virual_tryon)
body_3d_keypoints, cartoon, cmdssl_video_embedding,
crowd_counting, face_detection, face_generation,
image_classification, image_color_enhance, image_colorization,
image_denoise, image_instance_segmentation,
image_panoptic_segmentation, image_portrait_enhancement,
image_reid_person, image_semantic_segmentation,
image_to_image_generation, image_to_image_translation,
movie_scene_segmentation, object_detection,
product_retrieval_embedding, realtime_object_detection,
salient_detection, super_resolution,
video_single_object_tracking, video_summarization, virual_tryon)

# yapf: enable

+ 43
- 2
modelscope/models/cv/action_recognition/models.py View File

@@ -1,5 +1,6 @@
import torch.nn as nn

from .s3dg import Inception3D
from .tada_convnext import TadaConvNeXt


@@ -26,11 +27,25 @@ class BaseVideoModel(nn.Module):
super(BaseVideoModel, self).__init__()
# the backbone is created according to meta-architectures
# defined in models/base/backbone.py
self.backbone = TadaConvNeXt(cfg)
if cfg.MODEL.NAME == 'ConvNeXt_tiny':
self.backbone = TadaConvNeXt(cfg)
elif cfg.MODEL.NAME == 'S3DG':
self.backbone = Inception3D(cfg)
else:
error_str = 'backbone {} is not supported; only ConvNeXt_tiny and S3DG are supported'.format(
    cfg.MODEL.NAME)
raise NotImplementedError(error_str)

# the head is created according to the heads
# defined in models/module_zoo/heads
self.head = BaseHead(cfg)
if cfg.VIDEO.HEAD.NAME == 'BaseHead':
self.head = BaseHead(cfg)
elif cfg.VIDEO.HEAD.NAME == 'AvgHead':
self.head = AvgHead(cfg)
else:
error_str = 'head {} is not supported; only BaseHead and AvgHead are supported'.format(
    cfg.VIDEO.HEAD.NAME)
raise NotImplementedError(error_str)

def forward(self, x):
x = self.backbone(x)
@@ -88,3 +103,29 @@ class BaseHead(nn.Module):
out = self.activation(out)
out = out.view(out.shape[0], -1)
return out, x.view(x.shape[0], -1)


class AvgHead(nn.Module):
"""
Constructs a global average pooling head.
"""

def __init__(
self,
cfg,
):
"""
Args:
cfg (Config): global config object.
"""
super(AvgHead, self).__init__()
self.cfg = cfg
self.global_avg_pool = nn.AdaptiveAvgPool3d(1)

def forward(self, x):
if len(x.shape) == 5:
x = self.global_avg_pool(x)
# (N, C, T, H, W) -> (N, T, H, W, C).
x = x.permute((0, 2, 3, 4, 1))
out = x.view(x.shape[0], -1)
return out, x.view(x.shape[0], -1)
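
A sketch of how the new switches are meant to be driven from the config; the SimpleNamespace stand-in and its values are illustrative assumptions, not the real ModelScope config class.

from types import SimpleNamespace

# Mock config selecting the newly added S3DG backbone and AvgHead.
cfg = SimpleNamespace(
    MODEL=SimpleNamespace(NAME='S3DG'),
    VIDEO=SimpleNamespace(HEAD=SimpleNamespace(NAME='AvgHead')),
)
# With these fields, BaseVideoModel(cfg) builds Inception3D + AvgHead; any other NAME
# raises NotImplementedError as implemented above. (A real config additionally carries
# the DATA/BN fields read by the backbone itself.)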

+ 301
- 0
modelscope/models/cv/action_recognition/s3dg.py View File

@@ -0,0 +1,301 @@
import torch
import torch.nn as nn


class InceptionBaseConv3D(nn.Module):
"""
Constructs basic inception 3D conv.
Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
"""

def __init__(self,
cfg,
in_planes,
out_planes,
kernel_size,
stride,
padding=0):
super(InceptionBaseConv3D, self).__init__()
self.conv = nn.Conv3d(
in_planes,
out_planes,
kernel_size=kernel_size,
stride=stride,
padding=padding,
bias=False)
self.bn = nn.BatchNorm3d(out_planes)
self.relu = nn.ReLU(inplace=True)

# init
self.conv.weight.data.normal_(
mean=0, std=0.01) # original s3d is truncated normal within 2 std
self.bn.weight.data.fill_(1)
self.bn.bias.data.zero_()

def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.relu(x)
return x


class InceptionBlock3D(nn.Module):
"""
Element constructing the S3D/S3DG.
See models/base/backbone.py L99-186.

Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
"""

def __init__(self, cfg, in_planes, out_planes):
super(InceptionBlock3D, self).__init__()

_gating = cfg.VIDEO.BACKBONE.BRANCH.GATING

assert len(out_planes) == 6
assert isinstance(out_planes, list)

[
num_out_0_0a, num_out_1_0a, num_out_1_0b, num_out_2_0a,
num_out_2_0b, num_out_3_0b
] = out_planes

self.branch0 = nn.Sequential(
InceptionBaseConv3D(
cfg, in_planes, num_out_0_0a, kernel_size=1, stride=1), )
self.branch1 = nn.Sequential(
InceptionBaseConv3D(
cfg, in_planes, num_out_1_0a, kernel_size=1, stride=1),
STConv3d(
cfg,
num_out_1_0a,
num_out_1_0b,
kernel_size=3,
stride=1,
padding=1),
)
self.branch2 = nn.Sequential(
InceptionBaseConv3D(
cfg, in_planes, num_out_2_0a, kernel_size=1, stride=1),
STConv3d(
cfg,
num_out_2_0a,
num_out_2_0b,
kernel_size=3,
stride=1,
padding=1),
)
self.branch3 = nn.Sequential(
nn.MaxPool3d(kernel_size=(3, 3, 3), stride=1, padding=1),
InceptionBaseConv3D(
cfg, in_planes, num_out_3_0b, kernel_size=1, stride=1),
)

self.out_channels = sum(
[num_out_0_0a, num_out_1_0b, num_out_2_0b, num_out_3_0b])

self.gating = _gating
if _gating:
self.gating_b0 = SelfGating(num_out_0_0a)
self.gating_b1 = SelfGating(num_out_1_0b)
self.gating_b2 = SelfGating(num_out_2_0b)
self.gating_b3 = SelfGating(num_out_3_0b)

def forward(self, x):
x0 = self.branch0(x)
x1 = self.branch1(x)
x2 = self.branch2(x)
x3 = self.branch3(x)
if self.gating:
x0 = self.gating_b0(x0)
x1 = self.gating_b1(x1)
x2 = self.gating_b2(x2)
x3 = self.gating_b3(x3)

out = torch.cat((x0, x1, x2, x3), 1)

return out


class SelfGating(nn.Module):

def __init__(self, input_dim):
super(SelfGating, self).__init__()
self.fc = nn.Linear(input_dim, input_dim)

def forward(self, input_tensor):
"""Feature gating as used in S3D-G"""
spatiotemporal_average = torch.mean(input_tensor, dim=[2, 3, 4])
weights = self.fc(spatiotemporal_average)
weights = torch.sigmoid(weights)
return weights[:, :, None, None, None] * input_tensor


class STConv3d(nn.Module):
"""
Element constructing the S3D/S3DG.
See models/base/backbone.py L99-186.

Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
"""

def __init__(self,
cfg,
in_planes,
out_planes,
kernel_size,
stride,
padding=0):
super(STConv3d, self).__init__()
if isinstance(stride, tuple):
t_stride = stride[0]
stride = stride[-1]
else: # int
t_stride = stride

self.bn_mmt = cfg.BN.MOMENTUM
self.bn_eps = float(cfg.BN.EPS)
self._construct_branch(cfg, in_planes, out_planes, kernel_size, stride,
t_stride, padding)

def _construct_branch(self,
cfg,
in_planes,
out_planes,
kernel_size,
stride,
t_stride,
padding=0):
self.conv1 = nn.Conv3d(
in_planes,
out_planes,
kernel_size=(1, kernel_size, kernel_size),
stride=(1, stride, stride),
padding=(0, padding, padding),
bias=False)
self.conv2 = nn.Conv3d(
out_planes,
out_planes,
kernel_size=(kernel_size, 1, 1),
stride=(t_stride, 1, 1),
padding=(padding, 0, 0),
bias=False)

self.bn1 = nn.BatchNorm3d(
out_planes, eps=self.bn_eps, momentum=self.bn_mmt)
self.bn2 = nn.BatchNorm3d(
out_planes, eps=self.bn_eps, momentum=self.bn_mmt)
self.relu = nn.ReLU(inplace=True)

# init
self.conv1.weight.data.normal_(
mean=0, std=0.01) # original s3d is truncated normal within 2 std
self.conv2.weight.data.normal_(
mean=0, std=0.01) # original s3d is truncated normal within 2 std
self.bn1.weight.data.fill_(1)
self.bn1.bias.data.zero_()
self.bn2.weight.data.fill_(1)
self.bn2.bias.data.zero_()

def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = self.relu(x)
return x


class Inception3D(nn.Module):
"""
Backbone architecture for I3D/S3DG.
Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py.
"""

def __init__(self, cfg):
"""
Args:
cfg (Config): global config object.
"""
super(Inception3D, self).__init__()
_input_channel = cfg.DATA.NUM_INPUT_CHANNELS
self._construct_backbone(cfg, _input_channel)

def _construct_backbone(self, cfg, input_channel):
# ------------------- Block 1 -------------------
self.Conv_1a = STConv3d(
cfg, input_channel, 64, kernel_size=7, stride=2, padding=3)

self.block1 = nn.Sequential(self.Conv_1a) # (64, 32, 112, 112)

# ------------------- Block 2 -------------------
self.MaxPool_2a = nn.MaxPool3d(
kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
self.Conv_2b = InceptionBaseConv3D(
cfg, 64, 64, kernel_size=1, stride=1)
self.Conv_2c = STConv3d(
cfg, 64, 192, kernel_size=3, stride=1, padding=1)

self.block2 = nn.Sequential(
self.MaxPool_2a, # (64, 32, 56, 56)
self.Conv_2b, # (64, 32, 56, 56)
self.Conv_2c) # (192, 32, 56, 56)

# ------------------- Block 3 -------------------
self.MaxPool_3a = nn.MaxPool3d(
kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
self.Mixed_3b = InceptionBlock3D(
cfg, in_planes=192, out_planes=[64, 96, 128, 16, 32, 32])
self.Mixed_3c = InceptionBlock3D(
cfg, in_planes=256, out_planes=[128, 128, 192, 32, 96, 64])

self.block3 = nn.Sequential(
self.MaxPool_3a, # (192, 32, 28, 28)
self.Mixed_3b, # (256, 32, 28, 28)
self.Mixed_3c) # (480, 32, 28, 28)

# ------------------- Block 4 -------------------
self.MaxPool_4a = nn.MaxPool3d(
kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))
self.Mixed_4b = InceptionBlock3D(
cfg, in_planes=480, out_planes=[192, 96, 208, 16, 48, 64])
self.Mixed_4c = InceptionBlock3D(
cfg, in_planes=512, out_planes=[160, 112, 224, 24, 64, 64])
self.Mixed_4d = InceptionBlock3D(
cfg, in_planes=512, out_planes=[128, 128, 256, 24, 64, 64])
self.Mixed_4e = InceptionBlock3D(
cfg, in_planes=512, out_planes=[112, 144, 288, 32, 64, 64])
self.Mixed_4f = InceptionBlock3D(
cfg, in_planes=528, out_planes=[256, 160, 320, 32, 128, 128])

self.block4 = nn.Sequential(
self.MaxPool_4a, # (480, 16, 14, 14)
self.Mixed_4b, # (512, 16, 14, 14)
self.Mixed_4c, # (512, 16, 14, 14)
self.Mixed_4d, # (512, 16, 14, 14)
self.Mixed_4e, # (528, 16, 14, 14)
self.Mixed_4f) # (832, 16, 14, 14)

# ------------------- Block 5 -------------------
self.MaxPool_5a = nn.MaxPool3d(
kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 0, 0))
self.Mixed_5b = InceptionBlock3D(
cfg, in_planes=832, out_planes=[256, 160, 320, 32, 128, 128])
self.Mixed_5c = InceptionBlock3D(
cfg, in_planes=832, out_planes=[384, 192, 384, 48, 128, 128])

self.block5 = nn.Sequential(
self.MaxPool_5a, # (832, 8, 7, 7)
self.Mixed_5b, # (832, 8, 7, 7)
self.Mixed_5c) # (1024, 8, 7, 7)

def forward(self, x):
if isinstance(x, dict):
x = x['video']
x = self.block1(x)
x = self.block2(x)
x = self.block3(x)
x = self.block4(x)
x = self.block5(x)
return x
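
A minimal sketch of pushing a dummy clip through the new Inception3D backbone; the nested config mirrors the fields the module actually reads (DATA.NUM_INPUT_CHANNELS, VIDEO.BACKBONE.BRANCH.GATING, BN.MOMENTUM, BN.EPS), with illustrative values.

import torch
from types import SimpleNamespace

cfg = SimpleNamespace(
    DATA=SimpleNamespace(NUM_INPUT_CHANNELS=3),
    VIDEO=SimpleNamespace(BACKBONE=SimpleNamespace(BRANCH=SimpleNamespace(GATING=True))),
    BN=SimpleNamespace(MOMENTUM=0.1, EPS=1e-5),
)
backbone = Inception3D(cfg)
clip = torch.randn(1, 3, 64, 224, 224)   # (N, C, T, H, W)
feat = backbone(clip)                    # shape (1, 1024, 8, 7, 7), matching the comments above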

+ 23
- 0
modelscope/models/cv/body_3d_keypoints/__init__.py View File

@@ -0,0 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:

from .body_3d_pose import BodyKeypointsDetection3D

else:
_import_structure = {
'body_3d_pose': ['BodyKeypointsDetection3D'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)
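
The other new CV packages below register themselves the same way; the LazyImportModule only imports the submodule when the exported symbol is first accessed, e.g.:

# body_3d_pose (and its torch dependencies) are loaded on this first access:
from modelscope.models.cv.body_3d_keypoints import BodyKeypointsDetection3D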

+ 246
- 0
modelscope/models/cv/body_3d_keypoints/body_3d_pose.py View File

@@ -0,0 +1,246 @@
import logging
import os.path as osp
from typing import Any, Dict, List, Union

import numpy as np
import torch

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.body_3d_keypoints.canonical_pose_modules import (
TemporalModel, TransCan3Dkeys)
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['BodyKeypointsDetection3D']


class KeypointsTypes(object):
POSES_CAMERA = 'poses_camera'
POSES_TRAJ = 'poses_traj'


@MODELS.register_module(
Tasks.body_3d_keypoints, module_name=Models.body_3d_keypoints)
class BodyKeypointsDetection3D(TorchModel):

def __init__(self, model_dir: str, *args, **kwargs):

super().__init__(model_dir, *args, **kwargs)

self.model_dir = model_dir
model_path = osp.join(self.model_dir, ModelFile.TORCH_MODEL_FILE)
cfg_path = osp.join(self.model_dir, ModelFile.CONFIGURATION)
self.cfg = Config.from_file(cfg_path)
self._create_model()

if not osp.exists(model_path):
raise IOError(f'{model_path} does not exist.')

if torch.cuda.is_available():
self._device = torch.device('cuda')
else:
self._device = torch.device('cpu')
self.pretrained_state_dict = torch.load(
model_path, map_location=self._device)

self.load_pretrained()
self.to_device(self._device)
self.eval()

def _create_model(self):
self.model_pos = TemporalModel(
self.cfg.model.MODEL.IN_NUM_JOINTS,
self.cfg.model.MODEL.IN_2D_FEATURE,
self.cfg.model.MODEL.OUT_NUM_JOINTS,
filter_widths=self.cfg.model.MODEL.FILTER_WIDTHS,
causal=self.cfg.model.MODEL.CAUSAL,
dropout=self.cfg.model.MODEL.DROPOUT,
channels=self.cfg.model.MODEL.CHANNELS,
dense=self.cfg.model.MODEL.DENSE)

receptive_field = self.model_pos.receptive_field()
self.pad = (receptive_field - 1) // 2
if self.cfg.model.MODEL.CAUSAL:
self.causal_shift = self.pad
else:
self.causal_shift = 0

self.model_traj = TransCan3Dkeys(
in_channels=self.cfg.model.MODEL.IN_NUM_JOINTS
* self.cfg.model.MODEL.IN_2D_FEATURE,
num_features=1024,
out_channels=self.cfg.model.MODEL.OUT_3D_FEATURE,
num_blocks=4,
time_window=receptive_field)

def eval(self):
self.model_pos.eval()
self.model_traj.eval()

def train(self):
self.model_pos.train()
self.model_traj.train()

def to_device(self, device):
self.model_pos = self.model_pos.to(device)
self.model_traj = self.model_traj.to(device)

def load_pretrained(self):
if 'model_pos' in self.pretrained_state_dict:
self.model_pos.load_state_dict(
self.pretrained_state_dict['model_pos'], strict=False)
else:
logging.error(
    'Failed to load model_pos: "model_pos" not found in pretrained_state_dict')

if 'model_traj' in self.pretrained_state_dict:
self.model_traj.load_state_dict(
self.pretrained_state_dict['model_traj'], strict=False)
else:
logging.error(
    'Failed to load model_traj: "model_traj" not found in pretrained_state_dict')
logging.info('Loaded the pretrained model.')

def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
"""Proprocess of 2D input joints.

Args:
input (Dict[str, Any]): [NUM_FRAME, NUM_JOINTS, 2], input 2d human body keypoints.

Returns:
Dict[str, Any]: canonical 2d points and root relative joints.
"""
if 'cuda' == input.device.type:
input = input.data.cpu().numpy()
elif 'cpu' == input.device.type:
input = input.data.numpy()
pose2d = input

pose2d_canonical = self.canonicalize_2Ds(
pose2d, self.cfg.model.INPUT.FOCAL_LENGTH,
self.cfg.model.INPUT.CENTER)
pose2d_normalized = self.normalize_screen_coordinates(
pose2d, self.cfg.model.INPUT.RES_W, self.cfg.model.INPUT.RES_H)
pose2d_rr = pose2d_normalized
pose2d_rr[:, 1:] -= pose2d_rr[:, :1]

# expand [NUM_FRAME, NUM_JOINTS, 2] to [1, NUM_FRAME, NUM_JOINTS, 2]
pose2d_rr = np.expand_dims(
np.pad(
pose2d_rr,
((self.pad + self.causal_shift, self.pad - self.causal_shift),
(0, 0), (0, 0)), 'edge'),
axis=0)
pose2d_canonical = np.expand_dims(
np.pad(
pose2d_canonical,
((self.pad + self.causal_shift, self.pad - self.causal_shift),
(0, 0), (0, 0)), 'edge'),
axis=0)
pose2d_rr = torch.from_numpy(pose2d_rr.astype(np.float32))
pose2d_canonical = torch.from_numpy(
pose2d_canonical.astype(np.float32))

inputs_2d = pose2d_rr.clone()
if torch.cuda.is_available():
inputs_2d = inputs_2d.cuda(non_blocking=True)

# Positional model
if self.cfg.model.MODEL.USE_2D_OFFSETS:
inputs_2d[:, :, 0] = 0
else:
inputs_2d[:, :, 1:] += inputs_2d[:, :, :1]

return {
'inputs_2d': inputs_2d,
'pose2d_rr': pose2d_rr,
'pose2d_canonical': pose2d_canonical
}

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
"""3D human pose estimation.

Args:
input (Dict):
inputs_2d: [1, NUM_FRAME, NUM_JOINTS, 2]
pose2d_rr: [1, NUM_FRAME, NUM_JOINTS, 2]
pose2d_canonical: [1, NUM_FRAME, NUM_JOINTS, 2]
NUM_FRAME = max(receptive_field + video_frame_number, video_frame_number)

Returns:
Dict[str, Any]:
"camera_pose": Tensor, [1, NUM_FRAME, OUT_NUM_JOINTS, OUT_3D_FEATURE_DIM],
3D human pose keypoints in camera frame.
"camera_traj": Tensor, [1, NUM_FRAME, 1, 3],
root keypoint coordinates in the camera frame.
"""
inputs_2d = input['inputs_2d']
pose2d_rr = input['pose2d_rr']
pose2d_canonical = input['pose2d_canonical']
with torch.no_grad():
# predict 3D pose keypoints
predicted_3d_pos = self.model_pos(inputs_2d)

# predict global trajectory
b1, w1, n1, d1 = inputs_2d.shape

input_pose2d_abs = self.get_abs_2d_pts(w1, pose2d_rr,
pose2d_canonical)
b1, w1, n1, d1 = input_pose2d_abs.size()
b2, w2, n2, d2 = predicted_3d_pos.size()

if torch.cuda.is_available():
input_pose2d_abs = input_pose2d_abs.cuda(non_blocking=True)

predicted_3d_traj = self.model_traj(
input_pose2d_abs.view(b1, w1, n1 * d1),
predicted_3d_pos.view(b2 * w2, n2 * d2)).view(b2, w2, -1, 3)

predict_dict = {
KeypointsTypes.POSES_CAMERA: predicted_3d_pos,
KeypointsTypes.POSES_TRAJ: predicted_3d_traj
}

return predict_dict

def get_abs_2d_pts(self, input_video_frame_num, pose2d_rr,
pose2d_canonical):
pad = self.pad
w = input_video_frame_num - pad * 2

lst_pose2d_rr = []
lst_pose2d_cannoical = []
for i in range(pad, w + pad):
lst_pose2d_rr.append(pose2d_rr[:, i - pad:i + pad + 1])
lst_pose2d_cannoical.append(pose2d_canonical[:,
i - pad:i + pad + 1])

input_pose2d_rr = torch.concat(lst_pose2d_rr, axis=0)
input_pose2d_cannoical = torch.concat(lst_pose2d_cannoical, axis=0)

if self.cfg.model.MODEL.USE_CANONICAL_COORDS:
input_pose2d_abs = input_pose2d_cannoical.clone()
else:
input_pose2d_abs = input_pose2d_rr.clone()
input_pose2d_abs[:, :, 1:] += input_pose2d_abs[:, :, :1]

return input_pose2d_abs

def canonicalize_2Ds(self, pos2d, f, c):
cs = np.array([c[0], c[1]]).reshape(1, 1, 2)
fs = np.array([f[0], f[1]]).reshape(1, 1, 2)
canoical_2Ds = (pos2d - cs) / fs
return canoical_2Ds

def normalize_screen_coordinates(self, X, w, h):
assert X.shape[-1] == 2

# Normalize so that [0, w] is mapped to [-1, 1], while preserving the aspect ratio
return X / w * 2 - [1, h / w]
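
A small numeric check of normalize_screen_coordinates above; the 1920x1080 resolution is an illustrative assumption.

import numpy as np

X = np.array([[0., 0.], [960., 540.], [1920., 1080.]])
w, h = 1920, 1080
print(X / w * 2 - [1, h / w])
# [[-1.     -0.5625]
#  [ 0.      0.    ]
#  [ 1.      0.5625]]   -> x mapped to [-1, 1], y scaled by the same factor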

+ 233
- 0
modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py View File

@@ -0,0 +1,233 @@
# The implementation is based on VideoPose3D, available at https://github.com/facebookresearch/VideoPose3D
import torch
import torch.nn as nn


class TemporalModelBase(nn.Module):
"""
Do not instantiate this class.
"""

def __init__(self, num_joints_in, in_features, num_joints_out,
filter_widths, causal, dropout, channels):
super().__init__()

# Validate input
for fw in filter_widths:
assert fw % 2 != 0, 'Only odd filter widths are supported'

self.num_joints_in = num_joints_in
self.in_features = in_features
self.num_joints_out = num_joints_out
self.filter_widths = filter_widths

self.drop = nn.Dropout(dropout)
self.relu = nn.ReLU(inplace=True)

self.pad = [filter_widths[0] // 2]
self.expand_bn = nn.BatchNorm1d(channels, momentum=0.1)
self.shrink = nn.Conv1d(channels, num_joints_out * 3, 1)

def set_bn_momentum(self, momentum):
self.expand_bn.momentum = momentum
for bn in self.layers_bn:
bn.momentum = momentum

def receptive_field(self):
"""
Return the total receptive field of this model as # of frames.
"""
frames = 0
for f in self.pad:
frames += f
return 1 + 2 * frames

def total_causal_shift(self):
"""
Return the asymmetric offset for sequence padding.
The returned value is typically 0 if causal convolutions are disabled,
otherwise it is half the receptive field.
"""
frames = self.causal_shift[0]
next_dilation = self.filter_widths[0]
for i in range(1, len(self.filter_widths)):
frames += self.causal_shift[i] * next_dilation
next_dilation *= self.filter_widths[i]
return frames

def forward(self, x):
assert len(x.shape) == 4
assert x.shape[-2] == self.num_joints_in
assert x.shape[-1] == self.in_features

sz = x.shape[:3]
x = x.view(x.shape[0], x.shape[1], -1)
x = x.permute(0, 2, 1)

x = self._forward_blocks(x)

x = x.permute(0, 2, 1)
x = x.view(sz[0], -1, self.num_joints_out, 3)

return x


class TemporalModel(TemporalModelBase):
"""
Reference 3D pose estimation model with temporal convolutions.
This implementation can be used for all use-cases.
"""

def __init__(self,
num_joints_in,
in_features,
num_joints_out,
filter_widths,
causal=False,
dropout=0.25,
channels=1024,
dense=False):
"""
Initialize this model.

Arguments:
num_joints_in -- number of input joints (e.g. 17 for Human3.6M)
in_features -- number of input features for each joint (typically 2 for 2D input)
num_joints_out -- number of output joints (can be different than input)
filter_widths -- list of convolution widths, which also determines the # of blocks and receptive field
causal -- use causal convolutions instead of symmetric convolutions (for real-time applications)
dropout -- dropout probability
channels -- number of convolution channels
dense -- use regular dense convolutions instead of dilated convolutions (ablation experiment)
"""
super().__init__(num_joints_in, in_features, num_joints_out,
filter_widths, causal, dropout, channels)

self.expand_conv = nn.Conv1d(
num_joints_in * in_features,
channels,
filter_widths[0],
bias=False)

layers_conv = []
layers_bn = []

self.causal_shift = [(filter_widths[0]) // 2 if causal else 0]
next_dilation = filter_widths[0]
for i in range(1, len(filter_widths)):
self.pad.append((filter_widths[i] - 1) * next_dilation // 2)
self.causal_shift.append((filter_widths[i] // 2
* next_dilation) if causal else 0)

layers_conv.append(
nn.Conv1d(
channels,
channels,
filter_widths[i] if not dense else (2 * self.pad[-1] + 1),
dilation=next_dilation if not dense else 1,
bias=False))
layers_bn.append(nn.BatchNorm1d(channels, momentum=0.1))
layers_conv.append(
nn.Conv1d(channels, channels, 1, dilation=1, bias=False))
layers_bn.append(nn.BatchNorm1d(channels, momentum=0.1))

next_dilation *= filter_widths[i]

self.layers_conv = nn.ModuleList(layers_conv)
self.layers_bn = nn.ModuleList(layers_bn)

def _forward_blocks(self, x):
x = self.drop(self.relu(self.expand_bn(self.expand_conv(x))))
for i in range(len(self.pad) - 1):
pad = self.pad[i + 1]
shift = self.causal_shift[i + 1]
res = x[:, :, pad + shift:x.shape[2] - pad + shift]
x = self.drop(
self.relu(self.layers_bn[2 * i](self.layers_conv[2 * i](x))))
x = res + self.drop(
self.relu(self.layers_bn[2 * i + 1](
self.layers_conv[2 * i + 1](x))))

x = self.shrink(x)
return x


# regression of the trajectory
class TransCan3Dkeys(nn.Module):

def __init__(self,
in_channels=74,
num_features=256,
out_channels=44,
time_window=10,
num_blocks=2):
super().__init__()
self.in_channels = in_channels
self.num_features = num_features
self.out_channels = out_channels
self.num_blocks = num_blocks
self.time_window = time_window

self.expand_bn = nn.BatchNorm1d(self.num_features, momentum=0.1)
self.conv1 = nn.Sequential(
nn.ReplicationPad1d(1),
nn.Conv1d(
self.in_channels, self.num_features, kernel_size=3,
bias=False), self.expand_bn, nn.ReLU(inplace=True),
nn.Dropout(p=0.25))
self._make_blocks()
self.pad = nn.ReplicationPad1d(4)
self.relu = nn.ReLU(inplace=True)
self.drop = nn.Dropout(p=0.25)
self.reduce = nn.Conv1d(
self.num_features, self.num_features, kernel_size=self.time_window)
self.embedding_3d_1 = nn.Linear(in_channels // 2 * 3, 500)
self.embedding_3d_2 = nn.Linear(500, 500)
self.LReLU1 = nn.LeakyReLU()
self.LReLU2 = nn.LeakyReLU()
self.LReLU3 = nn.LeakyReLU()
self.out1 = nn.Linear(self.num_features + 500, self.num_features)
self.out2 = nn.Linear(self.num_features, self.out_channels)

def _make_blocks(self):
layers_conv = []
layers_bn = []
for i in range(self.num_blocks):
layers_conv.append(
nn.Conv1d(
self.num_features,
self.num_features,
kernel_size=5,
bias=False,
dilation=2))
layers_bn.append(nn.BatchNorm1d(self.num_features))
self.layers_conv = nn.ModuleList(layers_conv)
self.layers_bn = nn.ModuleList(layers_bn)

def set_bn_momentum(self, momentum):
self.expand_bn.momentum = momentum
for bn in self.layers_bn:
bn.momentum = momentum

def forward(self, p2ds, p3d):
"""
Args:
    p2ds: 2D keypoint sequence of shape (B x T x C).
    p3d: flattened 3D pose prediction fed to the 3D embedding.
"""
B, T, C = p2ds.shape
x = p2ds.permute((0, 2, 1))
x = self.conv1(x)
for i in range(self.num_blocks):
pre = x
x = self.pad(x)
x = self.layers_conv[i](x)
x = self.layers_bn[i](x)
x = self.drop(self.relu(x))
x = pre + x
x_2d = self.relu(self.reduce(x))
x_2d = x_2d.view(B, -1)
x_3d = self.LReLU1(self.embedding_3d_1(p3d))
x = torch.cat((x_2d, x_3d), 1)
x = self.LReLU3(self.out1(x))
x = self.out2(x)
return x
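
A quick sketch of how filter_widths sets the temporal receptive field of TemporalModel; the 17-joint, 2D-input setting is an illustrative assumption (Human3.6M-style).

# Each level multiplies the receptive field by its filter width,
# so [3, 3, 3] consumes 3 * 3 * 3 = 27 frames per predicted frame.
model = TemporalModel(
    num_joints_in=17, in_features=2, num_joints_out=17, filter_widths=[3, 3, 3])
print(model.receptive_field())  # 27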

+ 2
- 2
modelscope/models/cv/crowd_counting/cc_model.py View File

@@ -13,8 +13,8 @@ from modelscope.utils.constant import Tasks
Tasks.crowd_counting, module_name=Models.crowd_counting)
class HRNetCrowdCounting(TorchModel):

def __init__(self, model_dir: str):
super().__init__(model_dir)
def __init__(self, model_dir: str, **kwargs):
super().__init__(model_dir, **kwargs)

from .hrnet_aspp_relu import HighResolutionNet as HRNet_aspp_relu



+ 25
- 0
modelscope/models/cv/easycv_base.py View File

@@ -0,0 +1,25 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.base import BaseModel
from easycv.utils.ms_utils import EasyCVMeta

from modelscope.models.base import TorchModel


class EasyCVBaseModel(BaseModel, TorchModel):
"""Base model for EasyCV."""

def __init__(self, model_dir=None, args=(), kwargs={}):
kwargs.pop(EasyCVMeta.ARCH, None) # pop useless keys
BaseModel.__init__(self)
TorchModel.__init__(self, model_dir=model_dir)

def forward(self, img, mode='train', **kwargs):
if self.training:
losses = self.forward_train(img, **kwargs)
loss, log_vars = self._parse_losses(losses)
return dict(loss=loss, log_vars=log_vars)
else:
return self.forward_test(img, **kwargs)

def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)

+ 1
- 1
modelscope/models/cv/image_classification/mmcls_model.py View File

@@ -10,7 +10,7 @@ from modelscope.utils.constant import Tasks
Tasks.image_classification, module_name=Models.classification_model)
class ClassificationModel(TorchModel):

def __init__(self, model_dir: str):
def __init__(self, model_dir: str, **kwargs):
import mmcv
from mmcls.models import build_classifier



+ 22
- 0
modelscope/models/cv/image_panoptic_segmentation/__init__.py View File

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .panseg_model import SwinLPanopticSegmentation

else:
_import_structure = {
'panseg_model': ['SwinLPanopticSegmentation'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 54
- 0
modelscope/models/cv/image_panoptic_segmentation/panseg_model.py View File

@@ -0,0 +1,54 @@
import os.path as osp

import torch

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks


@MODELS.register_module(
Tasks.image_segmentation, module_name=Models.panoptic_segmentation)
class SwinLPanopticSegmentation(TorchModel):

def __init__(self, model_dir: str, **kwargs):
"""str -- model file root."""
super().__init__(model_dir, **kwargs)

from mmcv.runner import load_checkpoint
import mmcv
from mmdet.models import build_detector

config = osp.join(model_dir, 'config.py')

cfg = mmcv.Config.fromfile(config)
if 'pretrained' in cfg.model:
cfg.model.pretrained = None
elif 'init_cfg' in cfg.model.backbone:
cfg.model.backbone.init_cfg = None

# build model
cfg.model.train_cfg = None
self.model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))

# load model
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
checkpoint = load_checkpoint(
self.model, model_path, map_location='cpu')

self.CLASSES = checkpoint['meta']['CLASSES']
self.num_classes = len(self.CLASSES)
self.cfg = cfg

def inference(self, data):
"""data is dict,contain img and img_metas,follow with mmdet."""

with torch.no_grad():
results = self.model(return_loss=False, rescale=True, **data)
return results

def forward(self, Inputs):
    return self.model(**Inputs)

+ 22
- 0
modelscope/models/cv/image_reid_person/__init__.py View File

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .pass_model import PASS

else:
_import_structure = {
'pass_model': ['PASS'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 136
- 0
modelscope/models/cv/image_reid_person/pass_model.py View File

@@ -0,0 +1,136 @@
# The implementation is also open-sourced by the authors as PASS-reID and is publicly available at
# https://github.com/CASIA-IVA-Lab/PASS-reID

import os
from enum import Enum

import torch
import torch.nn as nn

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from .transreid_model import vit_base_patch16_224_TransReID


class Fusions(Enum):
CAT = 'cat'
MEAN = 'mean'


@MODELS.register_module(
Tasks.image_reid_person, module_name=Models.image_reid_person)
class PASS(TorchModel):

def __init__(self, cfg: Config, model_dir: str, **kwargs):
super(PASS, self).__init__(model_dir=model_dir)
size_train = cfg.INPUT.SIZE_TRAIN
sie_coe = cfg.MODEL.SIE_COE
stride_size = cfg.MODEL.STRIDE_SIZE
drop_path = cfg.MODEL.DROP_PATH
drop_out = cfg.MODEL.DROP_OUT
att_drop_rate = cfg.MODEL.ATT_DROP_RATE
gem_pooling = cfg.MODEL.GEM_POOLING
stem_conv = cfg.MODEL.STEM_CONV
weight = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
self.neck_feat = cfg.TEST.NECK_FEAT
self.dropout_rate = cfg.MODEL.DROPOUT_RATE
self.num_classes = cfg.DATASETS.NUM_CLASSES
self.multi_neck = cfg.MODEL.MULTI_NECK
self.feat_fusion = cfg.MODEL.FEAT_FUSION

self.base = vit_base_patch16_224_TransReID(
img_size=size_train,
sie_xishu=sie_coe,
stride_size=stride_size,
drop_path_rate=drop_path,
drop_rate=drop_out,
attn_drop_rate=att_drop_rate,
gem_pool=gem_pooling,
stem_conv=stem_conv)
self.in_planes = self.base.in_planes

if self.feat_fusion == Fusions.CAT.value:
self.classifier = nn.Linear(
self.in_planes * 2, self.num_classes, bias=False)
elif self.feat_fusion == Fusions.MEAN.value:
self.classifier = nn.Linear(
self.in_planes, self.num_classes, bias=False)

if self.multi_neck:
self.bottleneck = nn.BatchNorm1d(self.in_planes)
self.bottleneck.bias.requires_grad_(False)
self.bottleneck_1 = nn.BatchNorm1d(self.in_planes)
self.bottleneck_1.bias.requires_grad_(False)
self.bottleneck_2 = nn.BatchNorm1d(self.in_planes)
self.bottleneck_2.bias.requires_grad_(False)
self.bottleneck_3 = nn.BatchNorm1d(self.in_planes)
self.bottleneck_3.bias.requires_grad_(False)
else:
if self.feat_fusion == Fusions.CAT.value:
self.bottleneck = nn.BatchNorm1d(self.in_planes * 2)
self.bottleneck.bias.requires_grad_(False)
elif self.feat_fusion == Fusions.MEAN.value:
self.bottleneck = nn.BatchNorm1d(self.in_planes)
self.bottleneck.bias.requires_grad_(False)

self.dropout = nn.Dropout(self.dropout_rate)

self.load_param(weight)

def forward(self, input):

global_feat, local_feat_1, local_feat_2, local_feat_3 = self.base(
input)

# single-neck, almost the same performance
if not self.multi_neck:
if self.feat_fusion == Fusions.MEAN.value:
local_feat = local_feat_1 / 3. + local_feat_2 / 3. + local_feat_3 / 3.
final_feat_before = (global_feat + local_feat) / 2
elif self.feat_fusion == Fusions.CAT.value:
final_feat_before = torch.cat(
(global_feat, local_feat_1 / 3. + local_feat_2 / 3.
+ local_feat_3 / 3.),
dim=1)

final_feat_after = self.bottleneck(final_feat_before)
# multi-neck
else:
feat = self.bottleneck(global_feat)
local_feat_1_bn = self.bottleneck_1(local_feat_1)
local_feat_2_bn = self.bottleneck_2(local_feat_2)
local_feat_3_bn = self.bottleneck_3(local_feat_3)

if self.feat_fusion == Fusions.MEAN.value:
final_feat_before = ((global_feat + local_feat_1 / 3
+ local_feat_2 / 3 + local_feat_3 / 3)
/ 2.)
final_feat_after = (feat + local_feat_1_bn / 3
+ local_feat_2_bn / 3
+ local_feat_3_bn / 3) / 2.
elif self.feat_fusion == Fusions.CAT.value:
final_feat_before = torch.cat(
(global_feat, local_feat_1 / 3. + local_feat_2 / 3.
+ local_feat_3 / 3.),
dim=1)
final_feat_after = torch.cat(
(feat, local_feat_1_bn / 3 + local_feat_2_bn / 3
+ local_feat_3_bn / 3),
dim=1)

if self.neck_feat == 'after':
return final_feat_after
else:
return final_feat_before

def load_param(self, trained_path):
param_dict = torch.load(trained_path, map_location='cpu')
for i in param_dict:
try:
self.state_dict()[i.replace('module.',
'')].copy_(param_dict[i])
except Exception:
continue
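
A toy sketch of the two fusion modes handled in PASS.forward; the batch size is arbitrary and the tensors are random placeholders, while 768 follows in_planes above.

import torch

global_feat = torch.randn(4, 768)
local_mean = torch.randn(4, 768)   # stands in for (local_1 + local_2 + local_3) / 3
mean_fused = (global_feat + local_mean) / 2              # Fusions.MEAN -> (4, 768)
cat_fused = torch.cat((global_feat, local_mean), dim=1)  # Fusions.CAT  -> (4, 1536)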

+ 418
- 0
modelscope/models/cv/image_reid_person/transreid_model.py View File

@@ -0,0 +1,418 @@
# The implementation is also open-sourced by the authors as PASS-reID and is publicly available at
# https://github.com/CASIA-IVA-Lab/PASS-reID

import collections.abc as container_abcs
from functools import partial
from itertools import repeat

import torch
import torch.nn as nn
import torch.nn.functional as F


# From PyTorch internals
def _ntuple(n):

def parse(x):
if isinstance(x, container_abcs.Iterable):
return x
return tuple(repeat(x, n))

return parse


to_2tuple = _ntuple(2)


def vit_base_patch16_224_TransReID(
img_size=(256, 128),
stride_size=16,
drop_path_rate=0.1,
camera=0,
view=0,
local_feature=False,
sie_xishu=1.5,
**kwargs):
model = TransReID(
img_size=img_size,
patch_size=16,
stride_size=stride_size,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=True,
camera=camera,
view=view,
drop_path_rate=drop_path_rate,
sie_xishu=sie_xishu,
local_feature=local_feature,
**kwargs)
return model


def drop_path(x, drop_prob: float = 0., training: bool = False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.

"""
if drop_prob == 0. or not training:
return x
keep_prob = 1 - drop_prob
shape = (x.shape[0], ) + (1, ) * (
x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(
shape, dtype=x.dtype, device=x.device)
random_tensor.floor_() # binarize
output = x.div(keep_prob) * random_tensor
return output


class TransReID(nn.Module):
"""Transformer-based Object Re-Identification
"""

def __init__(self,
img_size=224,
patch_size=16,
stride_size=16,
in_chans=3,
num_classes=1000,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
camera=0,
view=0,
drop_path_rate=0.,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
local_feature=False,
sie_xishu=1.0,
hw_ratio=1,
gem_pool=False,
stem_conv=False):
super().__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
self.local_feature = local_feature
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
stride_size=stride_size,
in_chans=in_chans,
embed_dim=embed_dim,
stem_conv=stem_conv)

num_patches = self.patch_embed.num_patches

self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.part_token1 = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.part_token2 = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.part_token3 = nn.Parameter(torch.zeros(1, 1, embed_dim))

self.cls_pos = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.part1_pos = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.part2_pos = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.part3_pos = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
self.cam_num = camera
self.view_num = view
self.sie_xishu = sie_xishu
self.in_planes = 768
self.gem_pool = gem_pool

# Initialize SIE Embedding
if camera > 1 and view > 1:
self.sie_embed = nn.Parameter(
torch.zeros(camera * view, 1, embed_dim))
elif camera > 1:
self.sie_embed = nn.Parameter(torch.zeros(camera, 1, embed_dim))
elif view > 1:
self.sie_embed = nn.Parameter(torch.zeros(view, 1, embed_dim))

self.pos_drop = nn.Dropout(p=drop_rate)
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule

self.blocks = nn.ModuleList([
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer) for i in range(depth)
])

self.norm = norm_layer(embed_dim)

# Classifier head
self.fc = nn.Linear(embed_dim,
num_classes) if num_classes > 0 else nn.Identity()

self.gem = GeneralizedMeanPooling()

def forward_features(self, x, camera_id, view_id):
B = x.shape[0]
x = self.patch_embed(x)

cls_tokens = self.cls_token.expand(
B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
part_tokens1 = self.part_token1.expand(B, -1, -1)
part_tokens2 = self.part_token2.expand(B, -1, -1)
part_tokens3 = self.part_token3.expand(B, -1, -1)
x = torch.cat(
(cls_tokens, part_tokens1, part_tokens2, part_tokens3, x), dim=1)

if self.cam_num > 0 and self.view_num > 0:
x = x + self.pos_embed + self.sie_xishu * self.sie_embed[
camera_id * self.view_num + view_id]
elif self.cam_num > 0:
x = x + self.pos_embed + self.sie_xishu * self.sie_embed[camera_id]
elif self.view_num > 0:
x = x + self.pos_embed + self.sie_xishu * self.sie_embed[view_id]
else:
x = x + torch.cat((self.cls_pos, self.part1_pos, self.part2_pos,
self.part3_pos, self.pos_embed),
dim=1)

x = self.pos_drop(x)

if self.local_feature:
for blk in self.blocks[:-1]:
x = blk(x)
return x
else:
for blk in self.blocks:
x = blk(x)

x = self.norm(x)
if self.gem_pool:
gf = self.gem(x[:, 1:].permute(0, 2, 1)).squeeze()
return x[:, 0] + gf
return x[:, 0], x[:, 1], x[:, 2], x[:, 3]

def forward(self, x, cam_label=None, view_label=None):
global_feat, local_feat_1, local_feat_2, local_feat_3 = self.forward_features(
x, cam_label, view_label)
return global_feat, local_feat_1, local_feat_2, local_feat_3


class PatchEmbed(nn.Module):
"""Image to Patch Embedding with overlapping patches
"""

def __init__(self,
img_size=224,
patch_size=16,
stride_size=16,
in_chans=3,
embed_dim=768,
stem_conv=False):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
stride_size_tuple = to_2tuple(stride_size)
self.num_x = (img_size[1] - patch_size[1]) // stride_size_tuple[1] + 1
self.num_y = (img_size[0] - patch_size[0]) // stride_size_tuple[0] + 1
self.num_patches = self.num_x * self.num_y
self.img_size = img_size
self.patch_size = patch_size

self.stem_conv = stem_conv
if self.stem_conv:
hidden_dim = 64
stem_stride = 2
stride_size = patch_size = patch_size[0] // stem_stride
self.conv = nn.Sequential(
nn.Conv2d(
in_chans,
hidden_dim,
kernel_size=7,
stride=stem_stride,
padding=3,
bias=False),
IBN(hidden_dim),
nn.ReLU(inplace=True),
nn.Conv2d(
hidden_dim,
hidden_dim,
kernel_size=3,
stride=1,
padding=1,
bias=False),
IBN(hidden_dim),
nn.ReLU(inplace=True),
nn.Conv2d(
hidden_dim,
hidden_dim,
kernel_size=3,
stride=1,
padding=1,
bias=False),
nn.BatchNorm2d(hidden_dim),
nn.ReLU(inplace=True),
)
in_chans = hidden_dim

self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=stride_size)

def forward(self, x):
if self.stem_conv:
x = self.conv(x)
x = self.proj(x)
x = x.flatten(2).transpose(1, 2) # [64, 8, 768]

return x


class GeneralizedMeanPooling(nn.Module):
"""Applies a 2D power-average adaptive pooling over an input signal composed of several input planes.
The function computed is: :math:`f(X) = pow(sum(pow(X, p)), 1/p)`
- At p = infinity, one gets Max Pooling
- At p = 1, one gets Average Pooling
The output is of size H x W, for any input size.
The number of output features is equal to the number of input planes.
Args:
output_size: the target output size of the image of the form H x W.
Can be a tuple (H, W) or a single H for a square image H x H
H and W can be either a ``int``, or ``None`` which means the size will
be the same as that of the input.
"""

def __init__(self, norm=3, output_size=1, eps=1e-6):
super(GeneralizedMeanPooling, self).__init__()
assert norm > 0
self.p = float(norm)
self.output_size = output_size
self.eps = eps

def forward(self, x):
x = x.clamp(min=self.eps).pow(self.p)
return F.adaptive_avg_pool1d(x, self.output_size).pow(1. / self.p)


class Block(nn.Module):

def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)

def forward(self, x):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x


class Attention(nn.Module):

def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
self.scale = qk_scale or head_dim**-0.5

self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)

def forward(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[
2] # make torchscript happy (cannot use tensor as tuple)

attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)

x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x


class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""

def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob

def forward(self, x):
return drop_path(x, self.drop_prob, self.training)


class Mlp(nn.Module):

def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)

def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
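
A tiny numeric check of GeneralizedMeanPooling defined above: with norm=1 it reduces to plain average pooling, as its docstring states.

import torch

pool = GeneralizedMeanPooling(norm=1)
x = torch.tensor([[[1.0, 2.0, 3.0, 4.0]]])  # (batch, channels, length)
print(pool(x))  # tensor([[[2.5000]]]) -- the plain average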

+ 24
- 0
modelscope/models/cv/image_semantic_segmentation/__init__.py View File

@@ -0,0 +1,24 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .semantic_seg_model import SemanticSegmentation
from .segformer import Segformer

else:
_import_structure = {
'semantic_seg_model': ['SemanticSegmentation'],
'segformer': ['Segformer']
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 1
- 0
modelscope/models/cv/image_semantic_segmentation/pan_merge/__init__.py View File

@@ -0,0 +1 @@
from .maskformer_semantic_head import MaskFormerSemanticHead

+ 47
- 0
modelscope/models/cv/image_semantic_segmentation/pan_merge/base_panoptic_fusion_head.py View File

@@ -0,0 +1,47 @@
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta, abstractmethod

from mmcv.runner import BaseModule
from mmdet.models.builder import build_loss


class BasePanopticFusionHead(BaseModule, metaclass=ABCMeta):
"""Base class for panoptic heads."""

def __init__(self,
num_things_classes=80,
num_stuff_classes=53,
test_cfg=None,
loss_panoptic=None,
init_cfg=None,
**kwargs):
super(BasePanopticFusionHead, self).__init__(init_cfg)
self.num_things_classes = num_things_classes
self.num_stuff_classes = num_stuff_classes
self.num_classes = num_things_classes + num_stuff_classes
self.test_cfg = test_cfg

if loss_panoptic:
self.loss_panoptic = build_loss(loss_panoptic)
else:
self.loss_panoptic = None

@property
def with_loss(self):
"""bool: whether the panoptic head contains loss function."""
return self.loss_panoptic is not None

@abstractmethod
def forward_train(self, gt_masks=None, gt_semantic_seg=None, **kwargs):
"""Forward function during training."""

@abstractmethod
def simple_test(self,
img_metas,
det_labels,
mask_preds,
seg_preds,
det_bboxes,
cfg=None,
**kwargs):
"""Test without augmentation."""

+ 57
- 0
modelscope/models/cv/image_semantic_segmentation/pan_merge/maskformer_semantic_head.py View File

@@ -0,0 +1,57 @@
import torch
import torch.nn.functional as F
from mmdet.models.builder import HEADS

from .base_panoptic_fusion_head import BasePanopticFusionHead


@HEADS.register_module()
class MaskFormerSemanticHead(BasePanopticFusionHead):

def __init__(self,
num_things_classes=80,
num_stuff_classes=53,
test_cfg=None,
loss_panoptic=None,
init_cfg=None,
**kwargs):
super().__init__(num_things_classes, num_stuff_classes, test_cfg,
loss_panoptic, init_cfg, **kwargs)

def forward_train(self, **kwargs):
"""MaskFormerFusionHead has no training loss."""
return dict()

def simple_test(self,
mask_cls_results,
mask_pred_results,
img_metas,
rescale=False,
**kwargs):
results = []
for mask_cls_result, mask_pred_result, meta in zip(
mask_cls_results, mask_pred_results, img_metas):
# remove padding
img_height, img_width = meta['img_shape'][:2]
mask_pred_result = mask_pred_result[:, :img_height, :img_width]

if rescale:
# return result in original resolution
ori_height, ori_width = meta['ori_shape'][:2]
mask_pred_result = F.interpolate(
mask_pred_result[:, None],
size=(ori_height, ori_width),
mode='bilinear',
align_corners=False)[:, 0]

# semantic inference
cls_score = F.softmax(mask_cls_result, dim=-1)[..., :-1]
mask_pred = mask_pred_result.sigmoid()
seg_mask = torch.einsum('qc,qhw->chw', cls_score, mask_pred)
# still need softmax and argmax
seg_logit = F.softmax(seg_mask, dim=0)
seg_pred = seg_logit.argmax(dim=0)
seg_pred = seg_pred.cpu().numpy()
results.append(seg_pred)

return results
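
A shape-only sketch of the semantic-inference einsum in simple_test; the query, class and resolution numbers are illustrative.

import torch

Q, C, H, W = 100, 133, 64, 64      # queries, classes, height, width (illustrative)
cls_score = torch.rand(Q, C)       # per-query class probabilities
mask_pred = torch.rand(Q, H, W)    # per-query masks after sigmoid
seg_mask = torch.einsum('qc,qhw->chw', cls_score, mask_pred)  # (C, H, W)
seg_pred = seg_mask.softmax(dim=0).argmax(dim=0)              # (H, W) label map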

+ 16
- 0
modelscope/models/cv/image_semantic_segmentation/segformer.py View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.segmentation import EncoderDecoder

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks


@MODELS.register_module(
group_key=Tasks.image_segmentation, module_name=Models.segformer)
class Segformer(EasyCVBaseModel, EncoderDecoder):

def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
EncoderDecoder.__init__(self, *args, **kwargs)

+ 76
- 0
modelscope/models/cv/image_semantic_segmentation/semantic_seg_model.py View File

@@ -0,0 +1,76 @@
import os.path as osp

import numpy as np
import torch

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.image_semantic_segmentation import (pan_merge,
vit_adapter)
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModelFile, Tasks


@MODELS.register_module(
Tasks.image_segmentation, module_name=Models.swinL_semantic_segmentation)
@MODELS.register_module(
Tasks.image_segmentation,
module_name=Models.vitadapter_semantic_segmentation)
class SemanticSegmentation(TorchModel):

def __init__(self, model_dir: str, **kwargs):
"""str -- model file root."""
super().__init__(model_dir, **kwargs)

from mmcv.runner import load_checkpoint
import mmcv
from mmdet.models import build_detector

config = osp.join(model_dir, 'mmcv_config.py')
cfg = mmcv.Config.fromfile(config)
if 'pretrained' in cfg.model:
cfg.model.pretrained = None
elif 'init_cfg' in cfg.model.backbone:
cfg.model.backbone.init_cfg = None

# build model
cfg.model.train_cfg = None
self.model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))

# load model
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
_ = load_checkpoint(self.model, model_path, map_location='cpu')

self.CLASSES = cfg['CLASSES'] # list
self.PALETTE = cfg['PALETTE'] # list

self.num_classes = len(self.CLASSES)
self.cfg = cfg

def forward(self, Inputs):
return self.model(**Inputs)

def postprocess(self, Inputs):
semantic_result = Inputs[0]

ids = np.unique(semantic_result)[::-1]
legal_indices = ids != self.model.num_classes # for VOID label
ids = ids[legal_indices]

segms = (semantic_result[None] == ids[:, None, None])
masks = [it.astype(int) for it in segms]
labels_txt = np.array(self.CLASSES)[ids].tolist()

results = {
OutputKeys.MASKS: masks,
OutputKeys.LABELS: labels_txt,
OutputKeys.SCORES: [0.999 for _ in range(len(labels_txt))]
}
return results

def inference(self, data):
with torch.no_grad():
results = self.model(return_loss=False, rescale=True, **data)

return results
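
A toy walk-through of the VOID-label filtering in postprocess; the 2x2 label map and class count are made up for illustration.

import numpy as np

num_classes = 3
semantic_result = np.array([[0, 1],
                            [1, 3]])        # 3 plays the role of the VOID label here
ids = np.unique(semantic_result)[::-1]      # array([3, 1, 0])
ids = ids[ids != num_classes]               # array([1, 0]) -> VOID dropped
masks = [(semantic_result == i).astype(int) for i in ids]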

+ 3
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/__init__.py View File

@@ -0,0 +1,3 @@
from .models import backbone, decode_heads, segmentors
from .utils import (ResizeToMultiple, add_prefix, build_pixel_sampler,
seg_resize)

+ 3
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/__init__.py View File

@@ -0,0 +1,3 @@
from .backbone import BASEBEiT, BEiTAdapter
from .decode_heads import Mask2FormerHeadFromMMSeg
from .segmentors import EncoderDecoderMask2Former

+ 4
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/__init__.py View File

@@ -0,0 +1,4 @@
from .base import BASEBEiT
from .beit_adapter import BEiTAdapter

__all__ = ['BEiTAdapter', 'BASEBEiT']

+ 523
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/adapter_modules.py View File

@@ -0,0 +1,523 @@
# The implementation refers to ViT-Adapter,
# available at
# https://github.com/czczup/ViT-Adapter.git

import logging
from functools import partial

import torch
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmdet.models.utils.transformer import MultiScaleDeformableAttention
from timm.models.layers import DropPath

_logger = logging.getLogger(__name__)


def get_reference_points(spatial_shapes, device):
reference_points_list = []
for lvl, (H_, W_) in enumerate(spatial_shapes):
ref_y, ref_x = torch.meshgrid(
torch.linspace(
0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
torch.linspace(
0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
ref_y = ref_y.reshape(-1)[None] / H_
ref_x = ref_x.reshape(-1)[None] / W_
ref = torch.stack((ref_x, ref_y), -1)
reference_points_list.append(ref)
reference_points = torch.cat(reference_points_list, 1)
reference_points = reference_points[:, :, None]
return reference_points


def deform_inputs(x):
bs, c, h, w = x.shape
spatial_shapes = torch.as_tensor([(h // 8, w // 8), (h // 16, w // 16),
(h // 32, w // 32)],
dtype=torch.long,
device=x.device)
level_start_index = torch.cat((spatial_shapes.new_zeros(
(1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
reference_points = get_reference_points([(h // 16, w // 16)], x.device)
deform_inputs1 = [reference_points, spatial_shapes, level_start_index]

spatial_shapes = torch.as_tensor([(h // 16, w // 16)],
dtype=torch.long,
device=x.device)
level_start_index = torch.cat((spatial_shapes.new_zeros(
(1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
reference_points = get_reference_points([(h // 8, w // 8),
(h // 16, w // 16),
(h // 32, w // 32)], x.device)
deform_inputs2 = [reference_points, spatial_shapes, level_start_index]

return deform_inputs1, deform_inputs2


class ConvFFN(nn.Module):

def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.dwconv = DWConv(hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)

def forward(self, x, H, W):
x = self.fc1(x)
x = self.dwconv(x, H, W)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x


class DWConv(nn.Module):

def __init__(self, dim=768):
super().__init__()
self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

def forward(self, x, H, W):
B, N, C = x.shape
n = N // 21
x1 = x[:, 0:16 * n, :].transpose(1, 2).view(B, C, H * 2,
W * 2).contiguous()
x2 = x[:, 16 * n:20 * n, :].transpose(1, 2).view(B, C, H,
W).contiguous()
x3 = x[:, 20 * n:, :].transpose(1, 2).view(B, C, H // 2,
W // 2).contiguous()
x1 = self.dwconv(x1).flatten(2).transpose(1, 2)
x2 = self.dwconv(x2).flatten(2).transpose(1, 2)
x3 = self.dwconv(x3).flatten(2).transpose(1, 2)
x = torch.cat([x1, x2, x3], dim=1)
return x


class Extractor(nn.Module):

def __init__(self,
dim,
num_heads=6,
n_points=4,
n_levels=1,
deform_ratio=1.0,
with_cffn=True,
cffn_ratio=0.25,
drop=0.,
drop_path=0.,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
with_cp=False):
super().__init__()
self.query_norm = norm_layer(dim)
self.feat_norm = norm_layer(dim)
self.attn = MultiScaleDeformableAttention(
embed_dims=dim,
num_heads=num_heads,
num_levels=n_levels,
num_points=n_points,
batch_first=True)

# modify to fit the deform_ratio
value_proj_in_features = self.attn.value_proj.weight.shape[0]
value_proj_out_features = int(value_proj_in_features * deform_ratio)
self.attn.value_proj = nn.Linear(value_proj_in_features,
value_proj_out_features)
self.attn.output_proj = nn.Linear(value_proj_out_features,
value_proj_in_features)

self.with_cffn = with_cffn
self.with_cp = with_cp
if with_cffn:
self.ffn = ConvFFN(
in_features=dim,
hidden_features=int(dim * cffn_ratio),
drop=drop)
self.ffn_norm = norm_layer(dim)
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()

def forward(self, query, reference_points, feat, spatial_shapes,
level_start_index, H, W):

def _inner_forward(query, feat):
attn = self.attn(
query=self.query_norm(query),
key=None,
value=self.feat_norm(feat),
identity=None,
query_pos=None,
key_padding_mask=None,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index)

query = query + attn

if self.with_cffn:
query = query + self.drop_path(
self.ffn(self.ffn_norm(query), H, W))
return query

if self.with_cp and query.requires_grad:
query = cp.checkpoint(_inner_forward, query, feat)
else:
query = _inner_forward(query, feat)

return query


class Injector(nn.Module):

def __init__(self,
dim,
num_heads=6,
n_points=4,
n_levels=1,
deform_ratio=1.0,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
init_values=0.,
with_cp=False):
super().__init__()
self.with_cp = with_cp
self.query_norm = norm_layer(dim)
self.feat_norm = norm_layer(dim)
self.attn = MultiScaleDeformableAttention(
embed_dims=dim,
num_heads=num_heads,
num_levels=n_levels,
num_points=n_points,
batch_first=True)

# modify to fit the deform_ratio
value_proj_in_features = self.attn.value_proj.weight.shape[0]
value_proj_out_features = int(value_proj_in_features * deform_ratio)
self.attn.value_proj = nn.Linear(value_proj_in_features,
value_proj_out_features)
self.attn.output_proj = nn.Linear(value_proj_out_features,
value_proj_in_features)

self.gamma = nn.Parameter(
init_values * torch.ones((dim)), requires_grad=True)

def forward(self, query, reference_points, feat, spatial_shapes,
level_start_index):

def _inner_forward(query, feat):
input_query = self.query_norm(query)
input_value = self.feat_norm(feat)
attn = self.attn(
query=input_query,
key=None,
value=input_value,
identity=None,
query_pos=None,
key_padding_mask=None,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index)
return query + self.gamma * attn

if self.with_cp and query.requires_grad:
query = cp.checkpoint(_inner_forward, query, feat)
else:
query = _inner_forward(query, feat)

return query


class InteractionBlock(nn.Module):

def __init__(self,
dim,
num_heads=6,
n_points=4,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
drop=0.,
drop_path=0.,
with_cffn=True,
cffn_ratio=0.25,
init_values=0.,
deform_ratio=1.0,
extra_extractor=False,
with_cp=False):
super().__init__()

self.injector = Injector(
dim=dim,
n_levels=3,
num_heads=num_heads,
init_values=init_values,
n_points=n_points,
norm_layer=norm_layer,
deform_ratio=deform_ratio,
with_cp=with_cp)
self.extractor = Extractor(
dim=dim,
n_levels=1,
num_heads=num_heads,
n_points=n_points,
norm_layer=norm_layer,
deform_ratio=deform_ratio,
with_cffn=with_cffn,
cffn_ratio=cffn_ratio,
drop=drop,
drop_path=drop_path,
with_cp=with_cp)
if extra_extractor:
self.extra_extractors = nn.Sequential(*[
Extractor(
dim=dim,
num_heads=num_heads,
n_points=n_points,
norm_layer=norm_layer,
with_cffn=with_cffn,
cffn_ratio=cffn_ratio,
deform_ratio=deform_ratio,
drop=drop,
drop_path=drop_path,
with_cp=with_cp) for _ in range(2)
])
else:
self.extra_extractors = None

def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W):
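        # Dataflow: the Injector first fuses the multi-scale convolutional
        # prior `c` into the ViT tokens `x`; the wrapped ViT blocks then refine
        # `x`; finally the Extractor(s) update `c` by cross-attending from `c`
        # back to the refined `x`.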
x = self.injector(
query=x,
reference_points=deform_inputs1[0],
feat=c,
spatial_shapes=deform_inputs1[1],
level_start_index=deform_inputs1[2])
for idx, blk in enumerate(blocks):
x = blk(x, H, W)
c = self.extractor(
query=c,
reference_points=deform_inputs2[0],
feat=x,
spatial_shapes=deform_inputs2[1],
level_start_index=deform_inputs2[2],
H=H,
W=W)
if self.extra_extractors is not None:
for extractor in self.extra_extractors:
c = extractor(
query=c,
reference_points=deform_inputs2[0],
feat=x,
spatial_shapes=deform_inputs2[1],
level_start_index=deform_inputs2[2],
H=H,
W=W)
return x, c


class InteractionBlockWithCls(nn.Module):

def __init__(self,
dim,
num_heads=6,
n_points=4,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
drop=0.,
drop_path=0.,
with_cffn=True,
cffn_ratio=0.25,
init_values=0.,
deform_ratio=1.0,
extra_extractor=False,
with_cp=False):
super().__init__()

self.injector = Injector(
dim=dim,
n_levels=3,
num_heads=num_heads,
init_values=init_values,
n_points=n_points,
norm_layer=norm_layer,
deform_ratio=deform_ratio,
with_cp=with_cp)
self.extractor = Extractor(
dim=dim,
n_levels=1,
num_heads=num_heads,
n_points=n_points,
norm_layer=norm_layer,
deform_ratio=deform_ratio,
with_cffn=with_cffn,
cffn_ratio=cffn_ratio,
drop=drop,
drop_path=drop_path,
with_cp=with_cp)
if extra_extractor:
self.extra_extractors = nn.Sequential(*[
Extractor(
dim=dim,
num_heads=num_heads,
n_points=n_points,
norm_layer=norm_layer,
with_cffn=with_cffn,
cffn_ratio=cffn_ratio,
deform_ratio=deform_ratio,
drop=drop,
drop_path=drop_path,
with_cp=with_cp) for _ in range(2)
])
else:
self.extra_extractors = None

def forward(self, x, c, cls, blocks, deform_inputs1, deform_inputs2, H, W):
x = self.injector(
query=x,
reference_points=deform_inputs1[0],
feat=c,
spatial_shapes=deform_inputs1[1],
level_start_index=deform_inputs1[2])
x = torch.cat((cls, x), dim=1)
for idx, blk in enumerate(blocks):
x = blk(x, H, W)
        cls, x = x[:, :1], x[:, 1:]
c = self.extractor(
query=c,
reference_points=deform_inputs2[0],
feat=x,
spatial_shapes=deform_inputs2[1],
level_start_index=deform_inputs2[2],
H=H,
W=W)
if self.extra_extractors is not None:
for extractor in self.extra_extractors:
c = extractor(
query=c,
reference_points=deform_inputs2[0],
feat=x,
spatial_shapes=deform_inputs2[1],
level_start_index=deform_inputs2[2],
H=H,
W=W)
return x, c, cls


class SpatialPriorModule(nn.Module):

def __init__(self, inplanes=64, embed_dim=384, with_cp=False):
super().__init__()
self.with_cp = with_cp

self.stem = nn.Sequential(*[
nn.Conv2d(
3, inplanes, kernel_size=3, stride=2, padding=1, bias=False),
nn.SyncBatchNorm(inplanes),
nn.ReLU(inplace=True),
nn.Conv2d(
inplanes,
inplanes,
kernel_size=3,
stride=1,
padding=1,
bias=False),
nn.SyncBatchNorm(inplanes),
nn.ReLU(inplace=True),
nn.Conv2d(
inplanes,
inplanes,
kernel_size=3,
stride=1,
padding=1,
bias=False),
nn.SyncBatchNorm(inplanes),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
])
self.conv2 = nn.Sequential(*[
nn.Conv2d(
inplanes,
2 * inplanes,
kernel_size=3,
stride=2,
padding=1,
bias=False),
nn.SyncBatchNorm(2 * inplanes),
nn.ReLU(inplace=True)
])
self.conv3 = nn.Sequential(*[
nn.Conv2d(
2 * inplanes,
4 * inplanes,
kernel_size=3,
stride=2,
padding=1,
bias=False),
nn.SyncBatchNorm(4 * inplanes),
nn.ReLU(inplace=True)
])
self.conv4 = nn.Sequential(*[
nn.Conv2d(
4 * inplanes,
4 * inplanes,
kernel_size=3,
stride=2,
padding=1,
bias=False),
nn.SyncBatchNorm(4 * inplanes),
nn.ReLU(inplace=True)
])
self.fc1 = nn.Conv2d(
inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True)
self.fc2 = nn.Conv2d(
2 * inplanes,
embed_dim,
kernel_size=1,
stride=1,
padding=0,
bias=True)
self.fc3 = nn.Conv2d(
4 * inplanes,
embed_dim,
kernel_size=1,
stride=1,
padding=0,
bias=True)
self.fc4 = nn.Conv2d(
4 * inplanes,
embed_dim,
kernel_size=1,
stride=1,
padding=0,
bias=True)

def forward(self, x):

def _inner_forward(x):
c1 = self.stem(x)
c2 = self.conv2(c1)
c3 = self.conv3(c2)
c4 = self.conv4(c3)
c1 = self.fc1(c1)
c2 = self.fc2(c2)
c3 = self.fc3(c3)
c4 = self.fc4(c4)

bs, dim, _, _ = c1.shape

c2 = c2.view(bs, dim, -1).transpose(1, 2) # 8s
c3 = c3.view(bs, dim, -1).transpose(1, 2) # 16s
c4 = c4.view(bs, dim, -1).transpose(1, 2) # 32s
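            # e.g. for a 512x512 input with embed_dim=384: c1 stays a stride-4
            # map of shape (B, 384, 128, 128), while c2/c3/c4 are flattened to
            # (B, 4096, 384), (B, 1024, 384) and (B, 256, 384) respectively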

return c1, c2, c3, c4

if self.with_cp and x.requires_grad:
outs = cp.checkpoint(_inner_forward, x)
else:
outs = _inner_forward(x)
return outs

+ 3
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/__init__.py

@@ -0,0 +1,3 @@
from .beit import BASEBEiT

__all__ = ['BASEBEiT']

+ 476
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/base/beit.py

@@ -0,0 +1,476 @@
# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)
# Github source: https://github.com/microsoft/unilm/tree/master/beit
# This implementation refers to
# https://github.com/czczup/ViT-Adapter.git
import math
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from mmcv.runner import _load_checkpoint
from mmdet.models.builder import BACKBONES
from mmdet.utils import get_root_logger
from timm.models.layers import drop_path, to_2tuple, trunc_normal_


class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of
residual blocks)."""

def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob

def forward(self, x):
return drop_path(x, self.drop_prob, self.training)

def extra_repr(self) -> str:
return 'p={}'.format(self.drop_prob)


class Mlp(nn.Module):

def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)

def forward(self, x):
x = self.fc1(x)
x = self.act(x)
        # dropout after fc1 is intentionally omitted, matching the original BERT implementation
x = self.fc2(x)
x = self.drop(x)
return x


class Attention(nn.Module):

def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.,
window_size=None,
attn_head_dim=None):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
if attn_head_dim is not None:
head_dim = attn_head_dim
all_head_dim = head_dim * self.num_heads
        # NOTE: the scale factor was wrong in an earlier version; it can be set manually to stay compatible with previous weights
self.scale = qk_scale or head_dim**-0.5

self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
if qkv_bias:
self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
else:
self.q_bias = None
self.v_bias = None

if window_size:
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0]
- 1) * (2 * window_size[1] - 1) + 3
self.relative_position_bias_table = nn.Parameter(
torch.zeros(self.num_relative_distance,
num_heads)) # 2*Wh-1 * 2*Ww-1, nH
            # plus 3 extra entries: cls-to-token, token-to-cls and cls-to-cls

# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(window_size[0])
coords_w = torch.arange(window_size[1])
coords = torch.stack(torch.meshgrid([coords_h,
coords_w])) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :,
None] - coords_flatten[:,
None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(
1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :,
0] += window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = \
torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
relative_position_index[1:, 1:] = relative_coords.sum(
-1) # Wh*Ww, Wh*Ww
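            # index 0 is the cls token: its row/column are routed to the three
            # extra bias entries (cls->token, token->cls, cls->cls) at the end
            # of the table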
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer('relative_position_index',
relative_position_index)

else:
self.window_size = None
self.relative_position_bias_table = None
self.relative_position_index = None

self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(all_head_dim, dim)
self.proj_drop = nn.Dropout(proj_drop)

def forward(self, x, rel_pos_bias=None):
B, N, C = x.shape
qkv_bias = None
if self.q_bias is not None:
qkv_bias = torch.cat(
(self.q_bias,
torch.zeros_like(self.v_bias,
requires_grad=False), self.v_bias))

qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[
2] # make torchscript happy (cannot use tensor as tuple)

q = q * self.scale
attn = (q @ k.transpose(-2, -1))

if self.relative_position_bias_table is not None:
relative_position_bias = \
self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1] + 1,
self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.permute(
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww

attn = attn + relative_position_bias.unsqueeze(0)

if rel_pos_bias is not None:
attn = attn + rel_pos_bias

attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)

x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
x = self.proj(x)
x = self.proj_drop(x)
return x


class Block(nn.Module):

def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
init_values=None,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm,
window_size=None,
attn_head_dim=None,
with_cp=False):
super().__init__()
self.with_cp = with_cp
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop,
window_size=window_size,
attn_head_dim=attn_head_dim)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)

if init_values is not None:
self.gamma_1 = nn.Parameter(
init_values * torch.ones((dim)), requires_grad=True)
self.gamma_2 = nn.Parameter(
init_values * torch.ones((dim)), requires_grad=True)
else:
self.gamma_1, self.gamma_2 = None, None

def forward(self, x, H, W, rel_pos_bias=None):

def _inner_forward(x):
if self.gamma_1 is None:
x = x + self.drop_path(
self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
x = x + self.drop_path(self.mlp(self.norm2(x)))
else:
x = x + self.drop_path(self.gamma_1 * self.attn(
self.norm1(x), rel_pos_bias=rel_pos_bias))
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x

if self.with_cp and x.requires_grad:
x = cp.checkpoint(_inner_forward, x)
else:
x = _inner_forward(x)
return x


class PatchEmbed(nn.Module):
""" Image to Patch Embedding
"""

def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
num_patches = (img_size[1] // patch_size[1]) * (
img_size[0] // patch_size[0])
self.patch_shape = (img_size[0] // patch_size[0],
img_size[1] // patch_size[1])
self.img_size = img_size
self.patch_size = patch_size
self.num_patches = num_patches

self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

def forward(self, x, **kwargs):
B, C, H, W = x.shape
# FIXME look at relaxing size constraints
# assert H == self.img_size[0] and W == self.img_size[1], \
# f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x)
Hp, Wp = x.shape[2], x.shape[3]

x = x.flatten(2).transpose(1, 2)
return x, Hp, Wp


class HybridEmbed(nn.Module):
""" CNN Feature Map Embedding
Extract feature map from CNN, flatten, project to embedding dim.
"""

def __init__(self,
backbone,
img_size=224,
feature_size=None,
in_chans=3,
embed_dim=768):
super().__init__()
assert isinstance(backbone, nn.Module)
img_size = to_2tuple(img_size)
self.img_size = img_size
self.backbone = backbone
if feature_size is None:
with torch.no_grad():
# FIXME this is hacky, but most reliable way of determining the exact dim of the output feature
# map for all networks, the feature metadata has reliable channel and stride info, but using
# stride to calc feature dim requires info about padding of each stage that isn't captured.
training = backbone.training
if training:
backbone.eval()
o = self.backbone(
torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
feature_size = o.shape[-2:]
feature_dim = o.shape[1]
backbone.train(training)
else:
feature_size = to_2tuple(feature_size)
feature_dim = self.backbone.feature_info.channels()[-1]
self.num_patches = feature_size[0] * feature_size[1]
self.proj = nn.Linear(feature_dim, embed_dim)

def forward(self, x):
x = self.backbone(x)[-1]
x = x.flatten(2).transpose(1, 2)
x = self.proj(x)
return x


class RelativePositionBias(nn.Module):

def __init__(self, window_size, num_heads):
super().__init__()
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0]
- 1) * (2 * window_size[1] - 1) + 3
self.relative_position_bias_table = nn.Parameter(
torch.zeros(self.num_relative_distance,
num_heads)) # 2*Wh-1 * 2*Ww-1, nH
        # plus 3 extra entries: cls-to-token, token-to-cls and cls-to-cls

# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(window_size[0])
coords_w = torch.arange(window_size[1])
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :,
None] - coords_flatten[:,
None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(
1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = \
torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
relative_position_index[1:,
1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1

self.register_buffer('relative_position_index',
relative_position_index)

def forward(self):
relative_position_bias = \
self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1] + 1,
self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
return relative_position_bias.permute(
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww


@BACKBONES.register_module()
class BASEBEiT(nn.Module):
""" Vision Transformer with support for patch or hybrid CNN input stage
"""

def __init__(self,
img_size=512,
patch_size=16,
in_chans=3,
num_classes=80,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
hybrid_backbone=None,
norm_layer=None,
init_values=None,
use_checkpoint=False,
use_abs_pos_emb=False,
use_rel_pos_bias=True,
use_shared_rel_pos_bias=False,
pretrained=None,
with_cp=False):
super().__init__()
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
self.norm_layer = norm_layer
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
self.drop_path_rate = drop_path_rate
if hybrid_backbone is not None:
self.patch_embed = HybridEmbed(
hybrid_backbone,
img_size=img_size,
in_chans=in_chans,
embed_dim=embed_dim)
else:
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
num_patches = self.patch_embed.num_patches

self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
if use_abs_pos_emb:
self.pos_embed = nn.Parameter(
torch.zeros(1, num_patches + 1, embed_dim))
else:
self.pos_embed = None
self.pos_drop = nn.Dropout(p=drop_rate)

if use_shared_rel_pos_bias:
self.rel_pos_bias = RelativePositionBias(
window_size=self.patch_embed.patch_shape, num_heads=num_heads)
else:
self.rel_pos_bias = None

dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule
self.use_rel_pos_bias = use_rel_pos_bias
self.use_checkpoint = use_checkpoint
self.blocks = nn.ModuleList([
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
with_cp=with_cp,
init_values=init_values,
window_size=self.patch_embed.patch_shape
if use_rel_pos_bias else None) for i in range(depth)
])

trunc_normal_(self.cls_token, std=.02)
self.apply(self._init_weights)
self.init_weights(pretrained)

def init_weights(self, pretrained=None):
"""Initialize the weights in backbone.

Args:
pretrained (str, optional): Path to pre-trained weights.
Defaults to None.
"""
if isinstance(pretrained, str):
logger = get_root_logger()
init_cfg = dict(type='Pretrained', checkpoint=pretrained)

checkpoint = _load_checkpoint(
init_cfg['checkpoint'], logger=logger, map_location='cpu')
state_dict = self.resize_rel_pos_embed(checkpoint)
self.load_state_dict(state_dict, False)

def fix_init_weight(self):

def rescale(param, layer_id):
param.div_(math.sqrt(2.0 * layer_id))

for layer_id, layer in enumerate(self.blocks):
rescale(layer.attn.proj.weight.data, layer_id + 1)
rescale(layer.mlp.fc2.weight.data, layer_id + 1)

def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)

def get_num_layers(self):
return len(self.blocks)

+ 169
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/backbone/beit_adapter.py

@@ -0,0 +1,169 @@
# The implementation refers to ViT-Adapter,
# available at
# https://github.com/czczup/ViT-Adapter.git
import logging
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.models.builder import BACKBONES
from mmdet.models.utils.transformer import MultiScaleDeformableAttention
from timm.models.layers import DropPath, trunc_normal_
from torch.nn.init import normal_

from .adapter_modules import InteractionBlockWithCls as InteractionBlock
from .adapter_modules import SpatialPriorModule, deform_inputs
from .base.beit import BASEBEiT

_logger = logging.getLogger(__name__)


@BACKBONES.register_module()
class BEiTAdapter(BASEBEiT):

def __init__(self,
pretrain_size=224,
conv_inplane=64,
n_points=4,
deform_num_heads=6,
init_values=0.,
cffn_ratio=0.25,
deform_ratio=1.0,
with_cffn=True,
interaction_indexes=None,
add_vit_feature=True,
with_cp=False,
*args,
**kwargs):

super().__init__(
init_values=init_values, with_cp=with_cp, *args, **kwargs)

self.num_block = len(self.blocks)
self.pretrain_size = (pretrain_size, pretrain_size)
self.flags = [
i for i in range(-1, self.num_block, self.num_block // 4)
][1:]
self.interaction_indexes = interaction_indexes
self.add_vit_feature = add_vit_feature
embed_dim = self.embed_dim

self.level_embed = nn.Parameter(torch.zeros(3, embed_dim))
self.spm = SpatialPriorModule(
inplanes=conv_inplane, embed_dim=embed_dim, with_cp=False)
self.interactions = nn.Sequential(*[
InteractionBlock(
dim=embed_dim,
num_heads=deform_num_heads,
n_points=n_points,
init_values=init_values,
drop_path=self.drop_path_rate,
norm_layer=self.norm_layer,
with_cffn=with_cffn,
cffn_ratio=cffn_ratio,
deform_ratio=deform_ratio,
extra_extractor=True if i == len(interaction_indexes)
- 1 else False,
with_cp=with_cp) for i in range(len(interaction_indexes))
])

self.up = nn.ConvTranspose2d(embed_dim, embed_dim, 2, 2)
self.norm1 = nn.SyncBatchNorm(embed_dim)
self.norm2 = nn.SyncBatchNorm(embed_dim)
self.norm3 = nn.SyncBatchNorm(embed_dim)
self.norm4 = nn.SyncBatchNorm(embed_dim)

self.up.apply(self._init_weights)
self.spm.apply(self._init_weights)
self.interactions.apply(self._init_weights)
self.apply(self._init_deform_weights)
normal_(self.level_embed)

def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm) or isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
fan_out //= m.groups
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if m.bias is not None:
m.bias.data.zero_()

def _get_pos_embed(self, pos_embed, H, W):
pos_embed = pos_embed.reshape(1, self.pretrain_size[0] // 16,
self.pretrain_size[1] // 16,
-1).permute(0, 3, 1, 2)
pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
reshape(1, -1, H * W).permute(0, 2, 1)
return pos_embed

def _init_deform_weights(self, m):
if isinstance(m, MultiScaleDeformableAttention):
m.init_weights()

def _add_level_embed(self, c2, c3, c4):
c2 = c2 + self.level_embed[0]
c3 = c3 + self.level_embed[1]
c4 = c4 + self.level_embed[2]
return c2, c3, c4

def forward(self, x):
deform_inputs1, deform_inputs2 = deform_inputs(x)

# SPM forward
c1, c2, c3, c4 = self.spm(x)
c2, c3, c4 = self._add_level_embed(c2, c3, c4)
c = torch.cat([c2, c3, c4], dim=1)

# Patch Embedding forward
x, H, W = self.patch_embed(x)
bs, n, dim = x.shape
cls = self.cls_token.expand(
bs, -1, -1) # stole cls_tokens impl from Phil Wang, thanks

if self.pos_embed is not None:
pos_embed = self._get_pos_embed(self.pos_embed, H, W)
x = x + pos_embed
x = self.pos_drop(x)

# Interaction
outs = list()
for i, layer in enumerate(self.interactions):
indexes = self.interaction_indexes[i]
x, c, cls = layer(x, c, cls,
self.blocks[indexes[0]:indexes[-1] + 1],
deform_inputs1, deform_inputs2, H, W)
outs.append(x.transpose(1, 2).view(bs, dim, H, W).contiguous())

# Split & Reshape
c2 = c[:, 0:c2.size(1), :]
c3 = c[:, c2.size(1):c2.size(1) + c3.size(1), :]
c4 = c[:, c2.size(1) + c3.size(1):, :]

c2 = c2.transpose(1, 2).view(bs, dim, H * 2, W * 2).contiguous()
c3 = c3.transpose(1, 2).view(bs, dim, H, W).contiguous()
c4 = c4.transpose(1, 2).view(bs, dim, H // 2, W // 2).contiguous()
c1 = self.up(c2) + c1
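        # at this point c1/c2/c3/c4 form a stride-4/8/16/32 pyramid of width
        # `dim`: c2/c3/c4 come from the adapter branch (2H x 2W, H x W and
        # H/2 x W/2 w.r.t. the stride-16 patch grid), and c1 is the transposed-
        # conv upsampling of c2 added to the SPM stem feature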

if self.add_vit_feature:
x1, x2, x3, x4 = outs
x1 = F.interpolate(
x1, scale_factor=4, mode='bilinear', align_corners=False)
x2 = F.interpolate(
x2, scale_factor=2, mode='bilinear', align_corners=False)
x4 = F.interpolate(
x4, scale_factor=0.5, mode='bilinear', align_corners=False)
c1, c2, c3, c4 = c1 + x1, c2 + x2, c3 + x3, c4 + x4

# Final Norm
f1 = self.norm1(c1)
f2 = self.norm2(c2)
f3 = self.norm3(c3)
f4 = self.norm4(c4)
return [f1, f2, f3, f4]
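

# A minimal, illustrative backbone config sketch for this adapter (mmdet-style;
# the values below are placeholders, not tuned settings):
#
#     backbone = dict(
#         type='BEiTAdapter',
#         img_size=512, patch_size=16, embed_dim=768, depth=12, num_heads=12,
#         use_rel_pos_bias=True, init_values=1e-6, drop_path_rate=0.2,
#         conv_inplane=64, n_points=4, deform_num_heads=6,
#         cffn_ratio=0.25, deform_ratio=0.5,
#         interaction_indexes=[[0, 2], [3, 5], [6, 8], [9, 11]])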

+ 3
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/__init__.py

@@ -0,0 +1,3 @@
from .mask2former_head_from_mmseg import Mask2FormerHeadFromMMSeg

__all__ = ['Mask2FormerHeadFromMMSeg']

+ 267
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/base_decode_head.py

@@ -0,0 +1,267 @@
# The implementation refers to ViT-Adapter,
# available at
# https://github.com/czczup/ViT-Adapter.git
from abc import ABCMeta, abstractmethod

import torch
import torch.nn as nn
from mmcv.runner import BaseModule, auto_fp16, force_fp32
from mmdet.models.builder import build_loss
from mmdet.models.losses import accuracy

from ...utils import build_pixel_sampler, seg_resize


class BaseDecodeHead(BaseModule, metaclass=ABCMeta):
"""Base class for BaseDecodeHead.

Args:
in_channels (int|Sequence[int]): Input channels.
channels (int): Channels after modules, before conv_seg.
num_classes (int): Number of classes.
dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
conv_cfg (dict|None): Config of conv layers. Default: None.
norm_cfg (dict|None): Config of norm layers. Default: None.
act_cfg (dict): Config of activation layers.
Default: dict(type='ReLU')
in_index (int|Sequence[int]): Input feature index. Default: -1
input_transform (str|None): Transformation type of input features.
Options: 'resize_concat', 'multiple_select', None.
            'resize_concat': Multiple feature maps will be resized to the
                same size as the first one and then concatenated together.
                Usually used in the FCN head of HRNet.
            'multiple_select': Multiple feature maps will be bundled into
                a list and passed to the decode head.
None: Only one select feature map is allowed.
Default: None.
loss_decode (dict | Sequence[dict]): Config of decode loss.
            The `loss_name` is a property of the corresponding loss function
            which is shown in the training log. If you want this loss
            item to be included in the backward graph, `loss_` must be the
            prefix of its name. Defaults to 'loss_ce'.
e.g. dict(type='CrossEntropyLoss'),
[dict(type='CrossEntropyLoss', loss_name='loss_ce'),
dict(type='DiceLoss', loss_name='loss_dice')]
Default: dict(type='CrossEntropyLoss').
ignore_index (int | None): The label index to be ignored. When using
masked BCE loss, ignore_index should be set to None. Default: 255.
sampler (dict|None): The config of segmentation map sampler.
Default: None.
align_corners (bool): align_corners argument of F.interpolate.
Default: False.
init_cfg (dict or list[dict], optional): Initialization config dict.
"""

def __init__(self,
in_channels,
channels,
*,
num_classes,
dropout_ratio=0.1,
conv_cfg=None,
norm_cfg=None,
act_cfg=dict(type='ReLU'),
in_index=-1,
input_transform=None,
loss_decode=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
ignore_index=255,
sampler=None,
align_corners=False,
init_cfg=dict(
type='Normal', std=0.01, override=dict(name='conv_seg'))):
super(BaseDecodeHead, self).__init__(init_cfg)
self._init_inputs(in_channels, in_index, input_transform)
self.channels = channels
self.num_classes = num_classes
self.dropout_ratio = dropout_ratio
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg
self.in_index = in_index

self.ignore_index = ignore_index
self.align_corners = align_corners

if isinstance(loss_decode, dict):
self.loss_decode = build_loss(loss_decode)
elif isinstance(loss_decode, (list, tuple)):
self.loss_decode = nn.ModuleList()
for loss in loss_decode:
self.loss_decode.append(build_loss(loss))
else:
raise TypeError(f'loss_decode must be a dict or sequence of dict,\
but got {type(loss_decode)}')

if sampler is not None:
self.sampler = build_pixel_sampler(sampler, context=self)
else:
self.sampler = None

self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
if dropout_ratio > 0:
self.dropout = nn.Dropout2d(dropout_ratio)
else:
self.dropout = None
self.fp16_enabled = False

def extra_repr(self):
"""Extra repr."""
s = f'input_transform={self.input_transform}, ' \
f'ignore_index={self.ignore_index}, ' \
f'align_corners={self.align_corners}'
return s

def _init_inputs(self, in_channels, in_index, input_transform):
"""Check and initialize input transforms.

The in_channels, in_index and input_transform must match.
        Specifically, when input_transform is None, only a single feature map
        will be selected, so in_channels and in_index must be of type int.
        When input_transform is not None, in_channels and in_index must be a
        list or tuple of the same length.

Args:
in_channels (int|Sequence[int]): Input channels.
in_index (int|Sequence[int]): Input feature index.
input_transform (str|None): Transformation type of input features.
Options: 'resize_concat', 'multiple_select', None.
                'resize_concat': Multiple feature maps will be resized to the
                    same size as the first one and then concatenated together.
                    Usually used in the FCN head of HRNet.
                'multiple_select': Multiple feature maps will be bundled into
                    a list and passed to the decode head.
None: Only one select feature map is allowed.
"""

if input_transform is not None:
assert input_transform in ['resize_concat', 'multiple_select']
self.input_transform = input_transform
self.in_index = in_index
if input_transform is not None:
assert isinstance(in_channels, (list, tuple))
assert isinstance(in_index, (list, tuple))
assert len(in_channels) == len(in_index)
if input_transform == 'resize_concat':
self.in_channels = sum(in_channels)
else:
self.in_channels = in_channels
else:
assert isinstance(in_channels, int)
assert isinstance(in_index, int)
self.in_channels = in_channels

def _transform_inputs(self, inputs):
"""Transform inputs for decoder.

Args:
inputs (list[Tensor]): List of multi-level img features.

Returns:
Tensor: The transformed inputs
"""

if self.input_transform == 'resize_concat':
inputs = [inputs[i] for i in self.in_index]
upsampled_inputs = [
seg_resize(
input=x,
size=inputs[0].shape[2:],
mode='bilinear',
align_corners=self.align_corners) for x in inputs
]
inputs = torch.cat(upsampled_inputs, dim=1)
elif self.input_transform == 'multiple_select':
inputs = [inputs[i] for i in self.in_index]
else:
inputs = inputs[self.in_index]

return inputs

@auto_fp16()
@abstractmethod
def forward(self, inputs):
"""Placeholder of forward function."""
pass

def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg):
"""Forward function for training.
Args:
inputs (list[Tensor]): List of multi-level img features.
img_metas (list[dict]): List of image info dict where each dict
has: 'img_shape', 'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
`mmseg/datasets/pipelines/formatting.py:Collect`.
gt_semantic_seg (Tensor): Semantic segmentation masks
used if the architecture supports semantic segmentation task.
train_cfg (dict): The training config.

Returns:
dict[str, Tensor]: a dictionary of loss components
"""
seg_logits = self.forward(inputs)
losses = self.losses(seg_logits, gt_semantic_seg)
return losses

def forward_test(self, inputs, img_metas, test_cfg):
"""Forward function for testing.

Args:
inputs (list[Tensor]): List of multi-level img features.
img_metas (list[dict]): List of image info dict where each dict
has: 'img_shape', 'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
`mmseg/datasets/pipelines/formatting.py:Collect`.
test_cfg (dict): The testing config.

Returns:
Tensor: Output segmentation map.
"""
return self.forward(inputs)

def cls_seg(self, feat):
"""Classify each pixel."""
if self.dropout is not None:
feat = self.dropout(feat)
output = self.conv_seg(feat)
return output

@force_fp32(apply_to=('seg_logit', ))
def losses(self, seg_logit, seg_label):
"""Compute segmentation loss."""
loss = dict()
seg_logit = seg_resize(
input=seg_logit,
size=seg_label.shape[2:],
mode='bilinear',
align_corners=self.align_corners)
if self.sampler is not None:
seg_weight = self.sampler.sample(seg_logit, seg_label)
else:
seg_weight = None
seg_label = seg_label.squeeze(1)

if not isinstance(self.loss_decode, nn.ModuleList):
losses_decode = [self.loss_decode]
else:
losses_decode = self.loss_decode
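        # losses sharing the same `loss_name` are summed into one entry of the
        # returned dict; distinct names become separate entries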
for loss_decode in losses_decode:
if loss_decode.loss_name not in loss:
loss[loss_decode.loss_name] = loss_decode(
seg_logit,
seg_label,
weight=seg_weight,
ignore_index=self.ignore_index)
else:
loss[loss_decode.loss_name] += loss_decode(
seg_logit,
seg_label,
weight=seg_weight,
ignore_index=self.ignore_index)

loss['acc_seg'] = accuracy(
seg_logit, seg_label, ignore_index=self.ignore_index)
return loss

+ 581
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/decode_heads/mask2former_head_from_mmseg.py

@@ -0,0 +1,581 @@
# The implementation refers to ViT-Adapter,
# available at
# https://github.com/czczup/ViT-Adapter.git

import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Conv2d, build_plugin_layer, caffe2_xavier_init
from mmcv.cnn.bricks.transformer import (build_positional_encoding,
build_transformer_layer_sequence)
from mmcv.ops import point_sample
from mmcv.runner import ModuleList, force_fp32
from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.utils import get_uncertain_point_coords_with_randomness

from .base_decode_head import BaseDecodeHead


@HEADS.register_module()
class Mask2FormerHeadFromMMSeg(BaseDecodeHead):
"""Implements the Mask2Former head.

See `Masked-attention Mask Transformer for Universal Image
Segmentation <https://arxiv.org/pdf/2112.01527>`_ for details.

Args:
in_channels (list[int]): Number of channels in the input feature map.
feat_channels (int): Number of channels for features.
out_channels (int): Number of channels for output.
num_things_classes (int): Number of things.
num_stuff_classes (int): Number of stuff.
        num_queries (int): Number of queries in the Transformer decoder.
pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel
decoder. Defaults to None.
enforce_decoder_input_project (bool, optional): Whether to add
            a layer to change the embed_dim of the transformer encoder in
pixel decoder to the embed_dim of transformer decoder.
Defaults to False.
transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for
transformer decoder. Defaults to None.
positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for
transformer decoder position encoding. Defaults to None.
loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification
loss. Defaults to None.
loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss.
Defaults to None.
loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss.
Defaults to None.
train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of
Mask2Former head.
test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of
Mask2Former head.
init_cfg (dict or list[dict], optional): Initialization config dict.
Defaults to None.
"""

def __init__(self,
in_channels,
feat_channels,
out_channels,
num_things_classes=80,
num_stuff_classes=53,
num_queries=100,
num_transformer_feat_level=3,
pixel_decoder=None,
enforce_decoder_input_project=False,
transformer_decoder=None,
positional_encoding=None,
loss_cls=None,
loss_mask=None,
loss_dice=None,
train_cfg=None,
test_cfg=None,
init_cfg=None,
**kwargs):
super(Mask2FormerHeadFromMMSeg, self).__init__(
in_channels=in_channels,
channels=feat_channels,
num_classes=(num_things_classes + num_stuff_classes),
init_cfg=init_cfg,
input_transform='multiple_select',
**kwargs)
self.num_things_classes = num_things_classes
self.num_stuff_classes = num_stuff_classes
self.num_classes = self.num_things_classes + self.num_stuff_classes
self.num_queries = num_queries
self.num_transformer_feat_level = num_transformer_feat_level
self.num_heads = transformer_decoder.transformerlayers. \
attn_cfgs.num_heads
self.num_transformer_decoder_layers = transformer_decoder.num_layers
assert pixel_decoder.encoder.transformerlayers.attn_cfgs.num_levels == num_transformer_feat_level
pixel_decoder_ = copy.deepcopy(pixel_decoder)
pixel_decoder_.update(
in_channels=in_channels,
feat_channels=feat_channels,
out_channels=out_channels)
self.pixel_decoder = build_plugin_layer(pixel_decoder_)[1]
self.transformer_decoder = build_transformer_layer_sequence(
transformer_decoder)
self.decoder_embed_dims = self.transformer_decoder.embed_dims

self.decoder_input_projs = ModuleList()
# from low resolution to high resolution
for _ in range(num_transformer_feat_level):
if (self.decoder_embed_dims != feat_channels
or enforce_decoder_input_project):
self.decoder_input_projs.append(
Conv2d(
feat_channels, self.decoder_embed_dims, kernel_size=1))
else:
self.decoder_input_projs.append(nn.Identity())
self.decoder_positional_encoding = build_positional_encoding(
positional_encoding)
self.query_embed = nn.Embedding(self.num_queries, feat_channels)
self.query_feat = nn.Embedding(self.num_queries, feat_channels)
# from low resolution to high resolution
self.level_embed = nn.Embedding(self.num_transformer_feat_level,
feat_channels)

self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
self.mask_embed = nn.Sequential(
nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True),
nn.Linear(feat_channels, out_channels))
        self.conv_seg = None  # conv_seg from BaseDecodeHead is not used by this head

self.test_cfg = test_cfg
self.train_cfg = train_cfg
if train_cfg:
self.assigner = build_assigner(self.train_cfg.assigner)
self.sampler = build_sampler(self.train_cfg.sampler, context=self)
self.num_points = self.train_cfg.get('num_points', 12544)
self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0)
self.importance_sample_ratio = self.train_cfg.get(
'importance_sample_ratio', 0.75)
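            # PointRend-style point sampling: mask losses are evaluated on
            # `num_points` sampled points per mask instead of full-resolution
            # masks; `oversample_ratio` controls candidate over-sampling and
            # `importance_sample_ratio` the fraction taken from uncertain points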

self.class_weight = loss_cls.class_weight
self.loss_cls = build_loss(loss_cls)
self.loss_mask = build_loss(loss_mask)
self.loss_dice = build_loss(loss_dice)

def init_weights(self):
for m in self.decoder_input_projs:
if isinstance(m, Conv2d):
caffe2_xavier_init(m, bias=0)

self.pixel_decoder.init_weights()

for p in self.transformer_decoder.parameters():
if p.dim() > 1:
nn.init.xavier_normal_(p)

def get_targets(self, cls_scores_list, mask_preds_list, gt_labels_list,
gt_masks_list, img_metas):
"""Compute classification and mask targets for all images for a decoder
layer.

Args:
cls_scores_list (list[Tensor]): Mask score logits from a single
decoder layer for all images. Each with shape [num_queries,
cls_out_channels].
mask_preds_list (list[Tensor]): Mask logits from a single decoder
layer for all images. Each with shape [num_queries, h, w].
gt_labels_list (list[Tensor]): Ground truth class indices for all
                images. Each with shape (n, ), where n is the sum of the number
                of stuff classes and the number of instances in an image.
gt_masks_list (list[Tensor]): Ground truth mask for each image,
each with shape (n, h, w).
img_metas (list[dict]): List of image meta information.

Returns:
tuple[list[Tensor]]: a tuple containing the following targets.

- labels_list (list[Tensor]): Labels of all images.
Each with shape [num_queries, ].
- label_weights_list (list[Tensor]): Label weights of all
images.Each with shape [num_queries, ].
- mask_targets_list (list[Tensor]): Mask targets of all images.
Each with shape [num_queries, h, w].
- mask_weights_list (list[Tensor]): Mask weights of all images.
Each with shape [num_queries, ].
- num_total_pos (int): Number of positive samples in all
images.
- num_total_neg (int): Number of negative samples in all
images.
"""
(labels_list, label_weights_list, mask_targets_list, mask_weights_list,
pos_inds_list,
neg_inds_list) = multi_apply(self._get_target_single, cls_scores_list,
mask_preds_list, gt_labels_list,
gt_masks_list, img_metas)

num_total_pos = sum((inds.numel() for inds in pos_inds_list))
num_total_neg = sum((inds.numel() for inds in neg_inds_list))
return (labels_list, label_weights_list, mask_targets_list,
mask_weights_list, num_total_pos, num_total_neg)

def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks,
img_metas):
"""Compute classification and mask targets for one image.

Args:
cls_score (Tensor): Mask score logits from a single decoder layer
for one image. Shape (num_queries, cls_out_channels).
mask_pred (Tensor): Mask logits for a single decoder layer for one
image. Shape (num_queries, h, w).
gt_labels (Tensor): Ground truth class indices for one image with
shape (num_gts, ).
gt_masks (Tensor): Ground truth mask for each image, each with
shape (num_gts, h, w).
            img_metas (dict): Image information.

Returns:
tuple[Tensor]: A tuple containing the following for one image.

- labels (Tensor): Labels of each image. \
shape (num_queries, ).
- label_weights (Tensor): Label weights of each image. \
shape (num_queries, ).
- mask_targets (Tensor): Mask targets of each image. \
shape (num_queries, h, w).
- mask_weights (Tensor): Mask weights of each image. \
shape (num_queries, ).
- pos_inds (Tensor): Sampled positive indices for each \
image.
- neg_inds (Tensor): Sampled negative indices for each \
image.
"""
# sample points
num_queries = cls_score.shape[0]
num_gts = gt_labels.shape[0]

point_coords = torch.rand((1, self.num_points, 2),
device=cls_score.device)
# shape (num_queries, num_points)
mask_points_pred = point_sample(
mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1,
1)).squeeze(1)
# shape (num_gts, num_points)
gt_points_masks = point_sample(
gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1,
1)).squeeze(1)

# assign and sample
assign_result = self.assigner.assign(cls_score, mask_points_pred,
gt_labels, gt_points_masks,
img_metas)
sampling_result = self.sampler.sample(assign_result, mask_pred,
gt_masks)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds

# label target
labels = gt_labels.new_full((self.num_queries, ),
self.num_classes,
dtype=torch.long)
labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
label_weights = gt_labels.new_ones((self.num_queries, ))

# mask target
mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds]
mask_weights = mask_pred.new_zeros((self.num_queries, ))
mask_weights[pos_inds] = 1.0

return (labels, label_weights, mask_targets, mask_weights, pos_inds,
neg_inds)

def loss_single(self, cls_scores, mask_preds, gt_labels_list,
gt_masks_list, img_metas):
"""Loss function for outputs from a single decoder layer.

Args:
cls_scores (Tensor): Mask score logits from a single decoder layer
for all images. Shape (batch_size, num_queries,
                cls_out_channels). Note `cls_out_channels` should include
                background.
mask_preds (Tensor): Mask logits for a pixel decoder for all
images. Shape (batch_size, num_queries, h, w).
gt_labels_list (list[Tensor]): Ground truth class indices for each
image, each with shape (num_gts, ).
gt_masks_list (list[Tensor]): Ground truth mask for each image,
each with shape (num_gts, h, w).
img_metas (list[dict]): List of image meta information.

Returns:
tuple[Tensor]: Loss components for outputs from a single \
decoder layer.
"""
num_imgs = cls_scores.size(0)
cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
mask_preds_list = [mask_preds[i] for i in range(num_imgs)]
(labels_list, label_weights_list, mask_targets_list, mask_weights_list,
num_total_pos,
num_total_neg) = self.get_targets(cls_scores_list, mask_preds_list,
gt_labels_list, gt_masks_list,
img_metas)
# shape (batch_size, num_queries)
labels = torch.stack(labels_list, dim=0)
# shape (batch_size, num_queries)
label_weights = torch.stack(label_weights_list, dim=0)
# shape (num_total_gts, h, w)
mask_targets = torch.cat(mask_targets_list, dim=0)
# shape (batch_size, num_queries)
mask_weights = torch.stack(mask_weights_list, dim=0)

        # classification loss
# shape (batch_size * num_queries, )
cls_scores = cls_scores.flatten(0, 1)
labels = labels.flatten(0, 1)
label_weights = label_weights.flatten(0, 1)

class_weight = cls_scores.new_tensor(self.class_weight)
loss_cls = self.loss_cls(
cls_scores,
labels,
label_weights,
avg_factor=class_weight[labels].sum())

num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos]))
num_total_masks = max(num_total_masks, 1)

# extract positive ones
# shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w)
mask_preds = mask_preds[mask_weights > 0]

if mask_targets.shape[0] == 0:
# zero match
loss_dice = mask_preds.sum()
loss_mask = mask_preds.sum()
return loss_cls, loss_mask, loss_dice

with torch.no_grad():
points_coords = get_uncertain_point_coords_with_randomness(
mask_preds.unsqueeze(1), None, self.num_points,
self.oversample_ratio, self.importance_sample_ratio)
# shape (num_total_gts, h, w) -> (num_total_gts, num_points)
mask_point_targets = point_sample(
mask_targets.unsqueeze(1).float(), points_coords).squeeze(1)
# shape (num_queries, h, w) -> (num_queries, num_points)
mask_point_preds = point_sample(
mask_preds.unsqueeze(1), points_coords).squeeze(1)

# dice loss
loss_dice = self.loss_dice(
mask_point_preds, mask_point_targets, avg_factor=num_total_masks)

# mask loss
# shape (num_queries, num_points) -> (num_queries * num_points, )
mask_point_preds = mask_point_preds.reshape(-1, 1)
# shape (num_total_gts, num_points) -> (num_total_gts * num_points, )
mask_point_targets = mask_point_targets.reshape(-1)
loss_mask = self.loss_mask(
mask_point_preds,
mask_point_targets,
avg_factor=num_total_masks * self.num_points)

return loss_cls, loss_mask, loss_dice

@force_fp32(apply_to=('all_cls_scores', 'all_mask_preds'))
def loss(self, all_cls_scores, all_mask_preds, gt_labels_list,
gt_masks_list, img_metas):
"""Loss function.

Args:
all_cls_scores (Tensor): Classification scores for all decoder
layers with shape [num_decoder, batch_size, num_queries,
cls_out_channels].
all_mask_preds (Tensor): Mask scores for all decoder layers with
shape [num_decoder, batch_size, num_queries, h, w].
gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (n, ), where n is the sum of the number of
                stuff classes and the number of instances in an image.
gt_masks_list (list[Tensor]): Ground truth mask for each image with
shape (n, h, w).
img_metas (list[dict]): List of image meta information.

Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
num_dec_layers = len(all_cls_scores)
all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
all_gt_masks_list = [gt_masks_list for _ in range(num_dec_layers)]
img_metas_list = [img_metas for _ in range(num_dec_layers)]
losses_cls, losses_mask, losses_dice = multi_apply(
self.loss_single, all_cls_scores, all_mask_preds,
all_gt_labels_list, all_gt_masks_list, img_metas_list)

loss_dict = dict()
# loss from the last decoder layer
loss_dict['loss_cls'] = losses_cls[-1]
loss_dict['loss_mask'] = losses_mask[-1]
loss_dict['loss_dice'] = losses_dice[-1]
# loss from other decoder layers
num_dec_layer = 0
for loss_cls_i, loss_mask_i, loss_dice_i in zip(
losses_cls[:-1], losses_mask[:-1], losses_dice[:-1]):
loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
loss_dict[f'd{num_dec_layer}.loss_mask'] = loss_mask_i
loss_dict[f'd{num_dec_layer}.loss_dice'] = loss_dice_i
num_dec_layer += 1
return loss_dict

def forward_head(self, decoder_out, mask_feature, attn_mask_target_size):
"""Forward for head part which is called after every decoder layer.

Args:
decoder_out (Tensor): in shape (num_queries, batch_size, c).
mask_feature (Tensor): in shape (batch_size, c, h, w).
attn_mask_target_size (tuple[int, int]): target attention
mask size.

Returns:
            tuple: A tuple containing three elements.

                - cls_pred (Tensor): Classification scores in shape \
                    (batch_size, num_queries, cls_out_channels). \
                    Note `cls_out_channels` should include background.
                - mask_pred (Tensor): Mask scores in shape \
                    (batch_size, num_queries, h, w).
                - attn_mask (Tensor): Attention mask in shape \
                    (batch_size * num_heads, num_queries, h*w).
"""
decoder_out = self.transformer_decoder.post_norm(decoder_out)
decoder_out = decoder_out.transpose(0, 1)
        # shape (batch_size, num_queries, cls_out_channels)
cls_pred = self.cls_embed(decoder_out)
        # shape (batch_size, num_queries, c)
mask_embed = self.mask_embed(decoder_out)
        # shape (batch_size, num_queries, h, w)
mask_pred = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_feature)
attn_mask = F.interpolate(
mask_pred,
attn_mask_target_size,
mode='bilinear',
align_corners=False)
        # shape (batch_size, num_queries, h, w) ->
        # (batch_size * num_heads, num_queries, h*w)
attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat(
(1, self.num_heads, 1, 1)).flatten(0, 1)
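        # boolean mask convention: True marks positions that cross-attention
        # must NOT attend to, so queries only attend where the current
        # prediction is foreground (sigmoid >= 0.5)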
attn_mask = attn_mask.sigmoid() < 0.5
attn_mask = attn_mask.detach()

return cls_pred, mask_pred, attn_mask

def forward(self, feats, img_metas):
"""Forward function.

Args:
            feats (list[Tensor]): Multi-scale features from the
upstream network, each is a 4D-tensor.
img_metas (list[dict]): List of image information.

Returns:
tuple: A tuple contains two elements.

                - cls_pred_list (list[Tensor]): Classification logits \
                    for each decoder layer. Each is a 3D-tensor with shape \
                    (batch_size, num_queries, cls_out_channels). \
                    Note `cls_out_channels` should include background.
- mask_pred_list (list[Tensor]): Mask logits for each \
decoder layer. Each with shape (batch_size, num_queries, \
h, w).
"""
batch_size = len(img_metas)
mask_features, multi_scale_memorys = self.pixel_decoder(feats)
# multi_scale_memorys (from low resolution to high resolution)
decoder_inputs = []
decoder_positional_encodings = []
for i in range(self.num_transformer_feat_level):
decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i])
# shape (batch_size, c, h, w) -> (h*w, batch_size, c)
decoder_input = decoder_input.flatten(2).permute(2, 0, 1)
level_embed = self.level_embed.weight[i].view(1, 1, -1)
decoder_input = decoder_input + level_embed
            # an all-zero (no padding) key mask of shape (batch_size, h, w)
mask = decoder_input.new_zeros(
(batch_size, ) + multi_scale_memorys[i].shape[-2:],
dtype=torch.bool)
decoder_positional_encoding = self.decoder_positional_encoding(
mask)
decoder_positional_encoding = decoder_positional_encoding.flatten(
2).permute(2, 0, 1)
decoder_inputs.append(decoder_input)
decoder_positional_encodings.append(decoder_positional_encoding)
# shape (num_queries, c) -> (num_queries, batch_size, c)
query_feat = self.query_feat.weight.unsqueeze(1).repeat(
(1, batch_size, 1))
query_embed = self.query_embed.weight.unsqueeze(1).repeat(
(1, batch_size, 1))

cls_pred_list = []
mask_pred_list = []
cls_pred, mask_pred, attn_mask = self.forward_head(
query_feat, mask_features, multi_scale_memorys[0].shape[-2:])
cls_pred_list.append(cls_pred)
mask_pred_list.append(mask_pred)

for i in range(self.num_transformer_decoder_layers):
level_idx = i % self.num_transformer_feat_level
            # if a mask is all True (all background), set it to all False.
attn_mask[torch.where(
attn_mask.sum(-1) == attn_mask.shape[-1])] = False

# cross_attn + self_attn
layer = self.transformer_decoder.layers[i]
attn_masks = [attn_mask, None]
query_feat = layer(
query=query_feat,
key=decoder_inputs[level_idx],
value=decoder_inputs[level_idx],
query_pos=query_embed,
key_pos=decoder_positional_encodings[level_idx],
attn_masks=attn_masks,
query_key_padding_mask=None,
# here we do not apply masking on padded region
key_padding_mask=None)
cls_pred, mask_pred, attn_mask = self.forward_head(
query_feat, mask_features, multi_scale_memorys[
(i + 1) % self.num_transformer_feat_level].shape[-2:])

cls_pred_list.append(cls_pred)
mask_pred_list.append(mask_pred)

return cls_pred_list, mask_pred_list

def forward_train(self, x, img_metas, gt_semantic_seg, gt_labels,
gt_masks):
"""Forward function for training mode.

Args:
x (list[Tensor]): Multi-level features from the upstream network,
each is a 4D-tensor.
img_metas (list[Dict]): List of image information.
            gt_semantic_seg (list[Tensor]): Each element is the ground truth
                of semantic segmentation with the shape (N, H, W).
            train_cfg (dict): The training config, which is not used in
                MaskFormer.
            gt_labels (list[Tensor]): Each element is ground truth labels of
                each box, shape (num_gts,).
            gt_masks (list[BitmapMasks]): Each element is masks of instances
                of an image, shape (num_gts, h, w).

Returns:
losses (dict[str, Tensor]): a dictionary of loss components
"""

# forward
all_cls_scores, all_mask_preds = self(x, img_metas)

# loss
losses = self.loss(all_cls_scores, all_mask_preds, gt_labels, gt_masks,
img_metas)

return losses

def forward_test(self, inputs, img_metas, test_cfg):
"""Test segment without test-time aumengtation.

Only the output of last decoder layers was used.

Args:
inputs (list[Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
img_metas (list[dict]): List of image information.
test_cfg (dict): Testing config.

Returns:
seg_mask (Tensor): Predicted semantic segmentation logits.
"""
all_cls_scores, all_mask_preds = self(inputs, img_metas)
cls_score, mask_pred = all_cls_scores[-1], all_mask_preds[-1]
ori_h, ori_w, _ = img_metas[0]['ori_shape']

# semantic inference
cls_score = F.softmax(cls_score, dim=-1)[..., :-1]
mask_pred = mask_pred.sigmoid()
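        # per-pixel semantic logits: query class probabilities (background
        # column dropped above) weighted by the per-query mask probabilities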
seg_mask = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred)
return seg_mask

+ 3
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/__init__.py

@@ -0,0 +1,3 @@
from .encoder_decoder_mask2former import EncoderDecoderMask2Former

__all__ = ['EncoderDecoderMask2Former']

+ 314
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/base_segmentor.py

@@ -0,0 +1,314 @@
# The implementation refers to ViT-Adapter,
# available at
# https://github.com/czczup/ViT-Adapter.git
import warnings
from abc import ABCMeta, abstractmethod
from collections import OrderedDict

import mmcv
import numpy as np
import torch
import torch.distributed as dist
from mmcv.runner import BaseModule, auto_fp16


class BaseSegmentor(BaseModule, metaclass=ABCMeta):
"""Base class for segmentors."""

def __init__(self, init_cfg=None):
super(BaseSegmentor, self).__init__(init_cfg)
self.fp16_enabled = False

@property
def with_neck(self):
"""bool: whether the segmentor has neck"""
return hasattr(self, 'neck') and self.neck is not None

@property
def with_auxiliary_head(self):
"""bool: whether the segmentor has auxiliary head"""
return hasattr(self,
'auxiliary_head') and self.auxiliary_head is not None

@property
def with_decode_head(self):
"""bool: whether the segmentor has decode head"""
return hasattr(self, 'decode_head') and self.decode_head is not None

@abstractmethod
def extract_feat(self, imgs):
"""Placeholder for extract features from images."""
pass

@abstractmethod
def encode_decode(self, img, img_metas):
"""Placeholder for encode images with backbone and decode into a
semantic segmentation map of the same size as input."""
pass

@abstractmethod
def forward_train(self, imgs, img_metas, **kwargs):
"""Placeholder for Forward function for training."""
pass

@abstractmethod
def simple_test(self, img, img_meta, **kwargs):
"""Placeholder for single image test."""
pass

@abstractmethod
def aug_test(self, imgs, img_metas, **kwargs):
"""Placeholder for augmentation test."""
pass

def forward_test(self, imgs, img_metas, **kwargs):
"""
Args:
imgs (List[Tensor]): the outer list indicates test-time
augmentations and inner Tensor should have a shape NxCxHxW,
which contains all images in the batch.
img_metas (List[List[dict]]): the outer list indicates test-time
augs (multiscale, flip, etc.) and the inner list indicates
images in a batch.
"""
for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError(f'{name} must be a list, but got '
f'{type(var)}')

num_augs = len(imgs)
if num_augs != len(img_metas):
raise ValueError(f'num of augmentations ({len(imgs)}) != '
f'num of image meta ({len(img_metas)})')

        # all images in the same aug batch must have the same ori_shape and
        # pad_shape
def tensor_to_tuple(input_tensor):
return tuple(input_tensor.cpu().numpy())

for img_meta in img_metas:
ori_shapes = [_['ori_shape'] for _ in img_meta]
if isinstance(ori_shapes[0], torch.Tensor):
assert all(
tensor_to_tuple(shape) == tensor_to_tuple(ori_shapes[0])
for shape in ori_shapes)
else:
assert all(shape == ori_shapes[0] for shape in ori_shapes)

img_shapes = [_['img_shape'] for _ in img_meta]
if isinstance(img_shapes[0], torch.Tensor):
assert all(
tensor_to_tuple(shape) == tensor_to_tuple(img_shapes[0])
for shape in img_shapes)
else:
assert all(shape == img_shapes[0] for shape in img_shapes)

pad_shapes = [_['pad_shape'] for _ in img_meta]
if isinstance(pad_shapes[0], torch.Tensor):
assert all(
tensor_to_tuple(shape) == tensor_to_tuple(pad_shapes[0])
for shape in pad_shapes)
else:
assert all(shape == pad_shapes[0] for shape in pad_shapes)

if num_augs == 1:
return self.simple_test(imgs[0], img_metas[0], **kwargs)
else:
return self.aug_test(imgs, img_metas, **kwargs)

@auto_fp16(apply_to=('img', ))
def forward(self, img, img_metas, return_loss=True, **kwargs):
"""Calls either :func:`forward_train` or :func:`forward_test` depending
on whether ``return_loss`` is ``True``.

Note this setting will change the expected inputs. When
``return_loss=True``, img and img_meta are single-nested (i.e. Tensor
and List[dict]), and when ``return_loss=False``, img and img_meta
should be double nested (i.e. List[Tensor], List[List[dict]]), with
the outer list indicating test time augmentations.
"""
if return_loss:
return self.forward_train(img, img_metas, **kwargs)
else:
return self.forward_test(img, img_metas, **kwargs)

def train_step(self, data_batch, optimizer, **kwargs):
"""The iteration step during training.

This method defines an iteration step during training, except for the
back propagation and optimizer updating, which are done in an optimizer
hook. Note that in some complicated cases or models, the whole process
including back propagation and optimizer updating is also defined in
this method, such as GAN.

Args:
data_batch (dict): The output of dataloader.
optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
runner is passed to ``train_step()``. This argument is unused
and reserved.

Returns:
dict: It should contain at least 3 keys: ``loss``, ``log_vars``,
``num_samples``.
``loss`` is a tensor for back propagation, which can be a
weighted sum of multiple losses.
``log_vars`` contains all the variables to be sent to the
logger.
``num_samples`` indicates the batch size (when the model is
DDP, it means the batch size on each GPU), which is used for
averaging the logs.
"""
losses = self(**data_batch)
loss, log_vars = self._parse_losses(losses)

outputs = dict(
loss=loss,
log_vars=log_vars,
num_samples=len(data_batch['img_metas']))

return outputs

def val_step(self, data_batch, optimizer=None, **kwargs):
"""The iteration step during validation.

This method shares the same signature as :func:`train_step`, but is
used during val epochs. Note that the evaluation after training epochs
is not implemented with this method, but with an evaluation hook.
"""
losses = self(**data_batch)
loss, log_vars = self._parse_losses(losses)

log_vars_ = dict()
for loss_name, loss_value in log_vars.items():
k = loss_name + '_val'
log_vars_[k] = loss_value

outputs = dict(
loss=loss,
log_vars=log_vars_,
num_samples=len(data_batch['img_metas']))

return outputs

@staticmethod
def _parse_losses(losses):
"""Parse the raw outputs (losses) of the network.

Args:
losses (dict): Raw output of the network, which usually contain
losses and other necessary information.

Returns:
tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor
which may be a weighted sum of all losses, log_vars contains
all the variables to be sent to the logger.
"""
log_vars = OrderedDict()
for loss_name, loss_value in losses.items():
if isinstance(loss_value, torch.Tensor):
log_vars[loss_name] = loss_value.mean()
elif isinstance(loss_value, list):
log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
else:
raise TypeError(
f'{loss_name} is not a tensor or list of tensors')

loss = sum(_value for _key, _value in log_vars.items()
if 'loss' in _key)

# If log_vars has a different length across GPUs, raise an assertion
# error to prevent the GPUs from waiting on each other indefinitely.
if dist.is_available() and dist.is_initialized():
log_var_length = torch.tensor(len(log_vars), device=loss.device)
dist.all_reduce(log_var_length)
message = (f'rank {dist.get_rank()}'
+ f' len(log_vars): {len(log_vars)}' + ' keys: '
+ ','.join(log_vars.keys()) + '\n')
assert log_var_length == len(log_vars) * dist.get_world_size(), \
'loss log variables are different across GPUs!\n' + message

log_vars['loss'] = loss
for loss_name, loss_value in log_vars.items():
# reduce loss when distributed training
if dist.is_available() and dist.is_initialized():
loss_value = loss_value.data.clone()
dist.all_reduce(loss_value.div_(dist.get_world_size()))
log_vars[loss_name] = loss_value.item()

return loss, log_vars

def show_result(self,
img,
result,
palette=None,
win_name='',
show=False,
wait_time=0,
out_file=None,
opacity=0.5):
"""Draw `result` over `img`.

Args:
img (str or Tensor): The image to be displayed.
result (Tensor): The semantic segmentation results to draw over
`img`.
palette (list[list[int]] | np.ndarray | None): The palette of the
segmentation map. If None is given, a random palette will be
generated. Default: None
win_name (str): The window name.
wait_time (int): Value of waitKey param.
Default: 0.
show (bool): Whether to show the image.
Default: False.
out_file (str or None): The filename to write the image.
Default: None.
opacity(float): Opacity of painted segmentation map.
Default 0.5.
Must be in (0, 1] range.
Returns:
img (ndarray): The drawn image; only returned if neither `show` nor
`out_file` is set.
"""
img = mmcv.imread(img)
img = img.copy()
seg = result[0]
if palette is None:
if self.PALETTE is None:
# Get random state before set seed,
# and restore random state later.
# It will prevent loss of randomness, as the palette
# may be different in each iteration if not specified.
# See: https://github.com/open-mmlab/mmdetection/issues/5844
state = np.random.get_state()
np.random.seed(42)
# random palette
palette = np.random.randint(
0, 255, size=(len(self.CLASSES), 3))
np.random.set_state(state)
else:
palette = self.PALETTE
palette = np.array(palette)
assert palette.shape[0] == len(self.CLASSES)
assert palette.shape[1] == 3
assert len(palette.shape) == 2
assert 0 < opacity <= 1.0
color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
for label, color in enumerate(palette):
color_seg[seg == label, :] = color
# convert to BGR
color_seg = color_seg[..., ::-1]

img = img * (1 - opacity) + color_seg * opacity
img = img.astype(np.uint8)
# if out_file specified, do not show image in window
if out_file is not None:
show = False

if show:
mmcv.imshow(img, win_name, wait_time)
if out_file is not None:
mmcv.imwrite(img, out_file)

if not (show or out_file):
warnings.warn('show==False and out_file is not specified, only '
'result image will be returned')
return img
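
As a quick illustration of the loss bookkeeping above, a minimal sketch of what `_parse_losses` returns for a raw loss dict, assuming mmcv/mmdet are importable and no distributed process group is initialized; the values below are made up:

import torch

from modelscope.models.cv.image_semantic_segmentation.vit_adapter.models.segmentors.base_segmentor import BaseSegmentor

raw_losses = {
    'loss_ce': torch.tensor([0.8, 1.2]),                   # averaged to ~1.0
    'loss_mask': [torch.tensor(0.3), torch.tensor(0.5)],   # means summed to ~0.8
    'acc_seg': torch.tensor(72.5),                         # logged, not added to the loss
}

loss, log_vars = BaseSegmentor._parse_losses(raw_losses)
print(loss)      # tensor(1.8000) -- sum of the entries whose key contains 'loss'
print(log_vars)  # approximately {'loss_ce': 1.0, 'loss_mask': 0.8, 'acc_seg': 72.5, 'loss': 1.8}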

+ 303
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/models/segmentors/encoder_decoder_mask2former.py View File

@@ -0,0 +1,303 @@
# The implementation refers to the VitAdapter
# available at
# https://github.com/czczup/ViT-Adapter.git
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.models import builder
from mmdet.models.builder import DETECTORS

from ...utils import add_prefix, seg_resize
from .base_segmentor import BaseSegmentor


@DETECTORS.register_module()
class EncoderDecoderMask2Former(BaseSegmentor):
"""Encoder Decoder segmentors.

EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
Note that auxiliary_head is only used for deep supervision during training
and can be discarded at inference time.
"""

def __init__(self,
backbone,
decode_head,
neck=None,
auxiliary_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
init_cfg=None):
super(EncoderDecoderMask2Former, self).__init__(init_cfg)
if pretrained is not None:
assert backbone.get('pretrained') is None, \
'both backbone and segmentor set pretrained weight'
backbone.pretrained = pretrained
self.backbone = builder.build_backbone(backbone)
if neck is not None:
self.neck = builder.build_neck(neck)
decode_head.update(train_cfg=train_cfg)
decode_head.update(test_cfg=test_cfg)
self._init_decode_head(decode_head)
self._init_auxiliary_head(auxiliary_head)

self.train_cfg = train_cfg
self.test_cfg = test_cfg

assert self.with_decode_head

def _init_decode_head(self, decode_head):
"""Initialize ``decode_head``"""
self.decode_head = builder.build_head(decode_head)
self.align_corners = self.decode_head.align_corners
self.num_classes = self.decode_head.num_classes

def _init_auxiliary_head(self, auxiliary_head):
"""Initialize ``auxiliary_head``"""
if auxiliary_head is not None:
if isinstance(auxiliary_head, list):
self.auxiliary_head = nn.ModuleList()
for head_cfg in auxiliary_head:
self.auxiliary_head.append(builder.build_head(head_cfg))
else:
self.auxiliary_head = builder.build_head(auxiliary_head)

def extract_feat(self, img):
"""Extract features from images."""
x = self.backbone(img)
if self.with_neck:
x = self.neck(x)
return x

def encode_decode(self, img, img_metas):
"""Encode images with backbone and decode into a semantic segmentation
map of the same size as input."""
x = self.extract_feat(img)
out = self._decode_head_forward_test(x, img_metas)
out = seg_resize(
input=out,
size=img.shape[2:],
mode='bilinear',
align_corners=self.align_corners)
return out

def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg,
**kwargs):
"""Run forward function and calculate loss for decode head in
training."""
losses = dict()
loss_decode = self.decode_head.forward_train(x, img_metas,
gt_semantic_seg, **kwargs)

losses.update(add_prefix(loss_decode, 'decode'))
return losses

def _decode_head_forward_test(self, x, img_metas):
"""Run forward function and calculate loss for decode head in
inference."""
seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg)
return seg_logits

def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg):
"""Run forward function and calculate loss for auxiliary head in
training."""
losses = dict()
if isinstance(self.auxiliary_head, nn.ModuleList):
for idx, aux_head in enumerate(self.auxiliary_head):
loss_aux = aux_head.forward_train(x, img_metas,
gt_semantic_seg,
self.train_cfg)
losses.update(add_prefix(loss_aux, f'aux_{idx}'))
else:
loss_aux = self.auxiliary_head.forward_train(
x, img_metas, gt_semantic_seg, self.train_cfg)
losses.update(add_prefix(loss_aux, 'aux'))

return losses

def forward_dummy(self, img):
"""Dummy forward function."""
seg_logit = self.encode_decode(img, None)

return seg_logit

def forward_train(self, img, img_metas, gt_semantic_seg, **kwargs):
"""Forward function for training.

Args:
img (Tensor): Input images.
img_metas (list[dict]): List of image info dict where each dict
has: 'img_shape', 'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
`mmseg/datasets/pipelines/formatting.py:Collect`.
gt_semantic_seg (Tensor): Semantic segmentation masks
used if the architecture supports semantic segmentation task.

Returns:
dict[str, Tensor]: a dictionary of loss components
"""

x = self.extract_feat(img)

losses = dict()

loss_decode = self._decode_head_forward_train(x, img_metas,
gt_semantic_seg,
**kwargs)
losses.update(loss_decode)

if self.with_auxiliary_head:
loss_aux = self._auxiliary_head_forward_train(
x, img_metas, gt_semantic_seg)
losses.update(loss_aux)

return losses

# TODO refactor
def slide_inference(self, img, img_meta, rescale):
"""Inference by sliding-window with overlap.

If h_crop > h_img or w_crop > w_img, the small patch will be used to
decode without padding.
"""

h_stride, w_stride = self.test_cfg.stride
h_crop, w_crop = self.test_cfg.crop_size
batch_size, _, h_img, w_img = img.size()
num_classes = self.num_classes
h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
preds = img.new_zeros((batch_size, num_classes, h_img, w_img))
count_mat = img.new_zeros((batch_size, 1, h_img, w_img))
for h_idx in range(h_grids):
for w_idx in range(w_grids):
y1 = h_idx * h_stride
x1 = w_idx * w_stride
y2 = min(y1 + h_crop, h_img)
x2 = min(x1 + w_crop, w_img)
y1 = max(y2 - h_crop, 0)
x1 = max(x2 - w_crop, 0)
crop_img = img[:, :, y1:y2, x1:x2]
crop_seg_logit = self.encode_decode(crop_img, img_meta)
preds += F.pad(crop_seg_logit,
(int(x1), int(preds.shape[3] - x2), int(y1),
int(preds.shape[2] - y2)))

count_mat[:, :, y1:y2, x1:x2] += 1
assert (count_mat == 0).sum() == 0
if torch.onnx.is_in_onnx_export():
# cast count_mat to constant while exporting to ONNX
count_mat = torch.from_numpy(
count_mat.cpu().detach().numpy()).to(device=img.device)
preds = preds / count_mat

def tensor_to_tuple(input_tensor):
return tuple(input_tensor.cpu().numpy())

if rescale:
preds = seg_resize(
preds,
size=tensor_to_tuple(img_meta[0]['ori_shape'])[:2]
if isinstance(img_meta[0]['ori_shape'], torch.Tensor) else
img_meta[0]['ori_shape'],
mode='bilinear',
align_corners=self.align_corners,
warning=False)
return preds

def whole_inference(self, img, img_meta, rescale):
"""Inference with full image."""

seg_logit = self.encode_decode(img, img_meta)
if rescale:
# support dynamic shape for onnx
if torch.onnx.is_in_onnx_export():
size = img.shape[2:]
else:
size = img_meta[0]['ori_shape'][:2]
seg_logit = seg_resize(
seg_logit,
size=size,
mode='bilinear',
align_corners=self.align_corners,
warning=False)

return seg_logit

def inference(self, img, img_meta, rescale):
"""Inference with slide/whole style.

Args:
img (Tensor): The input image of shape (N, 3, H, W).
img_meta (dict): Image info dict where each dict has: 'img_shape',
'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
`mmseg/datasets/pipelines/formatting.py:Collect`.
rescale (bool): Whether rescale back to original shape.

Returns:
Tensor: The output segmentation map.
"""

assert self.test_cfg.mode in ['slide', 'whole']
ori_shape = img_meta[0]['ori_shape']

def tensor_to_tuple(input_tensor):
return tuple(input_tensor.cpu().numpy())

if isinstance(ori_shape, torch.Tensor):
assert all(
tensor_to_tuple(_['ori_shape']) == tensor_to_tuple(ori_shape)
for _ in img_meta)
else:
assert all(_['ori_shape'] == ori_shape for _ in img_meta)
if self.test_cfg.mode == 'slide':
seg_logit = self.slide_inference(img, img_meta, rescale)
else:
seg_logit = self.whole_inference(img, img_meta, rescale)
output = F.softmax(seg_logit, dim=1)
flip = img_meta[0]['flip']
if flip:
flip_direction = img_meta[0]['flip_direction']
assert flip_direction in ['horizontal', 'vertical']
if flip_direction == 'horizontal':
output = output.flip(dims=(3, ))
elif flip_direction == 'vertical':
output = output.flip(dims=(2, ))

return output

def simple_test(self, img, img_meta, rescale=True):
"""Simple test with single image."""
seg_logit = self.inference(img, img_meta, rescale)
seg_pred = seg_logit.argmax(dim=1)
if torch.onnx.is_in_onnx_export():
# our inference backend only support 4D output
seg_pred = seg_pred.unsqueeze(0)
return seg_pred
seg_pred = seg_pred.cpu().numpy()
# unravel batch dim
seg_pred = list(seg_pred)
return seg_pred

def aug_test(self, imgs, img_metas, rescale=True):
"""Test with augmentations.

Only rescale=True is supported.
"""
# aug_test rescale all imgs back to ori_shape for now
assert rescale
# to save memory, we get augmented seg logit inplace
seg_logit = self.inference(imgs[0], img_metas[0], rescale)
for i in range(1, len(imgs)):
cur_seg_logit = self.inference(imgs[i], img_metas[i], rescale)
seg_logit += cur_seg_logit
seg_logit /= len(imgs)
seg_pred = seg_logit.argmax(dim=1)
seg_pred = seg_pred.cpu().numpy()
# unravel batch dim
seg_pred = list(seg_pred)
return seg_pred
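
To make the sliding-window arithmetic in `slide_inference` concrete, a small standalone sketch of the crop layout; the image, crop and stride sizes are made up for illustration:

# 1024x1024 input, 512x512 crops, 341-pixel stride -> a 3x3 grid of crops
h_img = w_img = 1024
h_crop = w_crop = 512
h_stride = w_stride = 341

h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1  # 3
w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1  # 3

for h_idx in range(h_grids):
    for w_idx in range(w_grids):
        y1, x1 = h_idx * h_stride, w_idx * w_stride
        y2, x2 = min(y1 + h_crop, h_img), min(x1 + w_crop, w_img)
        y1, x1 = max(y2 - h_crop, 0), max(x2 - w_crop, 0)
        print((y1, y2, x1, x2))
# The crops overlap; count_mat records how many crops touch each pixel so the
# summed logits can be averaged before the optional rescale to ori_shape.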

+ 7
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/__init__.py View File

@@ -0,0 +1,7 @@
from .builder import build_pixel_sampler
from .data_process_func import ResizeToMultiple
from .seg_func import add_prefix, seg_resize

__all__ = [
'seg_resize', 'add_prefix', 'build_pixel_sampler', 'ResizeToMultiple'
]

+ 11
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/builder.py View File

@@ -0,0 +1,11 @@
# The implementation refers to the VitAdapter
# available at
# https://github.com/czczup/ViT-Adapter.git
from mmcv.utils import Registry, build_from_cfg

PIXEL_SAMPLERS = Registry('pixel sampler')


def build_pixel_sampler(cfg, **default_args):
"""Build pixel sampler for segmentation map."""
return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args)
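
A minimal sketch of the registry round-trip, assuming mmcv is installed; `UniformPixelSampler` is a hypothetical class used only to illustrate registration and is not part of this diff:

from modelscope.models.cv.image_semantic_segmentation.vit_adapter.utils.builder import PIXEL_SAMPLERS, build_pixel_sampler


@PIXEL_SAMPLERS.register_module()
class UniformPixelSampler:
    """Hypothetical sampler, only to demonstrate the registry."""

    def __init__(self, num_samples=1024):
        self.num_samples = num_samples


sampler = build_pixel_sampler(dict(type='UniformPixelSampler', num_samples=512))
print(type(sampler).__name__, sampler.num_samples)  # UniformPixelSampler 512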

+ 60
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/data_process_func.py View File

@@ -0,0 +1,60 @@
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
from mmdet.datasets.builder import PIPELINES


@PIPELINES.register_module()
class ResizeToMultiple(object):
"""Resize images & seg to multiple of divisor.

Args:
size_divisor (int): images and gt seg maps are resized to a multiple
of size_divisor. Default: 32.
interpolation (str, optional): The interpolation mode of image resize.
Default: None
"""

def __init__(self, size_divisor=32, interpolation=None):
self.size_divisor = size_divisor
self.interpolation = interpolation

def __call__(self, results):
"""Call function to resize images, semantic segmentation map to
multiple of size divisor.

Args:
results (dict): Result dict from loading pipeline.

Returns:
dict: Resized results, 'img_shape', 'pad_shape' keys are updated.
"""
# Align image to multiple of size divisor.
img = results['img']
img = mmcv.imresize_to_multiple(
img,
self.size_divisor,
scale_factor=1,
interpolation=self.interpolation
if self.interpolation else 'bilinear')

results['img'] = img
results['img_shape'] = img.shape
results['pad_shape'] = img.shape

# Align segmentation map to multiple of size divisor.
for key in results.get('seg_fields', []):
gt_seg = results[key]
gt_seg = mmcv.imresize_to_multiple(
gt_seg,
self.size_divisor,
scale_factor=1,
interpolation='nearest')
results[key] = gt_seg

return results

def __repr__(self):
repr_str = self.__class__.__name__
repr_str += (f'(size_divisor={self.size_divisor}, '
f'interpolation={self.interpolation})')
return repr_str
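
A minimal sketch of the transform on a dummy pipeline dict, assuming mmcv/mmdet are installed; the image size is made up:

import numpy as np

from modelscope.models.cv.image_semantic_segmentation.vit_adapter.utils.data_process_func import ResizeToMultiple

results = {
    'img': np.zeros((500, 333, 3), dtype=np.uint8),
    'gt_semantic_seg': np.zeros((500, 333), dtype=np.uint8),
    'seg_fields': ['gt_semantic_seg'],
}

results = ResizeToMultiple(size_divisor=32)(results)
print(results['img_shape'])              # (512, 352, 3): both sides rounded up to a multiple of 32
print(results['gt_semantic_seg'].shape)  # (512, 352), resized with nearest interpolation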

+ 48
- 0
modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py View File

@@ -0,0 +1,48 @@
# The implementation refers to the VitAdapter
# available at
# https://github.com/czczup/ViT-Adapter.git

import warnings

import torch.nn.functional as F


def seg_resize(input,
size=None,
scale_factor=None,
mode='nearest',
align_corners=None,
warning=True):
if warning:
if size is not None and align_corners:
input_h, input_w = tuple(int(x) for x in input.shape[2:])
output_h, output_w = tuple(int(x) for x in size)
if output_h > input_h or output_w > input_w:
if ((output_h > 1 and output_w > 1 and input_h > 1
and input_w > 1) and (output_h - 1) % (input_h - 1)
and (output_w - 1) % (input_w - 1)):
warnings.warn(
f'When align_corners={align_corners}, '
'the output would be more aligned if '
f'input size {(input_h, input_w)} is `x+1` and '
f'out size {(output_h, output_w)} is `nx+1`')
return F.interpolate(input, size, scale_factor, mode, align_corners)


def add_prefix(inputs, prefix):
"""Add prefix for dict.

Args:
inputs (dict): The input dict with str keys.
prefix (str): The prefix to add.

Returns:

dict: The dict with keys updated with ``prefix``.
"""

outputs = dict()
for name, value in inputs.items():
outputs[f'{prefix}.{name}'] = value

return outputs
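
A minimal sketch of the two helpers, assuming torch and this module are importable; the shapes are made up:

import torch

from modelscope.models.cv.image_semantic_segmentation.vit_adapter.utils.seg_func import add_prefix, seg_resize

logits = torch.randn(1, 19, 128, 128)   # made-up decoder output
up = seg_resize(logits, size=(512, 512), mode='bilinear', align_corners=False)
print(up.shape)                          # torch.Size([1, 19, 512, 512])

print(add_prefix({'loss_ce': 0.7, 'acc_seg': 0.9}, 'decode'))
# {'decode.loss_ce': 0.7, 'decode.acc_seg': 0.9}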

+ 25
- 0
modelscope/models/cv/movie_scene_segmentation/__init__.py View File

@@ -0,0 +1,25 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:

from .model import MovieSceneSegmentationModel
from .datasets import MovieSceneSegmentationDataset

else:
_import_structure = {
'model': ['MovieSceneSegmentationModel'],
'datasets': ['MovieSceneSegmentationDataset'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 45
- 0
modelscope/models/cv/movie_scene_segmentation/get_model.py View File

@@ -0,0 +1,45 @@
# ------------------------------------------------------------------------------------
# BaSSL
# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# Github: https://github.com/kakaobrain/bassl
# ------------------------------------------------------------------------------------

from .utils.shot_encoder import resnet50
from .utils.trn import TransformerCRN


def get_shot_encoder(cfg):
name = cfg['model']['shot_encoder']['name']
shot_encoder_args = cfg['model']['shot_encoder'][name]
if name == 'resnet':
depth = shot_encoder_args['depth']
if depth == 50:
shot_encoder = resnet50(**shot_encoder_args['params'], )
else:
raise NotImplementedError
else:
raise NotImplementedError

return shot_encoder


def get_contextual_relation_network(cfg):
crn = None

if cfg['model']['contextual_relation_network']['enabled']:
name = cfg['model']['contextual_relation_network']['name']
crn_args = cfg['model']['contextual_relation_network']['params'][name]
if name == 'trn':
sampling_name = cfg['model']['loss']['sampling_method']['name']
crn_args['neighbor_size'] = (
2 * cfg['model']['loss']['sampling_method']['params']
[sampling_name]['neighbor_size'])
crn = TransformerCRN(crn_args)
else:
raise NotImplementedError

return crn


__all__ = ['get_shot_encoder', 'get_contextual_relation_network']
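
A minimal sketch of the config layout these builders expect, assuming the movie-scene-segmentation dependencies (cv2, transformers) are installed; the values are made up and in practice come from the model's configuration.json:

from modelscope.models.cv.movie_scene_segmentation.get_model import get_contextual_relation_network, get_shot_encoder

cfg = {
    'model': {
        'shot_encoder': {
            'name': 'resnet',
            'resnet': {'depth': 50, 'params': {}},
        },
        'contextual_relation_network': {'enabled': False},
    }
}

shot_encoder = get_shot_encoder(cfg)        # torchvision-style ResNet-50 backbone
crn = get_contextual_relation_network(cfg)  # None here because the CRN is disabled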

+ 192
- 0
modelscope/models/cv/movie_scene_segmentation/model.py View File

@@ -0,0 +1,192 @@
import os
import os.path as osp
from typing import Any, Dict

import einops
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as TF
from PIL import Image
from shotdetect_scenedetect_lgss import shot_detect

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .get_model import get_contextual_relation_network, get_shot_encoder
from .utils.save_op import get_pred_boundary, pred2scene, scene2video

logger = get_logger()


@MODELS.register_module(
Tasks.movie_scene_segmentation, module_name=Models.resnet50_bert)
class MovieSceneSegmentationModel(TorchModel):

def __init__(self, model_dir: str, *args, **kwargs):
"""str -- model file root."""
super().__init__(model_dir, *args, **kwargs)

model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
params = torch.load(model_path, map_location='cpu')

config_path = osp.join(model_dir, ModelFile.CONFIGURATION)
self.cfg = Config.from_file(config_path)

def load_param_with_prefix(prefix, model, src_params):
own_state = model.state_dict()
for name, param in own_state.items():
src_name = prefix + '.' + name
own_state[name] = src_params[src_name]

model.load_state_dict(own_state)

self.shot_encoder = get_shot_encoder(self.cfg)
load_param_with_prefix('shot_encoder', self.shot_encoder, params)
self.crn = get_contextual_relation_network(self.cfg)
load_param_with_prefix('crn', self.crn, params)

crn_name = self.cfg.model.contextual_relation_network.name
hdim = self.cfg.model.contextual_relation_network.params[crn_name][
'hidden_size']
self.head_sbd = nn.Linear(hdim, 2)
load_param_with_prefix('head_sbd', self.head_sbd, params)

self.test_transform = TF.Compose([
TF.Resize(size=256, interpolation=Image.BICUBIC),
TF.CenterCrop(224),
TF.ToTensor(),
TF.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

self.infer_result = {'vid': [], 'sid': [], 'pred': []}
sampling_method = self.cfg.dataset.sampling_method.name
self.neighbor_size = self.cfg.dataset.sampling_method.params[
sampling_method].neighbor_size

self.eps = 1e-5

def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]:
data = inputs['video']
labels = inputs['label']
outputs = self.shared_step(data)

loss = F.cross_entropy(
outputs.squeeze(), labels.squeeze(), reduction='none')
lpos = labels == 1
lneg = labels == 0

n_pos, n_neg = 1, 1  # renamed to avoid shadowing the torch.nn import
wp = (n_pos / float(n_pos + n_neg)) * lpos / (lpos.sum() + self.eps)
wn = (n_neg / float(n_pos + n_neg)) * lneg / (lneg.sum() + self.eps)
w = wp + wn
loss = (w * loss).sum()

probs = torch.argmax(outputs, dim=1)

re = dict(pred=probs, loss=loss)
return re

def inference(self, batch):
logger.info('Begin scene detect ......')
bs = self.cfg.pipeline.batch_size_per_gpu
sids = batch['sid']
inputs = batch['shot_feat']

shot_num = len(sids)
cnt = (shot_num + bs - 1) // bs  # ceil division avoids an empty trailing batch

for i in range(cnt):
start = i * bs
end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
input_ = inputs[start:end]
sid_ = sids[start:end]
input_ = torch.stack(input_)
outputs = self.shared_step(input_) # shape [b,2]
prob = F.softmax(outputs, dim=1)
self.infer_result['sid'].extend(sid_.cpu().detach().numpy())
self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy())
self.infer_result['pred'] = np.stack(self.infer_result['pred'])

assert len(self.infer_result['sid']) == len(sids)
assert len(self.infer_result['pred']) == len(inputs)
return self.infer_result

def shared_step(self, inputs):
with torch.no_grad():
# infer shot encoder
shot_repr = self.extract_shot_representation(inputs)
assert len(shot_repr.shape) == 3

# infer CRN
_, pooled = self.crn(shot_repr, mask=None)
# infer boundary score
pred = self.head_sbd(pooled)
return pred

def save_shot_feat(self, _repr):
feat = _repr.float().cpu().numpy()
pth = self.cfg.dataset.img_path + '/features'
os.makedirs(pth, exist_ok=True)

for idx in range(_repr.shape[0]):
name = f'shot_{str(idx).zfill(4)}.npy'
name = osp.join(pth, name)
np.save(name, feat[idx])

def extract_shot_representation(self,
inputs: torch.Tensor) -> torch.Tensor:
""" inputs [b s k c h w] -> output [b d] """
assert len(inputs.shape) == 6 # (B Shot Keyframe C H W)
b, s, k, c, h, w = inputs.shape
inputs = einops.rearrange(inputs, 'b s k c h w -> (b s) k c h w', s=s)
keyframe_repr = [self.shot_encoder(inputs[:, _k]) for _k in range(k)]
# [k (b s) d] -> [(b s) d]
shot_repr = torch.stack(keyframe_repr).mean(dim=0)

shot_repr = einops.rearrange(shot_repr, '(b s) d -> b s d', s=s)
return shot_repr

def postprocess(self, inputs: Dict[str, Any], **kwargs):
logger.info('Generate scene .......')

pred_dict = inputs['feat']
thres = self.cfg.pipeline.save_threshold

anno_dict = get_pred_boundary(pred_dict, thres)
scene_dict, scene_list = pred2scene(self.shot2keyf, anno_dict)
if self.cfg.pipeline.save_split_scene:
re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
print(f'Split scene video saved to {re_dir}')
return len(scene_list), scene_dict

def preprocess(self, inputs):
logger.info('Begin shot detect......')
shot_keyf_lst, anno, shot2keyf = shot_detect(
inputs, **self.cfg.preprocessor.shot_detect)
logger.info('Shot detect done!')

single_shot_feat, sid = [], []
for idx, one_shot in enumerate(shot_keyf_lst):
one_shot = [
self.test_transform(one_frame) for one_frame in one_shot
]
one_shot = torch.stack(one_shot, dim=0)
single_shot_feat.append(one_shot)
sid.append(idx)
single_shot_feat = torch.stack(single_shot_feat, dim=0)
shot_feat = []
for idx, one_shot in enumerate(anno):
shot_idx = int(one_shot['shot_id']) + np.arange(
-self.neighbor_size, self.neighbor_size + 1)
shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'])
_one_shot = single_shot_feat[shot_idx]
shot_feat.append(_one_shot)
self.shot2keyf = shot2keyf
self.anno = anno
return shot_feat, sid
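
The class-balanced weighting in `forward` can be illustrated with a small standalone sketch (made-up labels; boundary and non-boundary shots each contribute half of the total weight):

import torch
import torch.nn.functional as F

labels = torch.tensor([1, 0, 0, 0])   # one boundary shot, three non-boundary shots
logits = torch.randn(4, 2)
eps = 1e-5

loss = F.cross_entropy(logits, labels, reduction='none')
lpos, lneg = labels == 1, labels == 0
wp = 0.5 * lpos / (lpos.sum() + eps)  # the single positive carries ~0.5 of the weight
wn = 0.5 * lneg / (lneg.sum() + eps)  # the three negatives share the other ~0.5
weighted_loss = ((wp + wn) * loss).sum()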

+ 3
- 0
modelscope/models/cv/movie_scene_segmentation/utils/__init__.py View File

@@ -0,0 +1,3 @@
from .save_op import get_pred_boundary, pred2scene, scene2video
from .shot_encoder import resnet50
from .trn import TransformerCRN

+ 29
- 0
modelscope/models/cv/movie_scene_segmentation/utils/head.py View File

@@ -0,0 +1,29 @@
# ------------------------------------------------------------------------------------
# BaSSL
# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# Github: https://github.com/kakaobrain/bassl
# ------------------------------------------------------------------------------------

import torch.nn as nn
import torch.nn.functional as F


class MlpHead(nn.Module):

def __init__(self, input_dim=2048, hidden_dim=2048, output_dim=128):
super().__init__()
self.output_dim = output_dim
self.input_dim = input_dim
self.hidden_dim = hidden_dim

self.model = nn.Sequential(
nn.Linear(self.input_dim, self.hidden_dim, bias=True),
nn.ReLU(),
nn.Linear(self.hidden_dim, self.output_dim, bias=True),
)

def forward(self, x):
# x shape: [b t d] where t means the number of views
x = self.model(x)
return F.normalize(x, dim=-1)
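
A minimal sketch of the projection head on random per-view features, assuming the movie-scene-segmentation dependencies are installed; the shapes are made up:

import torch

from modelscope.models.cv.movie_scene_segmentation.utils.head import MlpHead

head = MlpHead(input_dim=2048, hidden_dim=2048, output_dim=128)
x = torch.randn(4, 3, 2048)   # [batch, views, feature dim]
z = head(x)
print(z.shape)                # torch.Size([4, 3, 128])
print(z.norm(dim=-1))         # ~1 everywhere: the outputs are L2-normalized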

+ 118
- 0
modelscope/models/cv/movie_scene_segmentation/utils/save_op.py View File

@@ -0,0 +1,118 @@
# ----------------------------------------------------------------------------------
# The codes below partially refer to the SceneSeg LGSS.
# Github: https://github.com/AnyiRao/SceneSeg
# ----------------------------------------------------------------------------------
import os
import os.path as osp
import subprocess

import cv2
import numpy as np
from tqdm import tqdm


def get_pred_boundary(pred_dict, threshold=0.5):
pred = pred_dict['pred']
tmp = (pred > threshold).astype(np.int32)
anno_dict = {}
for idx in range(len(tmp)):
anno_dict.update({str(pred_dict['sid'][idx]).zfill(4): int(tmp[idx])})
return anno_dict


def pred2scene(shot2keyf, anno_dict):
scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict)

scene_dict = {}
assert len(scene_list) == len(pair_list)
for scene_ind, scene_item in enumerate(scene_list):
scene_dict.update(
{scene_ind: {
'shot': pair_list[scene_ind],
'frame': scene_item
}})

return scene_dict, scene_list


def scene2video(source_movie_fn, scene_list, thres):

vcap = cv2.VideoCapture(source_movie_fn)
fps = vcap.get(cv2.CAP_PROP_FPS) # video.fps
out_video_dir_fn = os.path.join(os.getcwd(),
f'pred_result/scene_video_{thres}')
os.makedirs(out_video_dir_fn, exist_ok=True)

for scene_ind, scene_item in tqdm(enumerate(scene_list)):
scene = str(scene_ind).zfill(4)
start_frame = int(scene_item[0])
end_frame = int(scene_item[1])
start_time, end_time = start_frame / fps, end_frame / fps
duration_time = end_time - start_time
out_video_fn = os.path.join(out_video_dir_fn,
'scene_{}.mp4'.format(scene))
if os.path.exists(out_video_fn):
continue
call_list = ['ffmpeg']
call_list += ['-v', 'quiet']
call_list += [
'-y', '-ss',
str(start_time), '-t',
str(duration_time), '-i', source_movie_fn
]
call_list += ['-map_chapters', '-1']
call_list += [out_video_fn]
subprocess.call(call_list)
return osp.join(os.getcwd(), 'pred_result')


def get_demo_scene_list(shot2keyf, anno_dict):
pair_list = get_pair_list(anno_dict)

scene_list = []
for pair in pair_list:
start_shot, end_shot = int(pair[0]), int(pair[-1])
start_frame = shot2keyf[start_shot].split(' ')[0]
end_frame = shot2keyf[end_shot].split(' ')[1]
scene_list.append((start_frame, end_frame))
return scene_list, pair_list


def get_pair_list(anno_dict):
sort_anno_dict_key = sorted(anno_dict.keys())
tmp = 0
tmp_list = []
tmp_label_list = []
anno_list = []
anno_label_list = []
for key in sort_anno_dict_key:
value = anno_dict.get(key)
tmp += value
tmp_list.append(key)
tmp_label_list.append(value)
if tmp == 1:
anno_list.append(tmp_list)
anno_label_list.append(tmp_label_list)
tmp = 0
tmp_list = []
tmp_label_list = []
continue
if key == sort_anno_dict_key[-1]:
if len(tmp_list) > 0:
anno_list.append(tmp_list)
anno_label_list.append(tmp_label_list)
if len(anno_list) == 0:
return None
while [] in anno_list:
anno_list.remove([])
tmp_anno_list = [anno_list[0]]
pair_list = []
for ind in range(len(anno_list) - 1):
cont_count = int(anno_list[ind + 1][0]) - int(anno_list[ind][-1])
if cont_count > 1:
pair_list.extend(tmp_anno_list)
tmp_anno_list = [anno_list[ind + 1]]
continue
tmp_anno_list.append(anno_list[ind + 1])
pair_list.extend(tmp_anno_list)
return pair_list
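
A minimal sketch of how per-shot boundary probabilities turn into scene groupings, using made-up predictions and assuming the module's dependencies (cv2, tqdm) are installed; `get_pair_list` closes a scene at every predicted boundary:

import numpy as np

from modelscope.models.cv.movie_scene_segmentation.utils.save_op import get_pair_list, get_pred_boundary

pred_dict = {'sid': [0, 1, 2, 3, 4], 'pred': np.array([0.1, 0.9, 0.2, 0.8, 0.3])}
anno_dict = get_pred_boundary(pred_dict, threshold=0.5)
print(anno_dict)  # {'0000': 0, '0001': 1, '0002': 0, '0003': 1, '0004': 0}

print(get_pair_list(anno_dict))
# [['0000', '0001'], ['0002', '0003'], ['0004']] -- three scenes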

+ 331
- 0
modelscope/models/cv/movie_scene_segmentation/utils/shot_encoder.py View File

@@ -0,0 +1,331 @@
"""
Modified from original implementation in torchvision
"""

from typing import Any, Callable, List, Optional, Type, Union

import torch
import torch.nn as nn
from torch import Tensor


def conv3x3(in_planes: int,
out_planes: int,
stride: int = 1,
groups: int = 1,
dilation: int = 1) -> nn.Conv2d:
"""3x3 convolution with padding"""
return nn.Conv2d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=dilation,
groups=groups,
bias=False,
dilation=dilation,
)


def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
"""1x1 convolution"""
return nn.Conv2d(
in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock(nn.Module):
expansion: int = 1

def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None,
) -> None:
super(BasicBlock, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError(
'BasicBlock only supports groups=1 and base_width=64')
if dilation > 1:
raise NotImplementedError(
'Dilation > 1 not supported in BasicBlock')
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride

def forward(self, x: Tensor) -> Tensor:
identity = x

out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)

out = self.conv2(out)
out = self.bn2(out)

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu(out)

return out


class Bottleneck(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
# This variant is also known as ResNet V1.5 and improves accuracy according to
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

expansion: int = 4

def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None,
) -> None:
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.0)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride

def forward(self, x: Tensor) -> Tensor:
identity = x

out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)

out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)

out = self.conv3(out)
out = self.bn3(out)

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu(out)

return out


class ResNet(nn.Module):

def __init__(
self,
block: Type[Union[BasicBlock, Bottleneck]],
layers: List[int],
in_channel_dim: int = 3,
zero_init_residual: bool = False,
use_last_block_grid: bool = False,
groups: int = 1,
width_per_group: int = 64,
replace_stride_with_dilation: Optional[List[bool]] = None,
norm_layer: Optional[Callable[..., nn.Module]] = None,
) -> None:
super(ResNet, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer

self.use_last_block_grid = use_last_block_grid
self.inplanes = 64
self.dilation = 1
if replace_stride_with_dilation is None:
# each element in the tuple indicates if we should replace
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError('replace_stride_with_dilation should be None '
'or a 3-element tuple, got {}'.format(
replace_stride_with_dilation))
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(
in_channel_dim,
self.inplanes,
kernel_size=7,
stride=2,
padding=3,
bias=False,
)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(
block,
128,
layers[1],
stride=2,
dilate=replace_stride_with_dilation[0])
self.layer3 = self._make_layer(
block,
256,
layers[2],
stride=2,
dilate=replace_stride_with_dilation[1])
self.layer4 = self._make_layer(
block,
512,
layers[3],
stride=2,
dilate=replace_stride_with_dilation[2])
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)

# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if zero_init_residual:
for m in self.modules():
if isinstance(m, Bottleneck):
nn.init.constant_(m.bn3.weight,
0) # type: ignore[arg-type]
elif isinstance(m, BasicBlock):
nn.init.constant_(m.bn2.weight,
0) # type: ignore[arg-type]

def _make_layer(
self,
block: Type[Union[BasicBlock, Bottleneck]],
planes: int,
blocks: int,
stride: int = 1,
dilate: bool = False,
) -> nn.Sequential:
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
norm_layer(planes * block.expansion),
)

layers = []
layers.append(
block(
self.inplanes,
planes,
stride,
downsample,
self.groups,
self.base_width,
previous_dilation,
norm_layer,
))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(
block(
self.inplanes,
planes,
groups=self.groups,
base_width=self.base_width,
dilation=self.dilation,
norm_layer=norm_layer,
))

return nn.Sequential(*layers)

def _forward_impl(self, x: Tensor, grid: bool, level: List, both: bool,
grid_only: bool) -> Tensor:
# See note [TorchScript super()]
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)

x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)

if grid:
x_grid = []

if 3 in level:
x_grid.append(x.detach().clone())
if not both and len(level) == 1:
return x_grid

x = self.layer4(x)

if 4 in level:
x_grid.append(x.detach().clone())
if not both and len(level) == 1:
return x_grid

x = self.avgpool(x)
x = torch.flatten(x, 1)

if not grid or len(level) == 0:
return x

if grid_only:
return x_grid

if both:
return x, x_grid

return x

def forward(
self,
x: Tensor,
grid: bool = False,
level: List = [],
both: bool = False,
grid_only: bool = False,
) -> Tensor:
return self._forward_impl(x, grid, level, both, grid_only)


def resnet50(**kwargs: Any) -> ResNet:
r"""ResNet-50 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
"""
return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
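
A minimal sketch of the shot encoder on random keyframes, assuming the module's dependencies are installed; with grid=True and level=[4] it returns the layer4 feature map instead of the pooled vector:

import torch

from modelscope.models.cv.movie_scene_segmentation.utils.shot_encoder import resnet50

encoder = resnet50()
frames = torch.randn(2, 3, 224, 224)   # made-up keyframe batch

feat = encoder(frames)
print(feat.shape)                      # torch.Size([2, 2048]): pooled shot feature

grid = encoder(frames, grid=True, level=[4], grid_only=True)
print(grid[0].shape)                   # torch.Size([2, 2048, 7, 7]): layer4 feature map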

+ 132
- 0
modelscope/models/cv/movie_scene_segmentation/utils/trn.py View File

@@ -0,0 +1,132 @@
# ------------------------------------------------------------------------------------
# BaSSL
# Copyright (c) 2021 KakaoBrain. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# Github: https://github.com/kakaobrain/bassl
# ------------------------------------------------------------------------------------

import torch
import torch.nn as nn
from transformers.models.bert.modeling_bert import BertEncoder


class ShotEmbedding(nn.Module):

def __init__(self, cfg):
super().__init__()

nn_size = cfg.neighbor_size + 2 # +1 for center shot, +1 for cls
self.shot_embedding = nn.Linear(cfg.input_dim, cfg.hidden_size)
self.position_embedding = nn.Embedding(nn_size, cfg.hidden_size)
self.mask_embedding = nn.Embedding(2, cfg.input_dim, padding_idx=0)

# tf naming convention for layer norm
self.LayerNorm = nn.LayerNorm(cfg.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(cfg.hidden_dropout_prob)

self.register_buffer('pos_ids',
torch.arange(nn_size, dtype=torch.long))

def forward(
self,
shot_emb: torch.Tensor,
mask: torch.Tensor = None,
pos_ids: torch.Tensor = None,
) -> torch.Tensor:

assert len(shot_emb.size()) == 3

if pos_ids is None:
pos_ids = self.pos_ids

# this for mask embedding (un-masked ones remain unchanged)
if mask is not None:
self.mask_embedding.weight.data[0, :].fill_(0)
mask_emb = self.mask_embedding(mask.long())
shot_emb = (shot_emb * (1 - mask).float()[:, :, None]) + mask_emb

# we set [CLS] token to averaged feature
cls_emb = shot_emb.mean(dim=1)

# embedding shots
shot_emb = torch.cat([cls_emb[:, None, :], shot_emb], dim=1)
shot_emb = self.shot_embedding(shot_emb)
pos_emb = self.position_embedding(pos_ids)
embeddings = shot_emb + pos_emb[None, :]
embeddings = self.dropout(self.LayerNorm(embeddings))
return embeddings


class TransformerCRN(nn.Module):

def __init__(self, cfg):
super().__init__()

self.pooling_method = cfg.pooling_method
self.shot_embedding = ShotEmbedding(cfg)
self.encoder = BertEncoder(cfg)

nn_size = cfg.neighbor_size + 2 # +1 for center shot, +1 for cls
self.register_buffer(
'attention_mask',
self._get_extended_attention_mask(
torch.ones((1, nn_size)).float()),
)

def forward(
self,
shot: torch.Tensor,
mask: torch.Tensor = None,
pos_ids: torch.Tensor = None,
pooling_method: str = None,
):
if self.attention_mask.shape[1] != (shot.shape[1] + 1):
n_shot = shot.shape[1] + 1 # +1 for CLS token
attention_mask = self._get_extended_attention_mask(
torch.ones((1, n_shot), dtype=torch.float, device=shot.device))
else:
attention_mask = self.attention_mask

shot_emb = self.shot_embedding(shot, mask=mask, pos_ids=pos_ids)
encoded_emb = self.encoder(
shot_emb, attention_mask=attention_mask).last_hidden_state

return encoded_emb, self.pooler(
encoded_emb, pooling_method=pooling_method)

def pooler(self, sequence_output, pooling_method=None):
if pooling_method is None:
pooling_method = self.pooling_method

if pooling_method == 'cls':
return sequence_output[:, 0, :]
elif pooling_method == 'avg':
return sequence_output[:, 1:].mean(dim=1)
elif pooling_method == 'max':
return sequence_output[:, 1:].max(dim=1)[0]
elif pooling_method == 'center':
cidx = sequence_output.shape[1] // 2
return sequence_output[:, cidx, :]
else:
raise ValueError

def _get_extended_attention_mask(self, attention_mask):

# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif attention_mask.dim() == 2:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError(
f'Wrong shape for attention_mask (shape {attention_mask.shape})'
)

# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
return extended_attention_mask
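
A minimal sketch of running the CRN on random shot features, assuming a recent transformers release; the config values are made up (a real run reads them from configuration.json), and with neighbor_size=8 the sequence holds 9 shots plus the prepended CLS-style token:

import torch
from transformers import BertConfig

from modelscope.models.cv.movie_scene_segmentation.utils.trn import TransformerCRN

cfg = BertConfig(
    hidden_size=256, num_hidden_layers=2, num_attention_heads=4,
    intermediate_size=512, input_dim=2048, neighbor_size=8,
    pooling_method='center')

crn = TransformerCRN(cfg)
shots = torch.randn(2, cfg.neighbor_size + 1, cfg.input_dim)  # [batch, shots, feature dim]
encoded, pooled = crn(shots)
print(encoded.shape)  # torch.Size([2, 10, 256]): 9 shots + 1 aggregated token
print(pooled.shape)   # torch.Size([2, 256]): the center-shot representation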

+ 2
- 0
modelscope/models/cv/object_detection/__init__.py View File

@@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .mmdet_model import DetectionModel
from .yolox_pai import YOLOX

else:
_import_structure = {
'mmdet_model': ['DetectionModel'],
'yolox_pai': ['YOLOX']
}

import sys


+ 16
- 0
modelscope/models/cv/object_detection/yolox_pai.py View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.detection.detectors import YOLOX as _YOLOX

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks


@MODELS.register_module(
group_key=Tasks.image_object_detection, module_name=Models.yolox)
class YOLOX(EasyCVBaseModel, _YOLOX):

def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
_YOLOX.__init__(self, *args, **kwargs)

+ 2
- 3
modelscope/models/cv/product_retrieval_embedding/item_model.py View File

@@ -13,8 +13,8 @@ from modelscope.models.cv.product_retrieval_embedding.item_embedding import (
preprocess, resnet50_embed)
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.device import create_device
 from modelscope.utils.logger import get_logger
-from modelscope.utils.torch_utils import create_device

logger = get_logger()

@@ -48,9 +48,8 @@ class ProductRetrievalEmbedding(TorchModel):
filter_param(src_params, own_state)
model.load_state_dict(own_state)

-cpu_flag = device == 'cpu'
 self.device = create_device(
-cpu_flag)  # device.type == "cpu" or device.type == "cuda"
+device)  # device.type == "cpu" or device.type == "cuda"
self.use_gpu = self.device.type == 'cuda'

# config the model path


+ 21
- 0
modelscope/models/cv/realtime_object_detection/__init__.py View File

@@ -0,0 +1,21 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .realtime_detector import RealtimeDetector
else:
_import_structure = {
'realtime_detector': ['RealtimeDetector'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 85
- 0
modelscope/models/cv/realtime_object_detection/realtime_detector.py View File

@@ -0,0 +1,85 @@
import argparse
import logging as logger
import os
import os.path as osp
import time

import cv2
import json
import torch

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.preprocessors import LoadImage
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from .yolox.data.data_augment import ValTransform
from .yolox.exp import get_exp_by_name
from .yolox.utils import postprocess


@MODELS.register_module(
group_key=Tasks.image_object_detection,
module_name=Models.realtime_object_detection)
class RealtimeDetector(TorchModel):

def __init__(self, model_dir: str, *args, **kwargs):
super().__init__(model_dir, *args, **kwargs)
self.config = Config.from_file(
os.path.join(self.model_dir, ModelFile.CONFIGURATION))

# model type
self.exp = get_exp_by_name(self.config.model_type)

# build model
self.model = self.exp.get_model()
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
ckpt = torch.load(model_path, map_location='cpu')

# load the model state dict
self.model.load_state_dict(ckpt['model'])
self.model.eval()

# params setting
self.exp.num_classes = self.config.num_classes
self.confthre = self.config.conf_thr
self.num_classes = self.exp.num_classes
self.nmsthre = self.exp.nmsthre
self.test_size = self.exp.test_size
self.preproc = ValTransform(legacy=False)

def inference(self, img):
with torch.no_grad():
outputs = self.model(img)
return outputs

def forward(self, inputs):
return self.inference(inputs)

def preprocess(self, img):
img = LoadImage.convert_to_ndarray(img)
height, width = img.shape[:2]
self.ratio = min(self.test_size[0] / img.shape[0],
self.test_size[1] / img.shape[1])

img, _ = self.preproc(img, None, self.test_size)
img = torch.from_numpy(img).unsqueeze(0)
img = img.float()

return img

def postprocess(self, input):
outputs = postprocess(
input,
self.num_classes,
self.confthre,
self.nmsthre,
class_agnostic=True)

if len(outputs) == 1:
bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio
scores = outputs[0][:, 5].cpu().numpy()
labels = outputs[0][:, 6].cpu().int().numpy()

return bboxes, scores, labels
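
A standalone sketch of the resize bookkeeping: preprocess letterboxes the frame to test_size and stores the scale ratio, and postprocess divides the predicted boxes by that ratio to map them back to the original image (all sizes below are made up):

test_size = (640, 640)        # network input size
orig_h, orig_w = 480, 854     # source frame size

ratio = min(test_size[0] / orig_h, test_size[1] / orig_w)  # ~0.749
box_in_network_coords = [100.0, 150.0, 300.0, 400.0]
box_in_image_coords = [v / ratio for v in box_in_network_coords]
print([round(v, 1) for v in box_in_image_coords])  # ~[133.4, 200.2, 400.3, 533.8]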

+ 0
- 0
modelscope/models/cv/realtime_object_detection/yolox/__init__.py View File


+ 0
- 0
modelscope/models/cv/realtime_object_detection/yolox/data/__init__.py View File


+ 69
- 0
modelscope/models/cv/realtime_object_detection/yolox/data/data_augment.py View File

@@ -0,0 +1,69 @@
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
"""
Data augmentation functionality. Passed as callable transformations to
Dataset classes.

The data augmentation procedures were interpreted from @weiliu89's SSD paper
http://arxiv.org/abs/1512.02325
"""

import math
import random

import cv2
import numpy as np

from ..utils import xyxy2cxcywh


def preproc(img, input_size, swap=(2, 0, 1)):
if len(img.shape) == 3:
padded_img = np.ones(
(input_size[0], input_size[1], 3), dtype=np.uint8) * 114
else:
padded_img = np.ones(input_size, dtype=np.uint8) * 114

r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
resized_img = cv2.resize(
img,
(int(img.shape[1] * r), int(img.shape[0] * r)),
interpolation=cv2.INTER_LINEAR,
).astype(np.uint8)
padded_img[:int(img.shape[0] * r), :int(img.shape[1] * r)] = resized_img

padded_img = padded_img.transpose(swap)
padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
return padded_img, r


class ValTransform:
"""
Defines the transformations that should be applied to a test image
(an OpenCV ndarray) for input into the network

dimension -> tensorize -> color adj

Arguments:
swap ((int,int,int)): final order of channels. Default: (2, 0, 1)
legacy (bool): if True, additionally apply the legacy ImageNet
mean/std normalization after scaling to [0, 1]. Default: False

Returns:
transform (transform) : callable transform to be applied to test/val
data
"""

def __init__(self, swap=(2, 0, 1), legacy=False):
self.swap = swap
self.legacy = legacy

# assume input is cv2 img for now
def __call__(self, img, res, input_size):
img, _ = preproc(img, input_size, self.swap)
if self.legacy:
img = img[::-1, :, :].copy()
img /= 255.0
img -= np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
img /= np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
return img, np.zeros((1, 5))
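
A minimal sketch of the preprocessing on a random frame, assuming the sibling yolox.utils module from this commit is importable; the frame size is made up:

import numpy as np

from modelscope.models.cv.realtime_object_detection.yolox.data.data_augment import ValTransform, preproc

img = np.random.randint(0, 255, (480, 854, 3), dtype=np.uint8)

padded, r = preproc(img, (640, 640))
print(padded.shape, round(r, 3))   # (3, 640, 640) 0.749 -- letterboxed with value 114

tensor_ready, _ = ValTransform(legacy=False)(img, None, (640, 640))
print(tensor_ready.shape, tensor_ready.dtype)  # (3, 640, 640) float32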

+ 5
- 0
modelscope/models/cv/realtime_object_detection/yolox/exp/__init__.py View File

@@ -0,0 +1,5 @@
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX

from .base_exp import BaseExp
from .build import get_exp_by_name
from .yolox_base import Exp

+ 12
- 0
modelscope/models/cv/realtime_object_detection/yolox/exp/base_exp.py View File

@@ -0,0 +1,12 @@
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX

from abc import ABCMeta, abstractmethod

from torch.nn import Module


class BaseExp(metaclass=ABCMeta):

@abstractmethod
def get_model(self) -> Module:
pass

+ 18
- 0
modelscope/models/cv/realtime_object_detection/yolox/exp/build.py View File

@@ -0,0 +1,18 @@
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX

import os
import sys


def get_exp_by_name(exp_name):
exp = exp_name.replace('-',
'_') # convert string like "yolox-s" to "yolox_s"
if exp == 'yolox_s':
from .default import YoloXSExp as YoloXExp
elif exp == 'yolox_nano':
from .default import YoloXNanoExp as YoloXExp
elif exp == 'yolox_tiny':
from .default import YoloXTinyExp as YoloXExp
else:
raise NotImplementedError(f'Unsupported exp name: {exp_name}')
return YoloXExp()
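
A minimal usage sketch, assuming the exp/default module referenced above (not shown in this truncated diff) is present in the package:

from modelscope.models.cv.realtime_object_detection.yolox.exp import get_exp_by_name

exp = get_exp_by_name('yolox-s')   # dashes are normalized to underscores
model = exp.get_model()            # an untrained YOLOX-S torch.nn.Module
print(type(exp).__name__)          # YoloXSExp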

Some files were not shown because too many files changed in this diff
