# Conflicts: # modelscope/preprocessors/multi_modal.py
@@ -0,0 +1,169 @@ | |||
#!/bin/bash | |||
# default values. | |||
BASE_CPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04 | |||
BASE_GPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel | |||
MODELSCOPE_REPO_ADDRESS=reg.docker.alibaba-inc.com/modelscope/modelscope | |||
python_version=3.7.13 | |||
torch_version=1.11.0 | |||
cudatoolkit_version=11.3 | |||
tensorflow_version=1.15.5 | |||
modelscope_version=None | |||
is_ci_test=False | |||
is_dsw=False | |||
is_cpu=False | |||
run_ci_test=False | |||
function usage(){ | |||
echo "usage: build.sh " | |||
echo " --python=python_version set python version, default: $python_version" | |||
echo " --torch=torch_version set pytorch version, fefault: $torch_version" | |||
echo " --cudatoolkit=cudatoolkit_version set cudatoolkit version used for pytorch, default: $cudatoolkit_version" | |||
echo " --tensorflow=tensorflow_version set tensorflow version, default: $tensorflow_version" | |||
echo " --modelscope=modelscope_version set modelscope version, default: $modelscope_version" | |||
echo " --test option for run test before push image, only push on ci test pass" | |||
echo " --cpu option for build cpu version" | |||
echo " --dsw option for build dsw version" | |||
echo " --ci option for build ci version" | |||
echo " --push option for push image to remote repo" | |||
} | |||
for i in "$@"; do | |||
case $i in | |||
--python=*) | |||
python_version="${i#*=}" | |||
shift | |||
;; | |||
--torch=*) | |||
torch_version="${i#*=}" | |||
shift # pytorch version | |||
;; | |||
--tensorflow=*) | |||
tensorflow_version="${i#*=}" | |||
shift # tensorflow version | |||
;; | |||
--cudatoolkit=*) | |||
cudatoolkit_version="${i#*=}" | |||
shift # cudatoolkit for pytorch | |||
;; | |||
--modelscope=*) | |||
modelscope_version="${i#*=}" | |||
shift # modelscope version
;; | |||
--test) | |||
run_ci_test=True | |||
shift # will run ci test | |||
;; | |||
--cpu) | |||
is_cpu=True | |||
shift # is cpu image | |||
;; | |||
--ci) | |||
is_ci_test=True | |||
shift # is ci, will not install modelscope | |||
;; | |||
--dsw) | |||
is_dsw=True | |||
shift # is dsw, will set dsw cache location | |||
;; | |||
--push) | |||
is_push=True | |||
shift # push image to remote repo
;; | |||
--help) | |||
usage | |||
exit 0 | |||
;; | |||
-*|--*) | |||
echo "Unknown option $i" | |||
usage | |||
exit 1 | |||
;; | |||
*) | |||
;; | |||
esac | |||
done | |||
if [ "$modelscope_version" == "None" ]; then | |||
echo "ModelScope version must specify!" | |||
exit 1 | |||
fi | |||
if [ "$is_cpu" == "True" ]; then | |||
export BASE_IMAGE=$BASE_CPU_IMAGE | |||
base_tag=ubuntu20.04 | |||
export USE_GPU=False | |||
else | |||
export BASE_IMAGE=$BASE_GPU_IMAGE | |||
base_tag=ubuntu20.04-cuda11.3.0 | |||
export USE_GPU=True | |||
fi | |||
if [[ $python_version == 3.7* ]]; then
base_tag=$base_tag-py37
elif [[ $python_version == 3.8* ]]; then
base_tag=$base_tag-py38
elif [[ $python_version == 3.9* ]]; then
base_tag=$base_tag-py39
else
echo "Unsupported python version: $python_version"
exit 1
fi
target_image_tag=$base_tag-torch$torch_version-tf$tensorflow_version | |||
if [ "$is_ci_test" == "True" ]; then | |||
target_image_tag=$target_image_tag-$modelscope_version-ci | |||
else | |||
target_image_tag=$target_image_tag-$modelscope_version-test | |||
fi | |||
export IMAGE_TO_BUILD=$MODELSCOPE_REPO_ADDRESS:$target_image_tag | |||
export PYTHON_VERSION=$python_version | |||
export TORCH_VERSION=$torch_version | |||
export CUDATOOLKIT_VERSION=$cudatoolkit_version | |||
export TENSORFLOW_VERSION=$tensorflow_version | |||
echo -e "Building image with:\npython$python_version\npytorch$torch_version\ntensorflow:$tensorflow_version\ncudatoolkit:$cudatoolkit_version\ncpu:$is_cpu\nis_ci:$is_ci_test\nis_dsw:$is_dsw\n" | |||
docker_file_content=`cat docker/Dockerfile.ubuntu` | |||
if [ "$is_ci_test" != "True" ]; then | |||
echo "Building ModelScope lib, will install ModelScope lib to image" | |||
docker_file_content="${docker_file_content} \nRUN pip install --no-cache-dir modelscope==$modelscope_version -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html" | |||
fi | |||
echo "$is_dsw" | |||
if [ "$is_dsw" == "False" ]; then | |||
echo "Not DSW image" | |||
else | |||
echo "Building dsw image well need set ModelScope lib cache location." | |||
docker_file_content="${docker_file_content} \nENV MODELSCOPE_CACHE=/mnt/workspace/.cache/modelscope" | |||
fi | |||
printf "$docker_file_content" > Dockerfile | |||
docker build -t $IMAGE_TO_BUILD \ | |||
--build-arg USE_GPU \ | |||
--build-arg BASE_IMAGE \ | |||
--build-arg PYTHON_VERSION \ | |||
--build-arg TORCH_VERSION \ | |||
--build-arg CUDATOOLKIT_VERSION \ | |||
--build-arg TENSORFLOW_VERSION \ | |||
-f Dockerfile . | |||
if [ $? -ne 0 ]; then | |||
echo "Running docker build command error, please check the log!" | |||
exit -1 | |||
fi | |||
if [ "$run_ci_test" == "True" ]; then | |||
echo "Running ci case." | |||
export MODELSCOPE_CACHE=/home/mulin.lyh/model_scope_cache | |||
export MODELSCOPE_HOME_CACHE=/home/mulin.lyh/ci_case_home # for credential | |||
export IMAGE_NAME=$MODELSCOPE_REPO_ADDRESS | |||
export IMAGE_VERSION=$target_image_tag | |||
export MODELSCOPE_DOMAIN=www.modelscope.cn | |||
export HUB_DATASET_ENDPOINT=http://www.modelscope.cn | |||
export CI_TEST=True | |||
export TEST_LEVEL=1 | |||
if [ "$is_ci_test" != "True" ]; then | |||
echo "Testing for dsw image or MaaS-lib image" | |||
export CI_COMMAND="python tests/run.py" | |||
fi | |||
bash .dev_scripts/dockerci.sh | |||
if [ $? -ne 0 ]; then | |||
echo "Running unittest failed, please check the log!" | |||
exit -1 | |||
fi | |||
fi | |||
if [ "$is_push" == "True" ]; then | |||
echo "Pushing image: $IMAGE_TO_BUILD" | |||
docker push $IMAGE_TO_BUILD | |||
fi |
@@ -16,5 +16,14 @@ if [ $? -ne 0 ]; then | |||
echo "linter test failed, please run 'pre-commit run --all-files' to check" | |||
exit -1 | |||
fi | |||
# test with install | |||
python setup.py install | |||
PYTHONPATH=. python tests/run.py | |||
if [ $# -eq 0 ]; then | |||
ci_command="python tests/run.py --subprocess" | |||
else | |||
ci_command="$@" | |||
fi | |||
echo "Running case with command: $ci_command" | |||
$ci_command | |||
#python tests/run.py --isolated_cases test_text_to_speech.py test_multi_modal_embedding.py test_ofa_tasks.py test_video_summarization.py |
@@ -1,5 +1,4 @@ | |||
#!/bin/bash | |||
IMAGE_NAME=reg.docker.alibaba-inc.com/dinger/modelscope | |||
MODELSCOPE_CACHE_DIR_IN_CONTAINER=/modelscope_cache | |||
CODE_DIR=$PWD | |||
CODE_DIR_IN_CONTAINER=/Maas-lib | |||
@@ -8,6 +7,8 @@ gpus='7 6 5 4 3 2 1 0' | |||
cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58' | |||
cpu_sets_arr=($cpu_sets) | |||
is_get_file_lock=false | |||
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_COMMAND} | |||
echo "ci command: $CI_COMMAND" | |||
for gpu in $gpus | |||
do | |||
exec {lock_fd}>"/tmp/gpu$gpu" || exit 1 | |||
@@ -31,10 +32,12 @@ do | |||
-e HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT \ | |||
-e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \ | |||
-e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \ | |||
-e TEST_LEVEL=$TEST_LEVEL \ | |||
-e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \ | |||
--workdir=$CODE_DIR_IN_CONTAINER \ | |||
--net host \ | |||
${IMAGE_NAME}:${IMAGE_VERSION} \ | |||
bash .dev_scripts/ci_container_test.sh | |||
$CI_COMMAND | |||
if [ $? -ne 0 ]; then | |||
echo "Running test case failed, please check the log!" | |||
exit -1 | |||
@@ -0,0 +1,11 @@ | |||
.gitignore | |||
tests | |||
data | |||
.dev_scripts | |||
.dockerignore | |||
.git | |||
.gitattributes | |||
.pre-commit-config.yaml | |||
.pre-commit-config_local.yaml | |||
.readthedocs.yaml | |||
Dockerfile
@@ -4,4 +4,6 @@ | |||
*.wav filter=lfs diff=lfs merge=lfs -text | |||
*.JPEG filter=lfs diff=lfs merge=lfs -text | |||
*.jpeg filter=lfs diff=lfs merge=lfs -text | |||
*.pickle filter=lfs diff=lfs merge=lfs -text | |||
*.avi filter=lfs diff=lfs merge=lfs -text | |||
*.bin filter=lfs diff=lfs merge=lfs -text |
@@ -2,7 +2,6 @@ | |||
"framework": "pytorch", | |||
"task": "image_classification", | |||
"work_dir": "./work_dir", | |||
"model": { | |||
"type": "classification", | |||
@@ -119,6 +118,7 @@ | |||
}, | |||
"train": { | |||
"work_dir": "./work_dir", | |||
"dataloader": { | |||
"batch_size_per_gpu": 2, | |||
"workers_per_gpu": 1 | |||
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:b012c7e966f6550874ccb85ef9602d483aa89b8623dff9ffcdb0faab8f2ca9ab | |||
size 218143 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:59b1da30af12f76b691990363e0d221050a59cf53fc4a97e776bcb00228c6c2a | |||
size 245864 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:4c9a7e42edc7065c16972ff56267aad63f5233e36aa5a699b84939f5bad73276 | |||
size 2451 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:af6fa61274e497ecc170de5adc4b8e7ac89eba2bc22a6aa119b08ec7adbe9459 | |||
size 146140 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:59b1da30af12f76b691990363e0d221050a59cf53fc4a97e776bcb00228c6c2a | |||
size 245864 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:541183383bb06aa3ca2c44a68cd51c1be5e3e984a1dee2c58092b9552660f3ce | |||
size 61883 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:8f0afcd9d2aa5ac9569114203bd9db4f1a520c903a88fd4854370cdde0e7eab7 | |||
size 119940 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:4fd6fa6b23c2fdaf876606a767d9b64b1924e1acddfc06ac42db73ba86083280 | |||
size 119940 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:4d37672a0e299a08d2daf5c7fc29bfce96bb15701fe5e5e68f068861ac2ee705 | |||
size 119619 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:c692e0753cfe349e520511427727a8252f141fa10e85f9a61562845e8d731f9a | |||
size 119619 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:44e3925c15d86d8596baeb6bd1d153d86f57b7489798b2cf988a1248e110fd62 | |||
size 62231 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:1ff17a0272752de4c88d4254b2e881f97f8ef022f03609d03ee1de0ae964368a | |||
size 62235 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572 | |||
size 60801 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c | |||
size 60801 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:9e3ecc2c30d382641d561f84849b199c12bb1a9418e8099a191153f6f5275a85 | |||
size 61589 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:7b8f50a0537bfe7e082c5ad91b2b7ece61a0adbeb7489988e553909276bf920c | |||
size 44217644 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:59fa397b01dc4c9b67a19ca42f149287b9c4e7b2158aba5d07d2db88af87b23f | |||
size 126815483 |
@@ -0,0 +1,84 @@ | |||
ARG BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel | |||
FROM $BASE_IMAGE | |||
ARG DEBIAN_FRONTEND=noninteractive | |||
ENV TZ=Asia/Shanghai | |||
ENV CONDA_DIR /opt/conda | |||
ENV PATH="${CONDA_DIR}/bin:${PATH}" | |||
ENV arch=x86_64 | |||
SHELL ["/bin/bash", "-c"] | |||
COPY docker/rcfiles /tmp/resources | |||
RUN apt-get update && apt-get install -y --reinstall ca-certificates && \ | |||
cp /tmp/resources/ubuntu20.04_sources.tuna /etc/apt/sources.list && \ | |||
apt-get update && \ | |||
apt-get install -y locales wget git vim ffmpeg libsm6 tzdata language-pack-zh-hans ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build && \ | |||
wget https://packagecloud.io/github/git-lfs/packages/debian/bullseye/git-lfs_3.2.0_amd64.deb/download -O ./git-lfs_3.2.0_amd64.deb && \ | |||
dpkg -i ./git-lfs_3.2.0_amd64.deb && \ | |||
rm -f ./git-lfs_3.2.0_amd64.deb && \ | |||
locale-gen zh_CN && \ | |||
locale-gen zh_CN.utf8 && \ | |||
update-locale LANG=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 && \ | |||
ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \ | |||
dpkg-reconfigure --frontend noninteractive tzdata && \ | |||
apt-get clean && \ | |||
rm -rf /var/lib/apt/lists/* | |||
ENV LANG=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8 | |||
# install and configure python
ARG PYTHON_VERSION=3.7.13 | |||
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${arch}.sh -O ./miniconda.sh && \ | |||
/bin/bash miniconda.sh -b -p /opt/conda && \ | |||
rm -f miniconda.sh && \ | |||
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ | |||
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ | |||
cp /tmp/resources/conda.tuna ~/.condarc && \ | |||
source /root/.bashrc && \ | |||
conda install --yes python==${PYTHON_VERSION} && \ | |||
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple | |||
ARG USE_GPU=True | |||
# install pytorch | |||
ARG TORCH_VERSION=1.12.0 | |||
ARG CUDATOOLKIT_VERSION=11.3 | |||
RUN if [ "$USE_GPU" = "True" ] ; then \ | |||
conda install --yes pytorch==$TORCH_VERSION torchvision torchaudio cudatoolkit=$CUDATOOLKIT_VERSION -c pytorch && conda clean --yes --all; \ | |||
else \ | |||
conda install pytorch==$TORCH_VERSION torchvision torchaudio cpuonly -c pytorch; \ | |||
fi | |||
# install tensorflow | |||
ARG TENSORFLOW_VERSION=1.15.5 | |||
RUN if [ "$USE_GPU" = "True" ] ; then \ | |||
pip install --no-cache-dir --use-deprecated=legacy-resolver tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ | |||
else \ | |||
pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \ | |||
fi | |||
RUN if [ "$USE_GPU" = "True" ] ; then \ | |||
CUDA_HOME=/usr/local/cuda TORCH_CUDA_ARCH_LIST="5.0 5.2 6.0 6.1 7.0 7.5 8.0 8.6" MMCV_WITH_OPS=1 MAX_JOBS=8 FORCE_CUDA=1 pip install --no-cache-dir mmcv-full && pip cache purge; \ | |||
else \ | |||
MMCV_WITH_OPS=1 MAX_JOBS=8 pip install --no-cache-dir mmcv-full && pip cache purge; \ | |||
fi | |||
# install modelscope | |||
COPY requirements /var/modelscope | |||
RUN pip install --no-cache-dir --upgrade pip && \ | |||
pip install --no-cache-dir -r /var/modelscope/runtime.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | |||
pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | |||
pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | |||
pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | |||
pip install --no-cache-dir -r /var/modelscope/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ | |||
pip cache purge | |||
# default shell bash | |||
ENV SHELL=/bin/bash | |||
# install special package | |||
RUN pip install --no-cache-dir 'mmcls>=0.21.0' 'mmdet>=2.25.0' 'decord>=0.6.0' numpy==1.18.5 datasets==2.1.0
RUN if [ "$USE_GPU" = "True" ] ; then \ | |||
pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \ | |||
else \ | |||
pip install --no-cache-dir dgl dglgo -f https://data.dgl.ai/wheels/repo.html; \ | |||
fi |
@@ -0,0 +1,15 @@ | |||
channels: | |||
- defaults | |||
show_channel_urls: true | |||
default_channels: | |||
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main | |||
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r | |||
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2 | |||
custom_channels: | |||
conda-forge: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud | |||
msys2: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud | |||
bioconda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud | |||
menpo: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud | |||
pytorch: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud | |||
pytorch-lts: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud | |||
simpleitk: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud |
@@ -0,0 +1,13 @@ | |||
# Source-code mirrors are commented out by default to speed up apt update; uncomment them if needed.
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse | |||
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse | |||
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse | |||
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse | |||
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse | |||
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse | |||
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse | |||
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse | |||
# Pre-release repository; enabling it is not recommended.
# deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse | |||
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse |
@@ -108,7 +108,7 @@ pip install -e ".[nlp]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releas | |||
```shell | |||
pip install -e ".[multi-modal]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |||
``` | |||
### | |||
### Installation Verification
After a successful installation, run the following command to verify that the installation works:
@@ -1,2 +1,2 @@ | |||
from .file import File | |||
from .file import File, LocalStorage | |||
from .io import dump, dumps, load |
@@ -240,7 +240,7 @@ class File(object): | |||
@staticmethod | |||
def _get_storage(uri): | |||
assert isinstance(uri, | |||
str), f'uri should be str type, buf got {type(uri)}' | |||
str), f'uri should be str type, but got {type(uri)}' | |||
if '://' not in uri: | |||
# local path | |||
@@ -1,5 +1,4 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import json | |||
import numpy as np | |||
from .base import FormatHandler | |||
@@ -22,14 +21,16 @@ def set_default(obj): | |||
class JsonHandler(FormatHandler): | |||
"""Use jsonplus, serialization of Python types to JSON that "just works".""" | |||
def load(self, file): | |||
return json.load(file) | |||
import jsonplus | |||
return jsonplus.loads(file.read()) | |||
def dump(self, obj, file, **kwargs): | |||
kwargs.setdefault('default', set_default) | |||
json.dump(obj, file, **kwargs) | |||
file.write(self.dumps(obj, **kwargs)) | |||
def dumps(self, obj, **kwargs): | |||
import jsonplus | |||
kwargs.setdefault('default', set_default) | |||
return json.dumps(obj, **kwargs) | |||
return jsonplus.dumps(obj, **kwargs) |
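For context, a minimal round-trip sketch (not part of the diff; assumes jsonplus's default exact mode) of why the handler switches from json to jsonplus:

```python
# Hypothetical round-trip sketch: jsonplus (exact mode) preserves Python types
# such as datetime, which plain json would reject or stringify.
import datetime
import jsonplus

obj = {'created': datetime.datetime(2022, 8, 1, 12, 30), 'score': 0.9}
text = jsonplus.dumps(obj)        # type information is encoded alongside the value
restored = jsonplus.loads(text)   # 'created' comes back as a datetime object
assert restored['created'] == obj['created']
```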
@@ -1,7 +1,6 @@ | |||
import os | |||
import pickle | |||
import shutil | |||
import subprocess | |||
from collections import defaultdict | |||
from http import HTTPStatus | |||
from http.cookiejar import CookieJar | |||
@@ -16,8 +15,7 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, | |||
API_RESPONSE_FIELD_MESSAGE, | |||
API_RESPONSE_FIELD_USERNAME, | |||
DEFAULT_CREDENTIALS_PATH) | |||
from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH, | |||
HUB_DATASET_ENDPOINT) | |||
from modelscope.utils.config_ds import DOWNLOADED_DATASETS_PATH | |||
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, | |||
DEFAULT_MODEL_REVISION, | |||
DatasetFormations, DatasetMetaFormats, | |||
@@ -26,7 +24,8 @@ from modelscope.utils.logger import get_logger | |||
from .errors import (InvalidParameter, NotExistError, RequestError, | |||
datahub_raise_on_error, handle_http_response, is_ok, | |||
raise_on_error) | |||
from .utils.utils import get_endpoint, model_id_to_group_owner_name | |||
from .utils.utils import (get_dataset_hub_endpoint, get_endpoint, | |||
model_id_to_group_owner_name) | |||
logger = get_logger() | |||
@@ -35,7 +34,8 @@ class HubApi: | |||
def __init__(self, endpoint=None, dataset_endpoint=None): | |||
self.endpoint = endpoint if endpoint is not None else get_endpoint() | |||
self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else HUB_DATASET_ENDPOINT | |||
self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint( | |||
) | |||
def login( | |||
self, | |||
@@ -376,6 +376,27 @@ class HubApi: | |||
f'ststoken?Revision={revision}' | |||
return self.datahub_remote_call(datahub_url) | |||
def get_dataset_access_config_session( | |||
self, | |||
cookies: CookieJar, | |||
dataset_name: str, | |||
namespace: str, | |||
revision: Optional[str] = DEFAULT_DATASET_REVISION): | |||
datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ | |||
f'ststoken?Revision={revision}' | |||
cookies = requests.utils.dict_from_cookiejar(cookies) | |||
r = requests.get(url=datahub_url, cookies=cookies) | |||
resp = r.json() | |||
datahub_raise_on_error(datahub_url, resp) | |||
return resp['Data'] | |||
def on_dataset_download(self, dataset_name: str, namespace: str) -> None: | |||
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase' | |||
r = requests.post(url) | |||
r.raise_for_status() | |||
@staticmethod | |||
def datahub_remote_call(url): | |||
r = requests.get(url) | |||
@@ -383,6 +404,9 @@ class HubApi: | |||
datahub_raise_on_error(url, resp) | |||
return resp['Data'] | |||
def check_cookies_upload_data(self, use_cookies) -> CookieJar: | |||
return self._check_cookie(use_cookies=use_cookies) | |||
class ModelScopeConfig: | |||
path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) | |||
@@ -1,3 +1,5 @@ | |||
from pathlib import Path | |||
MODELSCOPE_URL_SCHEME = 'http://' | |||
DEFAULT_MODELSCOPE_DOMAIN = 'www.modelscope.cn' | |||
DEFAULT_MODELSCOPE_DATA_ENDPOINT = MODELSCOPE_URL_SCHEME + DEFAULT_MODELSCOPE_DOMAIN | |||
@@ -6,7 +8,7 @@ DEFAULT_MODELSCOPE_GROUP = 'damo' | |||
MODEL_ID_SEPARATOR = '/' | |||
FILE_HASH = 'Sha256' | |||
LOGGER_NAME = 'ModelScopeHub' | |||
DEFAULT_CREDENTIALS_PATH = '~/.modelscope/credentials' | |||
DEFAULT_CREDENTIALS_PATH = Path.home().joinpath('.modelscope', 'credentials') | |||
API_RESPONSE_FIELD_DATA = 'Data' | |||
API_RESPONSE_FIELD_GIT_ACCESS_TOKEN = 'AccessToken' | |||
API_RESPONSE_FIELD_USERNAME = 'Username' | |||
@@ -49,8 +49,8 @@ def handle_http_response(response, logger, cookies, model_id): | |||
except HTTPError: | |||
if cookies is None: # code in [403] and | |||
logger.error( | |||
f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be private. \ | |||
Please login first.') | |||
f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \ | |||
private. Please login first.') | |||
raise | |||
@@ -2,7 +2,8 @@ import os | |||
from typing import Optional | |||
from modelscope.hub.errors import GitError, InvalidParameter, NotLoginException | |||
from modelscope.utils.constant import DEFAULT_MODEL_REVISION | |||
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, | |||
DEFAULT_MODEL_REVISION) | |||
from modelscope.utils.logger import get_logger | |||
from .api import ModelScopeConfig | |||
from .git import GitCommandWrapper | |||
@@ -15,14 +16,12 @@ class Repository: | |||
"""A local representation of the model git repository. | |||
""" | |||
def __init__( | |||
self, | |||
model_dir: str, | |||
clone_from: str, | |||
revision: Optional[str] = DEFAULT_MODEL_REVISION, | |||
auth_token: Optional[str] = None, | |||
git_path: Optional[str] = None, | |||
): | |||
def __init__(self, | |||
model_dir: str, | |||
clone_from: str, | |||
revision: Optional[str] = DEFAULT_MODEL_REVISION, | |||
auth_token: Optional[str] = None, | |||
git_path: Optional[str] = None): | |||
""" | |||
Instantiate a Repository object by cloning the remote ModelScopeHub repo | |||
Args: | |||
@@ -86,6 +85,7 @@ class Repository: | |||
branch: Optional[str] = DEFAULT_MODEL_REVISION, | |||
force: bool = False): | |||
"""Push local files to remote, this method will do. | |||
git pull | |||
git add | |||
git commit | |||
git push | |||
@@ -117,3 +117,105 @@ class Repository: | |||
url=url, | |||
local_branch=branch, | |||
remote_branch=branch) | |||
class DatasetRepository: | |||
"""A local representation of the dataset (metadata) git repository. | |||
""" | |||
def __init__(self, | |||
repo_work_dir: str, | |||
dataset_id: str, | |||
revision: Optional[str] = DEFAULT_DATASET_REVISION, | |||
auth_token: Optional[str] = None, | |||
git_path: Optional[str] = None): | |||
""" | |||
Instantiate a Dataset Repository object by cloning the remote ModelScope dataset repo | |||
Args: | |||
repo_work_dir(`str`): | |||
The dataset repo root directory. | |||
dataset_id: | |||
dataset id in ModelScope from which git clone | |||
revision(`Optional[str]`): | |||
revision of the dataset you want to clone from. Can be any of a branch, tag or commit hash | |||
auth_token(`Optional[str]`): | |||
token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter | |||
as the token is already saved the first time you log in; if None, the saved token is used.
git_path:(`Optional[str]`): | |||
The git command line path, if None, we use 'git' | |||
""" | |||
self.dataset_id = dataset_id | |||
self.repo_work_dir = repo_work_dir | |||
self.repo_base_dir = os.path.dirname(repo_work_dir) | |||
self.repo_name = os.path.basename(repo_work_dir) | |||
self.revision = revision | |||
if auth_token: | |||
self.auth_token = auth_token | |||
else: | |||
self.auth_token = ModelScopeConfig.get_token() | |||
self.git_wrapper = GitCommandWrapper(git_path) | |||
os.makedirs(self.repo_work_dir, exist_ok=True) | |||
self.repo_url = self._get_repo_url(dataset_id=dataset_id) | |||
def clone(self) -> str: | |||
# check local repo dir, directory not empty. | |||
if os.listdir(self.repo_work_dir): | |||
remote_url = self._get_remote_url() | |||
remote_url = self.git_wrapper.remove_token_from_url(remote_url) | |||
# no need clone again | |||
if remote_url and remote_url == self.repo_url: | |||
return '' | |||
logger.info('Cloning repo from {} '.format(self.repo_url)) | |||
self.git_wrapper.clone(self.repo_base_dir, self.auth_token, | |||
self.repo_url, self.repo_name, self.revision) | |||
return self.repo_work_dir | |||
def push(self, | |||
commit_message: str, | |||
branch: Optional[str] = DEFAULT_DATASET_REVISION, | |||
force: bool = False): | |||
"""Push local files to remote, this method will do. | |||
git pull | |||
git add | |||
git commit | |||
git push | |||
Args: | |||
commit_message (str): commit message | |||
branch (Optional[str], optional): which branch to push. | |||
force (Optional[bool]): whether to use forced-push. | |||
""" | |||
if commit_message is None or not isinstance(commit_message, str): | |||
msg = 'commit_message must be provided!' | |||
raise InvalidParameter(msg) | |||
if not isinstance(force, bool): | |||
raise InvalidParameter('force must be bool') | |||
if not self.auth_token: | |||
raise NotLoginException('Must login to push, please login first.') | |||
self.git_wrapper.config_auth_token(self.repo_work_dir, self.auth_token) | |||
self.git_wrapper.add_user_info(self.repo_base_dir, self.repo_name) | |||
remote_url = self.git_wrapper.get_repo_remote_url(self.repo_work_dir) | |||
self.git_wrapper.pull(self.repo_work_dir) | |||
self.git_wrapper.add(self.repo_work_dir, all_files=True) | |||
self.git_wrapper.commit(self.repo_work_dir, commit_message) | |||
self.git_wrapper.push( | |||
repo_dir=self.repo_work_dir, | |||
token=self.auth_token, | |||
url=remote_url, | |||
local_branch=branch, | |||
remote_branch=branch) | |||
def _get_repo_url(self, dataset_id): | |||
return f'{get_endpoint()}/datasets/{dataset_id}.git' | |||
def _get_remote_url(self): | |||
try: | |||
remote = self.git_wrapper.get_repo_remote_url(self.repo_work_dir) | |||
except GitError: | |||
remote = None | |||
return remote |
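A hypothetical usage sketch of the new DatasetRepository; the dataset id and local path below are placeholders, and a prior HubApi login is assumed:

```python
# Hypothetical sketch: clone a dataset repo, edit its metadata, and push.
from modelscope.hub.repository import DatasetRepository

repo = DatasetRepository(
    repo_work_dir='/tmp/some_dataset',   # placeholder local checkout directory
    dataset_id='damo/some_dataset')      # placeholder <namespace>/<dataset_name>
repo.clone()                             # skipped if the directory already tracks the same remote
# ... edit metadata files under /tmp/some_dataset ...
repo.push(commit_message='update dataset meta')
```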
@@ -1,7 +1,9 @@ | |||
import hashlib | |||
import os | |||
from typing import Optional | |||
from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, | |||
from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT, | |||
DEFAULT_MODELSCOPE_DOMAIN, | |||
DEFAULT_MODELSCOPE_GROUP, | |||
MODEL_ID_SEPARATOR, | |||
MODELSCOPE_URL_SCHEME) | |||
@@ -22,14 +24,16 @@ def model_id_to_group_owner_name(model_id): | |||
return group_or_owner, name | |||
def get_cache_dir(): | |||
def get_cache_dir(model_id: Optional[str] = None): | |||
""" | |||
cache dir precedence: | |||
function parameter > environment > ~/.cache/modelscope/hub
""" | |||
default_cache_dir = get_default_cache_dir() | |||
return os.getenv('MODELSCOPE_CACHE', os.path.join(default_cache_dir, | |||
'hub')) | |||
base_path = os.getenv('MODELSCOPE_CACHE', | |||
os.path.join(default_cache_dir, 'hub')) | |||
return base_path if model_id is None else os.path.join( | |||
base_path, model_id + '/') | |||
def get_endpoint(): | |||
@@ -38,6 +42,11 @@ def get_endpoint(): | |||
return MODELSCOPE_URL_SCHEME + modelscope_domain | |||
def get_dataset_hub_endpoint(): | |||
return os.environ.get('HUB_DATASET_ENDPOINT', | |||
DEFAULT_MODELSCOPE_DATA_ENDPOINT) | |||
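A small sketch of the documented lookup order; the environment value and model id are invented, and the import path is assumed to be modelscope.hub.utils.utils:

```python
# Hypothetical sketch: MODELSCOPE_CACHE overrides the default ~/.cache/modelscope/hub location.
import os
from modelscope.hub.utils.utils import get_cache_dir, get_dataset_hub_endpoint  # assumed module path

os.environ['MODELSCOPE_CACHE'] = '/data/ms_cache'      # placeholder value
print(get_cache_dir())                                 # -> /data/ms_cache
print(get_cache_dir('damo/some_model'))                # -> /data/ms_cache/damo/some_model/
print(get_dataset_hub_endpoint())                      # HUB_DATASET_ENDPOINT if set, else the default data endpoint
```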
def compute_hash(file_path): | |||
BUFFER_SIZE = 1024 * 64 # 64k buffer size | |||
sha256_hash = hashlib.sha256() | |||
@@ -11,6 +11,7 @@ class Models(object): | |||
""" | |||
# vision models | |||
detection = 'detection' | |||
realtime_object_detection = 'realtime-object-detection' | |||
scrfd = 'scrfd' | |||
classification_model = 'ClassificationModel' | |||
nafnet = 'nafnet' | |||
@@ -19,7 +20,18 @@ class Models(object): | |||
gpen = 'gpen' | |||
product_retrieval_embedding = 'product-retrieval-embedding' | |||
body_2d_keypoints = 'body-2d-keypoints' | |||
body_3d_keypoints = 'body-3d-keypoints' | |||
crowd_counting = 'HRNetCrowdCounting' | |||
panoptic_segmentation = 'swinL-panoptic-segmentation' | |||
image_reid_person = 'passvitb' | |||
video_summarization = 'pgl-video-summarization' | |||
swinL_semantic_segmentation = 'swinL-semantic-segmentation' | |||
vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' | |||
resnet50_bert = 'resnet50-bert' | |||
# EasyCV models | |||
yolox = 'YOLOX' | |||
segformer = 'Segformer' | |||
# nlp models | |||
bert = 'bert' | |||
@@ -32,8 +44,10 @@ class Models(object): | |||
space_modeling = 'space-modeling' | |||
star = 'star' | |||
tcrf = 'transformer-crf' | |||
lcrf = 'lstm-crf' | |||
bart = 'bart' | |||
gpt3 = 'gpt3' | |||
bert_for_ds = 'bert-for-document-segmentation' | |||
# audio models | |||
sambert_hifigan = 'sambert-hifigan' | |||
@@ -48,12 +62,14 @@ class Models(object): | |||
gemm = 'gemm-generative-multi-modal' | |||
mplug = 'mplug' | |||
diffusion = 'diffusion-text-to-image-synthesis' | |||
team = 'team-multi-modal-similarity' | |||
video_clip = 'video-clip-multi-modal-embedding' | |||
class TaskModels(object): | |||
# nlp task | |||
text_classification = 'text-classification' | |||
information_extraction = 'information-extraction' | |||
class Heads(object): | |||
@@ -63,6 +79,7 @@ class Heads(object): | |||
bert_mlm = 'bert-mlm' | |||
# roberta mlm | |||
roberta_mlm = 'roberta-mlm' | |||
information_extraction = 'information-extraction' | |||
class Pipelines(object): | |||
@@ -84,9 +101,13 @@ class Pipelines(object): | |||
animal_recognition = 'resnet101-animal-recognition' | |||
general_recognition = 'resnet101-general-recognition' | |||
cmdssl_video_embedding = 'cmdssl-r2p1d_video_embedding' | |||
hicossl_video_embedding = 'hicossl-s3dg-video_embedding' | |||
body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image' | |||
body_3d_keypoints = 'canonical_body-3d-keypoints_video' | |||
human_detection = 'resnet18-human-detection' | |||
object_detection = 'vit-object-detection' | |||
easycv_detection = 'easycv-detection' | |||
easycv_segmentation = 'easycv-segmentation' | |||
salient_detection = 'u2net-salient-detection' | |||
image_classification = 'image-classification' | |||
face_detection = 'resnet-face-detection-scrfd10gkps' | |||
@@ -100,6 +121,7 @@ class Pipelines(object): | |||
image_super_resolution = 'rrdb-image-super-resolution' | |||
face_image_generation = 'gan-face-image-generation' | |||
product_retrieval_embedding = 'resnet50-product-retrieval-embedding' | |||
realtime_object_detection = 'cspnet_realtime-object-detection_yolox' | |||
face_recognition = 'ir101-face-recognition-cfglint' | |||
image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation' | |||
image2image_translation = 'image-to-image-translation' | |||
@@ -112,6 +134,11 @@ class Pipelines(object): | |||
tinynas_classification = 'tinynas-classification' | |||
crowd_counting = 'hrnet-crowd-counting' | |||
video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking' | |||
image_panoptic_segmentation = 'image-panoptic-segmentation' | |||
video_summarization = 'googlenet_pgl_video_summarization' | |||
image_semantic_segmentation = 'image-semantic-segmentation' | |||
image_reid_person = 'passvitb-image-reid-person' | |||
movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' | |||
# nlp tasks | |||
sentence_similarity = 'sentence-similarity' | |||
@@ -129,7 +156,10 @@ class Pipelines(object): | |||
dialog_state_tracking = 'dialog-state-tracking' | |||
zero_shot_classification = 'zero-shot-classification' | |||
text_error_correction = 'text-error-correction' | |||
faq_question_answering = 'faq-question-answering' | |||
conversational_text_to_sql = 'conversational-text-to-sql' | |||
relation_extraction = 'relation-extraction' | |||
document_segmentation = 'document-segmentation' | |||
# audio tasks | |||
sambert_hifigan_tts = 'sambert-hifigan-tts' | |||
@@ -146,8 +176,10 @@ class Pipelines(object): | |||
visual_question_answering = 'visual-question-answering' | |||
visual_grounding = 'visual-grounding' | |||
visual_entailment = 'visual-entailment' | |||
multi_modal_similarity = 'multi-modal-similarity' | |||
text_to_image_synthesis = 'text-to-image-synthesis' | |||
video_multi_modal_embedding = 'video-multi-modal-embedding' | |||
image_text_retrieval = 'image-text-retrieval' | |||
class Trainers(object): | |||
@@ -161,6 +193,7 @@ class Trainers(object): | |||
""" | |||
default = 'trainer' | |||
easycv = 'easycv' | |||
# multi-modal trainers | |||
clip_multi_modal_embedding = 'clip-multi-modal-embedding' | |||
@@ -169,12 +202,17 @@ class Trainers(object): | |||
# cv trainers | |||
image_instance_segmentation = 'image-instance-segmentation' | |||
image_portrait_enhancement = 'image-portrait-enhancement' | |||
video_summarization = 'video-summarization' | |||
movie_scene_segmentation = 'movie-scene-segmentation' | |||
# nlp trainers | |||
bert_sentiment_analysis = 'bert-sentiment-analysis' | |||
nlp_base_trainer = 'nlp-base-trainer' | |||
nlp_veco_trainer = 'nlp-veco-trainer' | |||
# audio trainers | |||
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' | |||
class Preprocessors(object): | |||
""" Names for different preprocessor. | |||
@@ -193,6 +231,8 @@ class Preprocessors(object): | |||
image_color_enhance_preprocessor = 'image-color-enhance-preprocessor' | |||
image_instance_segmentation_preprocessor = 'image-instance-segmentation-preprocessor' | |||
image_portrait_enhancement_preprocessor = 'image-portrait-enhancement-preprocessor' | |||
video_summarization_preprocessor = 'video-summarization-preprocessor' | |||
movie_scene_segmentation_preprocessor = 'movie-scene-segmentation-preprocessor' | |||
# nlp preprocessor | |||
sen_sim_tokenizer = 'sen-sim-tokenizer' | |||
@@ -210,7 +250,10 @@ class Preprocessors(object): | |||
text_error_correction = 'text-error-correction' | |||
word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor' | |||
fill_mask = 'fill-mask' | |||
faq_question_answering_preprocessor = 'faq-question-answering-preprocessor' | |||
conversational_text_to_sql = 'conversational-text-to-sql' | |||
re_tokenizer = 're-tokenizer' | |||
document_segmentation = 'document-segmentation' | |||
# audio preprocessor | |||
linear_aec_fbank = 'linear-aec-fbank' | |||
@@ -229,6 +272,7 @@ class Metrics(object): | |||
# accuracy | |||
accuracy = 'accuracy' | |||
audio_noise_metric = 'audio-noise-metric' | |||
# metrics for image denoise task | |||
image_denoise_metric = 'image-denoise-metric' | |||
@@ -245,6 +289,9 @@ class Metrics(object): | |||
image_color_enhance_metric = 'image-color-enhance-metric' | |||
# metrics for image-portrait-enhancement task | |||
image_portrait_enhancement_metric = 'image-portrait-enhancement-metric' | |||
video_summarization_metric = 'video-summarization-metric' | |||
# metric for movie-scene-segmentation task | |||
movie_scene_segmentation_metric = 'movie-scene-segmentation-metric' | |||
class Optimizers(object): | |||
@@ -294,3 +341,12 @@ class LR_Schedulers(object): | |||
LinearWarmup = 'LinearWarmup' | |||
ConstantWarmup = 'ConstantWarmup' | |||
ExponentialWarmup = 'ExponentialWarmup' | |||
class Datasets(object): | |||
""" Names for different datasets. | |||
""" | |||
ClsDataset = 'ClsDataset' | |||
SegDataset = 'SegDataset' | |||
DetDataset = 'DetDataset' | |||
DetImagesMixDataset = 'DetImagesMixDataset' |
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .audio_noise_metric import AudioNoiseMetric | |||
from .base import Metric | |||
from .builder import METRICS, build_metric, task_default_metrics | |||
from .image_color_enhance_metric import ImageColorEnhanceMetric | |||
@@ -14,9 +15,12 @@ if TYPE_CHECKING: | |||
from .sequence_classification_metric import SequenceClassificationMetric | |||
from .text_generation_metric import TextGenerationMetric | |||
from .token_classification_metric import TokenClassificationMetric | |||
from .video_summarization_metric import VideoSummarizationMetric | |||
from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric | |||
else: | |||
_import_structure = { | |||
'audio_noise_metric': ['AudioNoiseMetric'], | |||
'base': ['Metric'], | |||
'builder': ['METRICS', 'build_metric', 'task_default_metrics'], | |||
'image_color_enhance_metric': ['ImageColorEnhanceMetric'], | |||
@@ -28,6 +32,8 @@ else: | |||
'sequence_classification_metric': ['SequenceClassificationMetric'], | |||
'text_generation_metric': ['TextGenerationMetric'], | |||
'token_classification_metric': ['TokenClassificationMetric'], | |||
'video_summarization_metric': ['VideoSummarizationMetric'], | |||
'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'], | |||
} | |||
import sys | |||
@@ -0,0 +1,38 @@ | |||
from typing import Dict | |||
from modelscope.metainfo import Metrics | |||
from modelscope.metrics.base import Metric | |||
from modelscope.metrics.builder import METRICS, MetricKeys | |||
from modelscope.utils.registry import default_group | |||
@METRICS.register_module( | |||
group_key=default_group, module_name=Metrics.audio_noise_metric) | |||
class AudioNoiseMetric(Metric): | |||
""" | |||
The metric computation class for acoustic noise suppression task. | |||
""" | |||
def __init__(self): | |||
self.loss = [] | |||
self.amp_loss = [] | |||
self.phase_loss = [] | |||
self.sisnr = [] | |||
def add(self, outputs: Dict, inputs: Dict): | |||
self.loss.append(outputs['loss'].data.cpu()) | |||
self.amp_loss.append(outputs['amp_loss'].data.cpu()) | |||
self.phase_loss.append(outputs['phase_loss'].data.cpu()) | |||
self.sisnr.append(outputs['sisnr'].data.cpu()) | |||
def evaluate(self): | |||
avg_loss = sum(self.loss) / len(self.loss) | |||
avg_sisnr = sum(self.sisnr) / len(self.sisnr) | |||
avg_amp = sum(self.amp_loss) / len(self.amp_loss) | |||
avg_phase = sum(self.phase_loss) / len(self.phase_loss) | |||
total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr | |||
return { | |||
'total_loss': total_loss.item(), | |||
'avg_sisnr': avg_sisnr.item(), | |||
MetricKeys.AVERAGE_LOSS: avg_loss.item() | |||
} |
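A rough sketch of the add/evaluate cycle this metric expects from the trainer; the tensor values are invented:

```python
# Hypothetical sketch: accumulate per-batch losses, then aggregate.
import torch
from modelscope.metrics.audio_noise_metric import AudioNoiseMetric

metric = AudioNoiseMetric()
for _ in range(3):
    outputs = {
        'loss': torch.tensor(0.8),        # invented values
        'amp_loss': torch.tensor(0.3),
        'phase_loss': torch.tensor(0.2),
        'sisnr': torch.tensor(-12.5),
    }
    metric.add(outputs, inputs={})
print(metric.evaluate())  # {'total_loss': ..., 'avg_sisnr': ..., 'avg_loss': ...}
```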
@@ -1,4 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import Dict, Mapping, Union | |||
from modelscope.metainfo import Metrics | |||
from modelscope.utils.config import ConfigDict | |||
@@ -15,6 +16,8 @@ class MetricKeys(object): | |||
RECALL = 'recall' | |||
PSNR = 'psnr' | |||
SSIM = 'ssim' | |||
AVERAGE_LOSS = 'avg_loss' | |||
FScore = 'fscore' | |||
task_default_metrics = { | |||
@@ -28,19 +31,26 @@ task_default_metrics = { | |||
Tasks.image_color_enhancement: [Metrics.image_color_enhance_metric], | |||
Tasks.image_portrait_enhancement: | |||
[Metrics.image_portrait_enhancement_metric], | |||
Tasks.video_summarization: [Metrics.video_summarization_metric], | |||
Tasks.image_captioning: [Metrics.text_gen_metric], | |||
Tasks.visual_question_answering: [Metrics.text_gen_metric], | |||
Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric], | |||
} | |||
def build_metric(metric_name: str, | |||
def build_metric(metric_cfg: Union[str, Dict], | |||
field: str = default_group, | |||
default_args: dict = None): | |||
""" Build metric given metric_name and field. | |||
Args: | |||
metric_name (:obj:`str`): The metric name. | |||
metric_cfg (str | dict): The metric name or metric config dict.
field (str, optional): The field of this metric, default value: 'default' for all fields. | |||
default_args (dict, optional): Default initialization arguments. | |||
""" | |||
cfg = ConfigDict({'type': metric_name}) | |||
if isinstance(metric_cfg, Mapping): | |||
assert 'type' in metric_cfg | |||
else: | |||
metric_cfg = ConfigDict({'type': metric_cfg}) | |||
return build_from_cfg( | |||
cfg, METRICS, group_key=field, default_args=default_args) | |||
metric_cfg, METRICS, group_key=field, default_args=default_args) |
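Both call forms accepted after this change, sketched under the assumption that the referenced metrics are already registered:

```python
# Hypothetical sketch: the builder now accepts a bare metric name or a config dict.
from modelscope.metainfo import Metrics
from modelscope.metrics.builder import build_metric

m1 = build_metric(Metrics.audio_noise_metric)                    # str -> wrapped into {'type': ...}
m2 = build_metric({'type': Metrics.video_summarization_metric})  # dict must carry a 'type' key
```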
@@ -0,0 +1,52 @@ | |||
from typing import Dict | |||
import numpy as np | |||
from modelscope.metainfo import Metrics | |||
from modelscope.utils.registry import default_group | |||
from modelscope.utils.tensor_utils import (torch_nested_detach, | |||
torch_nested_numpify) | |||
from .base import Metric | |||
from .builder import METRICS, MetricKeys | |||
@METRICS.register_module( | |||
group_key=default_group, | |||
module_name=Metrics.movie_scene_segmentation_metric) | |||
class MovieSceneSegmentationMetric(Metric): | |||
"""The metric computation class for movie scene segmentation classes. | |||
""" | |||
def __init__(self): | |||
self.preds = [] | |||
self.labels = [] | |||
self.eps = 1e-5 | |||
def add(self, outputs: Dict, inputs: Dict): | |||
preds = outputs['pred'] | |||
labels = inputs['label'] | |||
self.preds.extend(preds) | |||
self.labels.extend(labels) | |||
def evaluate(self): | |||
gts = np.array(torch_nested_numpify(torch_nested_detach(self.labels))) | |||
prob = np.array(torch_nested_numpify(torch_nested_detach(self.preds))) | |||
gt_one = gts == 1 | |||
gt_zero = gts == 0 | |||
pred_one = prob == 1 | |||
pred_zero = prob == 0 | |||
tp = (gt_one * pred_one).sum() | |||
fp = (gt_zero * pred_one).sum() | |||
fn = (gt_one * pred_zero).sum() | |||
precision = 100.0 * tp / (tp + fp + self.eps) | |||
recall = 100.0 * tp / (tp + fn + self.eps) | |||
f1 = 2 * precision * recall / (precision + recall) | |||
return { | |||
MetricKeys.F1: f1, | |||
MetricKeys.RECALL: recall, | |||
MetricKeys.PRECISION: precision | |||
} |
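A tiny worked example (invented predictions and labels) of the precision/recall/F1 computation above:

```python
# Hypothetical sketch: binary scene-boundary predictions vs. ground-truth labels.
import torch
from modelscope.metrics.movie_scene_segmentation_metric import MovieSceneSegmentationMetric

metric = MovieSceneSegmentationMetric()
metric.add(outputs={'pred': torch.tensor([1, 0, 1, 0])},
           inputs={'label': torch.tensor([1, 1, 0, 0])})
# tp=1, fp=1, fn=1 -> precision ~= recall ~= f1 ~= 50.0
print(metric.evaluate())
```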
@@ -0,0 +1,78 @@ | |||
from typing import Dict | |||
import numpy as np | |||
from modelscope.metainfo import Metrics | |||
from modelscope.models.cv.video_summarization.summarizer import \ | |||
generate_summary | |||
from modelscope.utils.registry import default_group | |||
from .base import Metric | |||
from .builder import METRICS, MetricKeys | |||
def evaluate_summary(predicted_summary, user_summary, eval_method): | |||
""" Compare the predicted summary with the user defined one(s). | |||
:param ndarray predicted_summary: The generated summary from our model. | |||
:param ndarray user_summary: The user defined ground truth summaries (or summary). | |||
:param str eval_method: The proposed evaluation method; either 'max' (SumMe) or 'avg' (TVSum). | |||
:return: The reduced fscore based on the eval_method | |||
""" | |||
max_len = max(len(predicted_summary), user_summary.shape[1]) | |||
S = np.zeros(max_len, dtype=int) | |||
G = np.zeros(max_len, dtype=int) | |||
S[:len(predicted_summary)] = predicted_summary | |||
f_scores = [] | |||
for user in range(user_summary.shape[0]): | |||
G[:user_summary.shape[1]] = user_summary[user] | |||
overlapped = S & G | |||
# Compute precision, recall, f-score | |||
precision = sum(overlapped) / sum(S) | |||
recall = sum(overlapped) / sum(G) | |||
if precision + recall == 0: | |||
f_scores.append(0) | |||
else: | |||
f_score = 2 * precision * recall * 100 / (precision + recall) | |||
f_scores.append(f_score) | |||
if eval_method == 'max': | |||
return max(f_scores) | |||
else: | |||
return sum(f_scores) / len(f_scores) | |||
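A tiny worked example (invented arrays) of the overlap-based F-score defined above:

```python
# Hypothetical numbers: a 6-frame video with a single user summary.
import numpy as np
from modelscope.metrics.video_summarization_metric import evaluate_summary  # assumed import path

predicted = np.array([1, 0, 1, 1, 0, 0])
user = np.array([[1, 1, 1, 0, 0, 0]])      # shape (n_users, n_frames)
# overlap = [1, 0, 1, 0, 0, 0] -> precision = recall = 2/3 -> F ~= 66.7
print(evaluate_summary(predicted, user, 'avg'))
```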
def calculate_f_score(outputs: Dict, inputs: Dict): | |||
scores = outputs['scores'] | |||
scores = scores.squeeze(0).cpu().numpy().tolist() | |||
user_summary = inputs['user_summary'].cpu().numpy()[0] | |||
sb = inputs['change_points'].cpu().numpy()[0] | |||
n_frames = inputs['n_frames'].cpu().numpy()[0] | |||
positions = inputs['positions'].cpu().numpy()[0] | |||
summary = generate_summary([sb], [scores], [n_frames], [positions])[0] | |||
f_score = evaluate_summary(summary, user_summary, 'avg') | |||
return f_score | |||
@METRICS.register_module( | |||
group_key=default_group, module_name=Metrics.video_summarization_metric) | |||
class VideoSummarizationMetric(Metric): | |||
"""The metric for video summarization task. | |||
""" | |||
def __init__(self): | |||
self.inputs = [] | |||
self.outputs = [] | |||
def add(self, outputs: Dict, inputs: Dict): | |||
self.outputs.append(outputs) | |||
self.inputs.append(inputs) | |||
def evaluate(self): | |||
f_scores = [ | |||
calculate_f_score(output, input) | |||
for output, input in zip(self.outputs, self.inputs) | |||
] | |||
return {MetricKeys.FScore: sum(f_scores) / len(f_scores)} |
@@ -75,27 +75,37 @@ class FRCRNModel(TorchModel): | |||
model_bin_file = os.path.join(model_dir, | |||
ModelFile.TORCH_MODEL_BIN_FILE) | |||
if os.path.exists(model_bin_file): | |||
checkpoint = torch.load(model_bin_file) | |||
self.model.load_state_dict(checkpoint, strict=False) | |||
checkpoint = torch.load( | |||
model_bin_file, map_location=torch.device('cpu')) | |||
if isinstance(checkpoint, dict) and 'state_dict' in checkpoint: | |||
self.model.load_state_dict( | |||
checkpoint['state_dict'], strict=False) | |||
else: | |||
self.model.load_state_dict(checkpoint, strict=False) | |||
def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: | |||
output = self.model.forward(input) | |||
return { | |||
'spec_l1': output[0], | |||
'wav_l1': output[1], | |||
'mask_l1': output[2], | |||
'spec_l2': output[3], | |||
'wav_l2': output[4], | |||
'mask_l2': output[5] | |||
result_list = self.model.forward(input['noisy']) | |||
output = { | |||
'spec_l1': result_list[0], | |||
'wav_l1': result_list[1], | |||
'mask_l1': result_list[2], | |||
'spec_l2': result_list[3], | |||
'wav_l2': result_list[4], | |||
'mask_l2': result_list[5] | |||
} | |||
def to(self, *args, **kwargs): | |||
self.model = self.model.to(*args, **kwargs) | |||
return self | |||
def eval(self): | |||
self.model = self.model.train(False) | |||
return self | |||
if 'clean' in input: | |||
mix_result = self.model.loss( | |||
input['noisy'], input['clean'], result_list, mode='Mix') | |||
output.update(mix_result) | |||
sisnr_result = self.model.loss( | |||
input['noisy'], input['clean'], result_list, mode='SiSNR') | |||
output.update(sisnr_result) | |||
# logger hooker will use items under 'log_vars' | |||
output['log_vars'] = {k: mix_result[k].item() for k in mix_result} | |||
output['log_vars'].update( | |||
{k: sisnr_result[k].item() | |||
for k in sisnr_result}) | |||
return output | |||
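A hedged sketch of the new forward contract: 'noisy' is required and 'clean' additionally triggers loss computation; the model instance and tensor shapes below are assumptions:

```python
# Hypothetical sketch: 'noisy' drives enhancement outputs; adding 'clean'
# also produces loss terms mirrored into output['log_vars'].
import torch

batch = {
    'noisy': torch.randn(2, 16000),   # invented (batch, samples) waveforms
    'clean': torch.randn(2, 16000),
}
out = frcrn_model.forward(batch)      # frcrn_model: an already-built FRCRNModel (assumed)
print(sorted(out['log_vars']))        # ['amp_loss', 'loss', 'phase_loss', 'sisnr']
```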
class FRCRN(nn.Module): | |||
@@ -110,7 +120,8 @@ class FRCRN(nn.Module): | |||
win_len=400, | |||
win_inc=100, | |||
fft_len=512, | |||
win_type='hanning'): | |||
win_type='hanning', | |||
**kwargs): | |||
r""" | |||
Args: | |||
complex: Whether to use complex networks. | |||
@@ -236,7 +247,7 @@ class FRCRN(nn.Module): | |||
if count != 3: | |||
loss = self.loss_1layer(noisy, est_spec, est_wav, labels, | |||
est_mask, mode) | |||
return loss | |||
return dict(sisnr=loss) | |||
elif mode == 'Mix': | |||
count = 0 | |||
@@ -251,7 +262,7 @@ class FRCRN(nn.Module): | |||
amp_loss, phase_loss, SiSNR_loss = self.loss_1layer( | |||
noisy, est_spec, est_wav, labels, est_mask, mode) | |||
loss = amp_loss + phase_loss + SiSNR_loss | |||
return loss, amp_loss, phase_loss | |||
return dict(loss=loss, amp_loss=amp_loss, phase_loss=phase_loss) | |||
def loss_1layer(self, noisy, est, est_wav, labels, cmp_mask, mode='Mix'): | |||
r""" Compute the loss by mode | |||
@@ -33,6 +33,7 @@ class FSMNSeleNetV2Decorator(TorchModel): | |||
ModelFile.TORCH_MODEL_BIN_FILE) | |||
self._model = None | |||
if os.path.exists(model_bin_file): | |||
kwargs.pop('device') | |||
self._model = FSMNSeleNetV2(*args, **kwargs) | |||
checkpoint = torch.load(model_bin_file) | |||
self._model.load_state_dict(checkpoint, strict=False) | |||
@@ -1,15 +1,15 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import os.path as osp | |||
from abc import ABC, abstractmethod | |||
from typing import Dict, Optional, Union | |||
import numpy as np | |||
from typing import Callable, Dict, List, Optional, Union | |||
from modelscope.hub.snapshot_download import snapshot_download | |||
from modelscope.models.builder import build_model | |||
from modelscope.utils.checkpoint import save_pretrained | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile | |||
from modelscope.utils.device import device_placement, verify_device | |||
from modelscope.utils.file_utils import func_receive_dict_inputs | |||
from modelscope.utils.hub import parse_label_mapping | |||
from modelscope.utils.logger import get_logger | |||
@@ -24,8 +24,7 @@ class Model(ABC): | |||
def __init__(self, model_dir, *args, **kwargs): | |||
self.model_dir = model_dir | |||
device_name = kwargs.get('device', 'gpu') | |||
assert device_name in ['gpu', | |||
'cpu'], 'device should be either cpu or gpu.' | |||
verify_device(device_name) | |||
self._device_name = device_name | |||
def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: | |||
@@ -72,6 +71,7 @@ class Model(ABC): | |||
model_name_or_path: str, | |||
revision: Optional[str] = DEFAULT_MODEL_REVISION, | |||
cfg_dict: Config = None, | |||
device: str = None, | |||
*model_args, | |||
**kwargs): | |||
""" Instantiate a model from local directory or remote model repo. Note | |||
@@ -97,7 +97,7 @@ class Model(ABC): | |||
osp.join(local_model_dir, ModelFile.CONFIGURATION)) | |||
task_name = cfg.task | |||
model_cfg = cfg.model | |||
# TODO @wenmeng.zwm may should manually initialize model after model building | |||
framework = cfg.framework | |||
if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): | |||
model_cfg.type = model_cfg.model_type | |||
@@ -105,10 +105,41 @@ class Model(ABC): | |||
model_cfg.model_dir = local_model_dir | |||
for k, v in kwargs.items(): | |||
model_cfg[k] = v | |||
model = build_model( | |||
model_cfg, task_name=task_name, default_args=kwargs) | |||
if device is not None: | |||
model_cfg.device = device | |||
with device_placement(framework, device): | |||
model = build_model( | |||
model_cfg, task_name=task_name, default_args=kwargs) | |||
else: | |||
model = build_model( | |||
model_cfg, task_name=task_name, default_args=kwargs) | |||
# dynamically add pipeline info to model for pipeline inference | |||
if hasattr(cfg, 'pipeline'): | |||
model.pipeline = cfg.pipeline | |||
return model | |||
def save_pretrained(self, | |||
target_folder: Union[str, os.PathLike], | |||
save_checkpoint_names: Union[str, List[str]] = None, | |||
save_function: Callable = None, | |||
config: Optional[dict] = None, | |||
**kwargs): | |||
"""save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded | |||
Args: | |||
target_folder (Union[str, os.PathLike]): | |||
Directory to which to save. Will be created if it doesn't exist. | |||
save_checkpoint_names (Union[str, List[str]]): | |||
The checkpoint names to be saved in the target_folder | |||
save_function (Callable, optional): | |||
The function to use to save the state dictionary. | |||
config (Optional[dict], optional): | |||
The config for the configuration.json, might not be identical with model.config | |||
""" | |||
save_pretrained(self, target_folder, save_checkpoint_names, | |||
save_function, config, **kwargs) |
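A hedged usage sketch of the new device argument and save_pretrained; the model id, target folder, and checkpoint name are placeholders:

```python
# Hypothetical sketch: load a model onto an explicit device, then export it.
from modelscope.models import Model

model = Model.from_pretrained('damo/some_model', device='cpu')  # placeholder model id
model.save_pretrained(
    target_folder='./exported_model',                # created if it does not exist
    save_checkpoint_names='pytorch_model.bin')       # assumed checkpoint file name
```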
@@ -1,9 +1,17 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# yapf: disable | |||
from . import (action_recognition, animal_recognition, body_2d_keypoints, | |||
cartoon, cmdssl_video_embedding, crowd_counting, face_detection, | |||
face_generation, image_classification, image_color_enhance, | |||
image_colorization, image_denoise, image_instance_segmentation, | |||
image_portrait_enhancement, image_to_image_generation, | |||
image_to_image_translation, object_detection, | |||
product_retrieval_embedding, salient_detection, | |||
super_resolution, video_single_object_tracking, virual_tryon) | |||
body_3d_keypoints, cartoon, cmdssl_video_embedding, | |||
crowd_counting, face_detection, face_generation, | |||
image_classification, image_color_enhance, image_colorization, | |||
image_denoise, image_instance_segmentation, | |||
image_panoptic_segmentation, image_portrait_enhancement, | |||
image_reid_person, image_semantic_segmentation, | |||
image_to_image_generation, image_to_image_translation, | |||
movie_scene_segmentation, object_detection, | |||
product_retrieval_embedding, realtime_object_detection, | |||
salient_detection, super_resolution, | |||
video_single_object_tracking, video_summarization, virual_tryon) | |||
# yapf: enable |
@@ -1,5 +1,6 @@ | |||
import torch.nn as nn | |||
from .s3dg import Inception3D | |||
from .tada_convnext import TadaConvNeXt | |||
@@ -26,11 +27,25 @@ class BaseVideoModel(nn.Module): | |||
super(BaseVideoModel, self).__init__() | |||
# the backbone is created according to meta-architectures | |||
# defined in models/base/backbone.py | |||
self.backbone = TadaConvNeXt(cfg) | |||
if cfg.MODEL.NAME == 'ConvNeXt_tiny': | |||
self.backbone = TadaConvNeXt(cfg) | |||
elif cfg.MODEL.NAME == 'S3DG': | |||
self.backbone = Inception3D(cfg) | |||
else: | |||
error_str = 'backbone {} is not supported, ConvNeXt_tiny or S3DG is supported'.format( | |||
cfg.MODEL.NAME) | |||
raise NotImplementedError(error_str) | |||
# the head is created according to the heads | |||
# defined in models/module_zoo/heads | |||
self.head = BaseHead(cfg) | |||
if cfg.VIDEO.HEAD.NAME == 'BaseHead': | |||
self.head = BaseHead(cfg) | |||
elif cfg.VIDEO.HEAD.NAME == 'AvgHead': | |||
self.head = AvgHead(cfg) | |||
else: | |||
error_str = 'head {} is not supported, BaseHead or AvgHead is supported'.format( | |||
cfg.VIDEO.HEAD.NAME) | |||
raise NotImplementedError(error_str) | |||
def forward(self, x): | |||
x = self.backbone(x) | |||
@@ -88,3 +103,29 @@ class BaseHead(nn.Module): | |||
out = self.activation(out) | |||
out = out.view(out.shape[0], -1) | |||
return out, x.view(x.shape[0], -1) | |||
class AvgHead(nn.Module): | |||
""" | |||
Constructs base head. | |||
""" | |||
def __init__( | |||
self, | |||
cfg, | |||
): | |||
""" | |||
Args: | |||
cfg (Config): global config object. | |||
""" | |||
super(AvgHead, self).__init__() | |||
self.cfg = cfg | |||
self.global_avg_pool = nn.AdaptiveAvgPool3d(1) | |||
def forward(self, x): | |||
if len(x.shape) == 5: | |||
x = self.global_avg_pool(x) | |||
# (N, C, T, H, W) -> (N, T, H, W, C). | |||
x = x.permute((0, 2, 3, 4, 1)) | |||
out = x.view(x.shape[0], -1) | |||
return out, x.view(x.shape[0], -1) |
@@ -0,0 +1,301 @@ | |||
import torch | |||
import torch.nn as nn | |||
class InceptionBaseConv3D(nn.Module): | |||
""" | |||
Constructs basic inception 3D conv. | |||
Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. | |||
""" | |||
def __init__(self, | |||
cfg, | |||
in_planes, | |||
out_planes, | |||
kernel_size, | |||
stride, | |||
padding=0): | |||
super(InceptionBaseConv3D, self).__init__() | |||
self.conv = nn.Conv3d( | |||
in_planes, | |||
out_planes, | |||
kernel_size=kernel_size, | |||
stride=stride, | |||
padding=padding, | |||
bias=False) | |||
self.bn = nn.BatchNorm3d(out_planes) | |||
self.relu = nn.ReLU(inplace=True) | |||
# init | |||
self.conv.weight.data.normal_( | |||
mean=0, std=0.01) # original s3d is truncated normal within 2 std | |||
self.bn.weight.data.fill_(1) | |||
self.bn.bias.data.zero_() | |||
def forward(self, x): | |||
x = self.conv(x) | |||
x = self.bn(x) | |||
x = self.relu(x) | |||
return x | |||
class InceptionBlock3D(nn.Module): | |||
""" | |||
Element constructing the S3D/S3DG. | |||
See models/base/backbone.py L99-186. | |||
Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. | |||
""" | |||
def __init__(self, cfg, in_planes, out_planes): | |||
super(InceptionBlock3D, self).__init__() | |||
_gating = cfg.VIDEO.BACKBONE.BRANCH.GATING | |||
assert len(out_planes) == 6 | |||
assert isinstance(out_planes, list) | |||
[ | |||
num_out_0_0a, num_out_1_0a, num_out_1_0b, num_out_2_0a, | |||
num_out_2_0b, num_out_3_0b | |||
] = out_planes | |||
self.branch0 = nn.Sequential( | |||
InceptionBaseConv3D( | |||
cfg, in_planes, num_out_0_0a, kernel_size=1, stride=1), ) | |||
self.branch1 = nn.Sequential( | |||
InceptionBaseConv3D( | |||
cfg, in_planes, num_out_1_0a, kernel_size=1, stride=1), | |||
STConv3d( | |||
cfg, | |||
num_out_1_0a, | |||
num_out_1_0b, | |||
kernel_size=3, | |||
stride=1, | |||
padding=1), | |||
) | |||
self.branch2 = nn.Sequential( | |||
InceptionBaseConv3D( | |||
cfg, in_planes, num_out_2_0a, kernel_size=1, stride=1), | |||
STConv3d( | |||
cfg, | |||
num_out_2_0a, | |||
num_out_2_0b, | |||
kernel_size=3, | |||
stride=1, | |||
padding=1), | |||
) | |||
self.branch3 = nn.Sequential( | |||
nn.MaxPool3d(kernel_size=(3, 3, 3), stride=1, padding=1), | |||
InceptionBaseConv3D( | |||
cfg, in_planes, num_out_3_0b, kernel_size=1, stride=1), | |||
) | |||
self.out_channels = sum( | |||
[num_out_0_0a, num_out_1_0b, num_out_2_0b, num_out_3_0b]) | |||
self.gating = _gating | |||
if _gating: | |||
self.gating_b0 = SelfGating(num_out_0_0a) | |||
self.gating_b1 = SelfGating(num_out_1_0b) | |||
self.gating_b2 = SelfGating(num_out_2_0b) | |||
self.gating_b3 = SelfGating(num_out_3_0b) | |||
def forward(self, x): | |||
x0 = self.branch0(x) | |||
x1 = self.branch1(x) | |||
x2 = self.branch2(x) | |||
x3 = self.branch3(x) | |||
if self.gating: | |||
x0 = self.gating_b0(x0) | |||
x1 = self.gating_b1(x1) | |||
x2 = self.gating_b2(x2) | |||
x3 = self.gating_b3(x3) | |||
out = torch.cat((x0, x1, x2, x3), 1) | |||
return out | |||
class SelfGating(nn.Module): | |||
def __init__(self, input_dim): | |||
super(SelfGating, self).__init__() | |||
self.fc = nn.Linear(input_dim, input_dim) | |||
def forward(self, input_tensor): | |||
"""Feature gating as used in S3D-G""" | |||
spatiotemporal_average = torch.mean(input_tensor, dim=[2, 3, 4]) | |||
weights = self.fc(spatiotemporal_average) | |||
weights = torch.sigmoid(weights) | |||
return weights[:, :, None, None, None] * input_tensor | |||
class STConv3d(nn.Module): | |||
""" | |||
Element constructing the S3D/S3DG. | |||
See models/base/backbone.py L99-186. | |||
Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. | |||
""" | |||
def __init__(self, | |||
cfg, | |||
in_planes, | |||
out_planes, | |||
kernel_size, | |||
stride, | |||
padding=0): | |||
super(STConv3d, self).__init__() | |||
if isinstance(stride, tuple): | |||
t_stride = stride[0] | |||
stride = stride[-1] | |||
else: # int | |||
t_stride = stride | |||
self.bn_mmt = cfg.BN.MOMENTUM | |||
self.bn_eps = float(cfg.BN.EPS) | |||
self._construct_branch(cfg, in_planes, out_planes, kernel_size, stride, | |||
t_stride, padding) | |||
def _construct_branch(self, | |||
cfg, | |||
in_planes, | |||
out_planes, | |||
kernel_size, | |||
stride, | |||
t_stride, | |||
padding=0): | |||
self.conv1 = nn.Conv3d( | |||
in_planes, | |||
out_planes, | |||
kernel_size=(1, kernel_size, kernel_size), | |||
stride=(1, stride, stride), | |||
padding=(0, padding, padding), | |||
bias=False) | |||
self.conv2 = nn.Conv3d( | |||
out_planes, | |||
out_planes, | |||
kernel_size=(kernel_size, 1, 1), | |||
stride=(t_stride, 1, 1), | |||
padding=(padding, 0, 0), | |||
bias=False) | |||
self.bn1 = nn.BatchNorm3d( | |||
out_planes, eps=self.bn_eps, momentum=self.bn_mmt) | |||
self.bn2 = nn.BatchNorm3d( | |||
out_planes, eps=self.bn_eps, momentum=self.bn_mmt) | |||
self.relu = nn.ReLU(inplace=True) | |||
# init | |||
self.conv1.weight.data.normal_( | |||
mean=0, std=0.01) # original s3d is truncated normal within 2 std | |||
self.conv2.weight.data.normal_( | |||
mean=0, std=0.01) # original s3d is truncated normal within 2 std | |||
self.bn1.weight.data.fill_(1) | |||
self.bn1.bias.data.zero_() | |||
self.bn2.weight.data.fill_(1) | |||
self.bn2.bias.data.zero_() | |||
def forward(self, x): | |||
x = self.conv1(x) | |||
x = self.bn1(x) | |||
x = self.relu(x) | |||
x = self.conv2(x) | |||
x = self.bn2(x) | |||
x = self.relu(x) | |||
return x | |||
class Inception3D(nn.Module): | |||
""" | |||
Backbone architecture for I3D/S3DG. | |||
Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. | |||
""" | |||
def __init__(self, cfg): | |||
""" | |||
Args: | |||
cfg (Config): global config object. | |||
""" | |||
super(Inception3D, self).__init__() | |||
_input_channel = cfg.DATA.NUM_INPUT_CHANNELS | |||
self._construct_backbone(cfg, _input_channel) | |||
def _construct_backbone(self, cfg, input_channel): | |||
# ------------------- Block 1 ------------------- | |||
self.Conv_1a = STConv3d( | |||
cfg, input_channel, 64, kernel_size=7, stride=2, padding=3) | |||
self.block1 = nn.Sequential(self.Conv_1a) # (64, 32, 112, 112) | |||
# ------------------- Block 2 ------------------- | |||
self.MaxPool_2a = nn.MaxPool3d( | |||
kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) | |||
self.Conv_2b = InceptionBaseConv3D( | |||
cfg, 64, 64, kernel_size=1, stride=1) | |||
self.Conv_2c = STConv3d( | |||
cfg, 64, 192, kernel_size=3, stride=1, padding=1) | |||
self.block2 = nn.Sequential( | |||
self.MaxPool_2a, # (64, 32, 56, 56) | |||
self.Conv_2b, # (64, 32, 56, 56) | |||
self.Conv_2c) # (192, 32, 56, 56) | |||
# ------------------- Block 3 ------------------- | |||
self.MaxPool_3a = nn.MaxPool3d( | |||
kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) | |||
self.Mixed_3b = InceptionBlock3D( | |||
cfg, in_planes=192, out_planes=[64, 96, 128, 16, 32, 32]) | |||
self.Mixed_3c = InceptionBlock3D( | |||
cfg, in_planes=256, out_planes=[128, 128, 192, 32, 96, 64]) | |||
self.block3 = nn.Sequential( | |||
self.MaxPool_3a, # (192, 32, 28, 28) | |||
self.Mixed_3b, # (256, 32, 28, 28) | |||
self.Mixed_3c) # (480, 32, 28, 28) | |||
# ------------------- Block 4 ------------------- | |||
self.MaxPool_4a = nn.MaxPool3d( | |||
kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1)) | |||
self.Mixed_4b = InceptionBlock3D( | |||
cfg, in_planes=480, out_planes=[192, 96, 208, 16, 48, 64]) | |||
self.Mixed_4c = InceptionBlock3D( | |||
cfg, in_planes=512, out_planes=[160, 112, 224, 24, 64, 64]) | |||
self.Mixed_4d = InceptionBlock3D( | |||
cfg, in_planes=512, out_planes=[128, 128, 256, 24, 64, 64]) | |||
self.Mixed_4e = InceptionBlock3D( | |||
cfg, in_planes=512, out_planes=[112, 144, 288, 32, 64, 64]) | |||
self.Mixed_4f = InceptionBlock3D( | |||
cfg, in_planes=528, out_planes=[256, 160, 320, 32, 128, 128]) | |||
self.block4 = nn.Sequential( | |||
self.MaxPool_4a, # (480, 16, 14, 14) | |||
self.Mixed_4b, # (512, 16, 14, 14) | |||
self.Mixed_4c, # (512, 16, 14, 14) | |||
self.Mixed_4d, # (512, 16, 14, 14) | |||
self.Mixed_4e, # (528, 16, 14, 14) | |||
self.Mixed_4f) # (832, 16, 14, 14) | |||
# ------------------- Block 5 ------------------- | |||
self.MaxPool_5a = nn.MaxPool3d( | |||
kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 0, 0)) | |||
self.Mixed_5b = InceptionBlock3D( | |||
cfg, in_planes=832, out_planes=[256, 160, 320, 32, 128, 128]) | |||
self.Mixed_5c = InceptionBlock3D( | |||
cfg, in_planes=832, out_planes=[384, 192, 384, 48, 128, 128]) | |||
self.block5 = nn.Sequential( | |||
self.MaxPool_5a, # (832, 8, 7, 7) | |||
self.Mixed_5b, # (832, 8, 7, 7) | |||
self.Mixed_5c) # (1024, 8, 7, 7) | |||
def forward(self, x): | |||
if isinstance(x, dict): | |||
x = x['video'] | |||
x = self.block1(x) | |||
x = self.block2(x) | |||
x = self.block3(x) | |||
x = self.block4(x) | |||
x = self.block5(x) | |||
return x |
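# Minimal smoke test (not part of the diff) for the Inception3D backbone above. The
# nested SimpleNamespace only mimics the cfg fields this file actually reads
# (DATA.NUM_INPUT_CHANNELS, VIDEO.BACKBONE.BRANCH.GATING, BN.MOMENTUM, BN.EPS); the
# real project presumably supplies them through its Config object.
from types import SimpleNamespace

import torch

cfg = SimpleNamespace(
    DATA=SimpleNamespace(NUM_INPUT_CHANNELS=3),
    VIDEO=SimpleNamespace(BACKBONE=SimpleNamespace(BRANCH=SimpleNamespace(GATING=True))),
    BN=SimpleNamespace(MOMENTUM=0.1, EPS=1e-5))

backbone = Inception3D(cfg)
clip = torch.randn(1, 3, 64, 224, 224)   # (N, C, T, H, W)
print(backbone(clip).shape)              # torch.Size([1, 1024, 8, 7, 7]), matching the block5 comment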
@@ -0,0 +1,23 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .body_3d_pose import BodyKeypointsDetection3D | |||
else: | |||
_import_structure = { | |||
'body_3d_pose': ['BodyKeypointsDetection3D'], | |||
} | |||
import sys | |||
sys.modules[__name__] = LazyImportModule( | |||
__name__, | |||
globals()['__file__'], | |||
_import_structure, | |||
module_spec=__spec__, | |||
extra_objects={}, | |||
) |
@@ -0,0 +1,246 @@ | |||
import logging | |||
import os.path as osp | |||
from typing import Any, Dict, List, Union | |||
import numpy as np | |||
import torch | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base.base_torch_model import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.models.cv.body_3d_keypoints.canonical_pose_modules import ( | |||
TemporalModel, TransCan3Dkeys) | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from modelscope.utils.logger import get_logger | |||
logger = get_logger() | |||
__all__ = ['BodyKeypointsDetection3D'] | |||
class KeypointsTypes(object): | |||
POSES_CAMERA = 'poses_camera' | |||
POSES_TRAJ = 'poses_traj' | |||
@MODELS.register_module( | |||
Tasks.body_3d_keypoints, module_name=Models.body_3d_keypoints) | |||
class BodyKeypointsDetection3D(TorchModel): | |||
def __init__(self, model_dir: str, *args, **kwargs): | |||
super().__init__(model_dir, *args, **kwargs) | |||
self.model_dir = model_dir | |||
model_path = osp.join(self.model_dir, ModelFile.TORCH_MODEL_FILE) | |||
cfg_path = osp.join(self.model_dir, ModelFile.CONFIGURATION) | |||
self.cfg = Config.from_file(cfg_path) | |||
self._create_model() | |||
if not osp.exists(model_path): | |||
raise IOError(f'{model_path} does not exist.') | |||
if torch.cuda.is_available(): | |||
self._device = torch.device('cuda') | |||
else: | |||
self._device = torch.device('cpu') | |||
self.pretrained_state_dict = torch.load( | |||
model_path, map_location=self._device) | |||
self.load_pretrained() | |||
self.to_device(self._device) | |||
self.eval() | |||
def _create_model(self): | |||
self.model_pos = TemporalModel( | |||
self.cfg.model.MODEL.IN_NUM_JOINTS, | |||
self.cfg.model.MODEL.IN_2D_FEATURE, | |||
self.cfg.model.MODEL.OUT_NUM_JOINTS, | |||
filter_widths=self.cfg.model.MODEL.FILTER_WIDTHS, | |||
causal=self.cfg.model.MODEL.CAUSAL, | |||
dropout=self.cfg.model.MODEL.DROPOUT, | |||
channels=self.cfg.model.MODEL.CHANNELS, | |||
dense=self.cfg.model.MODEL.DENSE) | |||
receptive_field = self.model_pos.receptive_field() | |||
self.pad = (receptive_field - 1) // 2 | |||
if self.cfg.model.MODEL.CAUSAL: | |||
self.causal_shift = self.pad | |||
else: | |||
self.causal_shift = 0 | |||
self.model_traj = TransCan3Dkeys( | |||
in_channels=self.cfg.model.MODEL.IN_NUM_JOINTS | |||
* self.cfg.model.MODEL.IN_2D_FEATURE, | |||
num_features=1024, | |||
out_channels=self.cfg.model.MODEL.OUT_3D_FEATURE, | |||
num_blocks=4, | |||
time_window=receptive_field) | |||
def eval(self): | |||
self.model_pos.eval() | |||
self.model_traj.eval() | |||
def train(self): | |||
self.model_pos.train() | |||
self.model_traj.train() | |||
def to_device(self, device): | |||
self.model_pos = self.model_pos.to(device) | |||
self.model_traj = self.model_traj.to(device) | |||
def load_pretrained(self): | |||
if 'model_pos' in self.pretrained_state_dict: | |||
self.model_pos.load_state_dict( | |||
self.pretrained_state_dict['model_pos'], strict=False) | |||
else: | |||
logging.error( | |||
'model_pos is missing from pretrained_state_dict, skip loading it' | |||
) | |||
if 'model_traj' in self.pretrained_state_dict: | |||
self.model_traj.load_state_dict( | |||
self.pretrained_state_dict['model_traj'], strict=False) | |||
else: | |||
logging.error( | |||
'model_traj is missing from pretrained_state_dict, skip loading it' | |||
) | |||
logging.info('Load pretrained model done.') | |||
def preprocess(self, input: torch.Tensor) -> Dict[str, Any]: | |||
"""Preprocess the 2D input joints. | |||
Args: | |||
input (Tensor): [NUM_FRAME, NUM_JOINTS, 2], input 2d human body keypoints. | |||
Returns: | |||
Dict[str, Any]: canonical 2d points and root relative joints. | |||
""" | |||
if 'cuda' == input.device.type: | |||
input = input.data.cpu().numpy() | |||
elif 'cpu' == input.device.type: | |||
input = input.data.numpy() | |||
pose2d = input | |||
pose2d_canonical = self.canonicalize_2Ds( | |||
pose2d, self.cfg.model.INPUT.FOCAL_LENGTH, | |||
self.cfg.model.INPUT.CENTER) | |||
pose2d_normalized = self.normalize_screen_coordinates( | |||
pose2d, self.cfg.model.INPUT.RES_W, self.cfg.model.INPUT.RES_H) | |||
pose2d_rr = pose2d_normalized | |||
pose2d_rr[:, 1:] -= pose2d_rr[:, :1] | |||
# expand [NUM_FRAME, NUM_JOINTS, 2] to [1, NUM_FRAME, NUM_JOINTS, 2] | |||
pose2d_rr = np.expand_dims( | |||
np.pad( | |||
pose2d_rr, | |||
((self.pad + self.causal_shift, self.pad - self.causal_shift), | |||
(0, 0), (0, 0)), 'edge'), | |||
axis=0) | |||
pose2d_canonical = np.expand_dims( | |||
np.pad( | |||
pose2d_canonical, | |||
((self.pad + self.causal_shift, self.pad - self.causal_shift), | |||
(0, 0), (0, 0)), 'edge'), | |||
axis=0) | |||
pose2d_rr = torch.from_numpy(pose2d_rr.astype(np.float32)) | |||
pose2d_canonical = torch.from_numpy( | |||
pose2d_canonical.astype(np.float32)) | |||
inputs_2d = pose2d_rr.clone() | |||
if torch.cuda.is_available(): | |||
inputs_2d = inputs_2d.cuda(non_blocking=True) | |||
# Positional model | |||
if self.cfg.model.MODEL.USE_2D_OFFSETS: | |||
inputs_2d[:, :, 0] = 0 | |||
else: | |||
inputs_2d[:, :, 1:] += inputs_2d[:, :, :1] | |||
return { | |||
'inputs_2d': inputs_2d, | |||
'pose2d_rr': pose2d_rr, | |||
'pose2d_canonical': pose2d_canonical | |||
} | |||
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
"""3D human pose estimation. | |||
Args: | |||
input (Dict): | |||
inputs_2d: [1, NUM_FRAME, NUM_JOINTS, 2] | |||
pose2d_rr: [1, NUM_FRAME, NUM_JOINTS, 2] | |||
pose2d_canonical: [1, NUM_FRAME, NUM_JOINTS, 2] | |||
NUM_FRAME = max(receptive_field + video_frame_number, video_frame_number) | |||
Returns: | |||
Dict[str, Any]: | |||
"camera_pose": Tensor, [1, NUM_FRAME, OUT_NUM_JOINTS, OUT_3D_FEATURE_DIM], | |||
3D human pose keypoints in camera frame. | |||
"camera_traj": Tensor, [1, NUM_FRAME, 1, 3], | |||
root keypoint coordinates in the camera frame. | |||
""" | |||
inputs_2d = input['inputs_2d'] | |||
pose2d_rr = input['pose2d_rr'] | |||
pose2d_canonical = input['pose2d_canonical'] | |||
with torch.no_grad(): | |||
# predict 3D pose keypoints | |||
predicted_3d_pos = self.model_pos(inputs_2d) | |||
# predict global trajectory | |||
b1, w1, n1, d1 = inputs_2d.shape | |||
input_pose2d_abs = self.get_abs_2d_pts(w1, pose2d_rr, | |||
pose2d_canonical) | |||
b1, w1, n1, d1 = input_pose2d_abs.size() | |||
b2, w2, n2, d2 = predicted_3d_pos.size() | |||
if torch.cuda.is_available(): | |||
input_pose2d_abs = input_pose2d_abs.cuda(non_blocking=True) | |||
predicted_3d_traj = self.model_traj( | |||
input_pose2d_abs.view(b1, w1, n1 * d1), | |||
predicted_3d_pos.view(b2 * w2, n2 * d2)).view(b2, w2, -1, 3) | |||
predict_dict = { | |||
KeypointsTypes.POSES_CAMERA: predicted_3d_pos, | |||
KeypointsTypes.POSES_TRAJ: predicted_3d_traj | |||
} | |||
return predict_dict | |||
def get_abs_2d_pts(self, input_video_frame_num, pose2d_rr, | |||
pose2d_canonical): | |||
pad = self.pad | |||
w = input_video_frame_num - pad * 2 | |||
lst_pose2d_rr = [] | |||
lst_pose2d_cannoical = [] | |||
for i in range(pad, w + pad): | |||
lst_pose2d_rr.append(pose2d_rr[:, i - pad:i + pad + 1]) | |||
lst_pose2d_cannoical.append(pose2d_canonical[:, | |||
i - pad:i + pad + 1]) | |||
input_pose2d_rr = torch.concat(lst_pose2d_rr, axis=0) | |||
input_pose2d_cannoical = torch.concat(lst_pose2d_cannoical, axis=0) | |||
if self.cfg.model.MODEL.USE_CANONICAL_COORDS: | |||
input_pose2d_abs = input_pose2d_cannoical.clone() | |||
else: | |||
input_pose2d_abs = input_pose2d_rr.clone() | |||
input_pose2d_abs[:, :, 1:] += input_pose2d_abs[:, :, :1] | |||
return input_pose2d_abs | |||
def canonicalize_2Ds(self, pos2d, f, c): | |||
cs = np.array([c[0], c[1]]).reshape(1, 1, 2) | |||
fs = np.array([f[0], f[1]]).reshape(1, 1, 2) | |||
canoical_2Ds = (pos2d - cs) / fs | |||
return canoical_2Ds | |||
def normalize_screen_coordinates(self, X, w, h): | |||
assert X.shape[-1] == 2 | |||
# Normalize so that [0, w] is mapped to [-1, 1], while preserving the aspect ratio | |||
return X / w * 2 - [1, h / w] |
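# Quick numeric illustration (not part of the diff) of normalize_screen_coordinates
# above: for a 1920x1080 frame, x in [0, w] maps to [-1, 1] and y is scaled by the
# same factor so the aspect ratio is preserved.
import numpy as np

X = np.array([[0., 0.], [960., 540.], [1920., 1080.]])
w, h = 1920, 1080
print(X / w * 2 - [1, h / w])
# [[-1.     -0.5625]
#  [ 0.      0.    ]
#  [ 1.      0.5625]]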
@@ -0,0 +1,233 @@ | |||
# The implementation is based on VideoPose3D, available at https://github.com/facebookresearch/VideoPose3D | |||
import torch | |||
import torch.nn as nn | |||
class TemporalModelBase(nn.Module): | |||
""" | |||
Do not instantiate this class. | |||
""" | |||
def __init__(self, num_joints_in, in_features, num_joints_out, | |||
filter_widths, causal, dropout, channels): | |||
super().__init__() | |||
# Validate input | |||
for fw in filter_widths: | |||
assert fw % 2 != 0, 'Only odd filter widths are supported' | |||
self.num_joints_in = num_joints_in | |||
self.in_features = in_features | |||
self.num_joints_out = num_joints_out | |||
self.filter_widths = filter_widths | |||
self.drop = nn.Dropout(dropout) | |||
self.relu = nn.ReLU(inplace=True) | |||
self.pad = [filter_widths[0] // 2] | |||
self.expand_bn = nn.BatchNorm1d(channels, momentum=0.1) | |||
self.shrink = nn.Conv1d(channels, num_joints_out * 3, 1) | |||
def set_bn_momentum(self, momentum): | |||
self.expand_bn.momentum = momentum | |||
for bn in self.layers_bn: | |||
bn.momentum = momentum | |||
def receptive_field(self): | |||
""" | |||
Return the total receptive field of this model as # of frames. | |||
""" | |||
frames = 0 | |||
for f in self.pad: | |||
frames += f | |||
return 1 + 2 * frames | |||
def total_causal_shift(self): | |||
""" | |||
Return the asymmetric offset for sequence padding. | |||
The returned value is typically 0 if causal convolutions are disabled, | |||
otherwise it is half the receptive field. | |||
""" | |||
frames = self.causal_shift[0] | |||
next_dilation = self.filter_widths[0] | |||
for i in range(1, len(self.filter_widths)): | |||
frames += self.causal_shift[i] * next_dilation | |||
next_dilation *= self.filter_widths[i] | |||
return frames | |||
def forward(self, x): | |||
assert len(x.shape) == 4 | |||
assert x.shape[-2] == self.num_joints_in | |||
assert x.shape[-1] == self.in_features | |||
sz = x.shape[:3] | |||
x = x.view(x.shape[0], x.shape[1], -1) | |||
x = x.permute(0, 2, 1) | |||
x = self._forward_blocks(x) | |||
x = x.permute(0, 2, 1) | |||
x = x.view(sz[0], -1, self.num_joints_out, 3) | |||
return x | |||
class TemporalModel(TemporalModelBase): | |||
""" | |||
Reference 3D pose estimation model with temporal convolutions. | |||
This implementation can be used for all use-cases. | |||
""" | |||
def __init__(self, | |||
num_joints_in, | |||
in_features, | |||
num_joints_out, | |||
filter_widths, | |||
causal=False, | |||
dropout=0.25, | |||
channels=1024, | |||
dense=False): | |||
""" | |||
Initialize this model. | |||
Arguments: | |||
num_joints_in -- number of input joints (e.g. 17 for Human3.6M) | |||
in_features -- number of input features for each joint (typically 2 for 2D input) | |||
num_joints_out -- number of output joints (can be different than input) | |||
filter_widths -- list of convolution widths, which also determines the # of blocks and receptive field | |||
causal -- use causal convolutions instead of symmetric convolutions (for real-time applications) | |||
dropout -- dropout probability | |||
channels -- number of convolution channels | |||
dense -- use regular dense convolutions instead of dilated convolutions (ablation experiment) | |||
""" | |||
super().__init__(num_joints_in, in_features, num_joints_out, | |||
filter_widths, causal, dropout, channels) | |||
self.expand_conv = nn.Conv1d( | |||
num_joints_in * in_features, | |||
channels, | |||
filter_widths[0], | |||
bias=False) | |||
layers_conv = [] | |||
layers_bn = [] | |||
self.causal_shift = [(filter_widths[0]) // 2 if causal else 0] | |||
next_dilation = filter_widths[0] | |||
for i in range(1, len(filter_widths)): | |||
self.pad.append((filter_widths[i] - 1) * next_dilation // 2) | |||
self.causal_shift.append((filter_widths[i] // 2 | |||
* next_dilation) if causal else 0) | |||
layers_conv.append( | |||
nn.Conv1d( | |||
channels, | |||
channels, | |||
filter_widths[i] if not dense else (2 * self.pad[-1] + 1), | |||
dilation=next_dilation if not dense else 1, | |||
bias=False)) | |||
layers_bn.append(nn.BatchNorm1d(channels, momentum=0.1)) | |||
layers_conv.append( | |||
nn.Conv1d(channels, channels, 1, dilation=1, bias=False)) | |||
layers_bn.append(nn.BatchNorm1d(channels, momentum=0.1)) | |||
next_dilation *= filter_widths[i] | |||
self.layers_conv = nn.ModuleList(layers_conv) | |||
self.layers_bn = nn.ModuleList(layers_bn) | |||
def _forward_blocks(self, x): | |||
x = self.drop(self.relu(self.expand_bn(self.expand_conv(x)))) | |||
for i in range(len(self.pad) - 1): | |||
pad = self.pad[i + 1] | |||
shift = self.causal_shift[i + 1] | |||
res = x[:, :, pad + shift:x.shape[2] - pad + shift] | |||
x = self.drop( | |||
self.relu(self.layers_bn[2 * i](self.layers_conv[2 * i](x)))) | |||
x = res + self.drop( | |||
self.relu(self.layers_bn[2 * i + 1]( | |||
self.layers_conv[2 * i + 1](x)))) | |||
x = self.shrink(x) | |||
return x | |||
# regression of the trajectory | |||
class TransCan3Dkeys(nn.Module): | |||
def __init__(self, | |||
in_channels=74, | |||
num_features=256, | |||
out_channels=44, | |||
time_window=10, | |||
num_blocks=2): | |||
super().__init__() | |||
self.in_channels = in_channels | |||
self.num_features = num_features | |||
self.out_channels = out_channels | |||
self.num_blocks = num_blocks | |||
self.time_window = time_window | |||
self.expand_bn = nn.BatchNorm1d(self.num_features, momentum=0.1) | |||
self.conv1 = nn.Sequential( | |||
nn.ReplicationPad1d(1), | |||
nn.Conv1d( | |||
self.in_channels, self.num_features, kernel_size=3, | |||
bias=False), self.expand_bn, nn.ReLU(inplace=True), | |||
nn.Dropout(p=0.25)) | |||
self._make_blocks() | |||
self.pad = nn.ReplicationPad1d(4) | |||
self.relu = nn.ReLU(inplace=True) | |||
self.drop = nn.Dropout(p=0.25) | |||
self.reduce = nn.Conv1d( | |||
self.num_features, self.num_features, kernel_size=self.time_window) | |||
self.embedding_3d_1 = nn.Linear(in_channels // 2 * 3, 500) | |||
self.embedding_3d_2 = nn.Linear(500, 500) | |||
self.LReLU1 = nn.LeakyReLU() | |||
self.LReLU2 = nn.LeakyReLU() | |||
self.LReLU3 = nn.LeakyReLU() | |||
self.out1 = nn.Linear(self.num_features + 500, self.num_features) | |||
self.out2 = nn.Linear(self.num_features, self.out_channels) | |||
def _make_blocks(self): | |||
layers_conv = [] | |||
layers_bn = [] | |||
for i in range(self.num_blocks): | |||
layers_conv.append( | |||
nn.Conv1d( | |||
self.num_features, | |||
self.num_features, | |||
kernel_size=5, | |||
bias=False, | |||
dilation=2)) | |||
layers_bn.append(nn.BatchNorm1d(self.num_features)) | |||
self.layers_conv = nn.ModuleList(layers_conv) | |||
self.layers_bn = nn.ModuleList(layers_bn) | |||
def set_bn_momentum(self, momentum): | |||
self.expand_bn.momentum = momentum | |||
for bn in self.layers_bn: | |||
bn.momentum = momentum | |||
def forward(self, p2ds, p3d): | |||
""" | |||
Args: | |||
p2ds -- (B x T x C) flattened 2D keypoints, where C = num_joints * 2 | |||
p3d -- (B*T x num_joints*3) flattened 3D pose used for trajectory regression | |||
""" | |||
B, T, C = p2ds.shape | |||
x = p2ds.permute((0, 2, 1)) | |||
x = self.conv1(x) | |||
for i in range(self.num_blocks): | |||
pre = x | |||
x = self.pad(x) | |||
x = self.layers_conv[i](x) | |||
x = self.layers_bn[i](x) | |||
x = self.drop(self.relu(x)) | |||
x = pre + x | |||
x_2d = self.relu(self.reduce(x)) | |||
x_2d = x_2d.view(B, -1) | |||
x_3d = self.LReLU1(self.embedding_3d_1(p3d)) | |||
x = torch.cat((x_2d, x_3d), 1) | |||
x = self.LReLU3(self.out1(x)) | |||
x = self.out2(x) | |||
return x |
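# Quick check (illustrative, not part of the diff) of how filter_widths control the
# receptive field of the TemporalModel above: five dilated blocks of width 3 consume
# 3^5 = 243 input frames to predict the 3D pose of a single output frame. The joint
# counts (17 in / 17 out, 2 input features) follow the Human3.6M convention mentioned
# in the docstring.
import torch

model = TemporalModel(num_joints_in=17, in_features=2, num_joints_out=17,
                      filter_widths=[3, 3, 3, 3, 3])
model.eval()                               # BatchNorm1d needs running stats for a batch of 1
print(model.receptive_field())             # 243
pose2d = torch.randn(1, 243, 17, 2)        # (batch, frames, joints, xy)
with torch.no_grad():
    print(model(pose2d).shape)             # torch.Size([1, 1, 17, 3])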
@@ -13,8 +13,8 @@ from modelscope.utils.constant import Tasks | |||
Tasks.crowd_counting, module_name=Models.crowd_counting) | |||
class HRNetCrowdCounting(TorchModel): | |||
def __init__(self, model_dir: str): | |||
super().__init__(model_dir) | |||
def __init__(self, model_dir: str, **kwargs): | |||
super().__init__(model_dir, **kwargs) | |||
from .hrnet_aspp_relu import HighResolutionNet as HRNet_aspp_relu | |||
@@ -0,0 +1,25 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from easycv.models.base import BaseModel | |||
from easycv.utils.ms_utils import EasyCVMeta | |||
from modelscope.models.base import TorchModel | |||
class EasyCVBaseModel(BaseModel, TorchModel): | |||
"""Base model for EasyCV.""" | |||
def __init__(self, model_dir=None, args=(), kwargs={}): | |||
kwargs.pop(EasyCVMeta.ARCH, None) # pop useless keys | |||
BaseModel.__init__(self) | |||
TorchModel.__init__(self, model_dir=model_dir) | |||
def forward(self, img, mode='train', **kwargs): | |||
if self.training: | |||
losses = self.forward_train(img, **kwargs) | |||
loss, log_vars = self._parse_losses(losses) | |||
return dict(loss=loss, log_vars=log_vars) | |||
else: | |||
return self.forward_test(img, **kwargs) | |||
def __call__(self, *args, **kwargs): | |||
return self.forward(*args, **kwargs) |
@@ -10,7 +10,7 @@ from modelscope.utils.constant import Tasks | |||
Tasks.image_classification, module_name=Models.classification_model) | |||
class ClassificationModel(TorchModel): | |||
def __init__(self, model_dir: str): | |||
def __init__(self, model_dir: str, **kwargs): | |||
import mmcv | |||
from mmcls.models import build_classifier | |||
@@ -0,0 +1,22 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .panseg_model import SwinLPanopticSegmentation | |||
else: | |||
_import_structure = { | |||
'panseg_model': ['SwinLPanopticSegmentation'], | |||
} | |||
import sys | |||
sys.modules[__name__] = LazyImportModule( | |||
__name__, | |||
globals()['__file__'], | |||
_import_structure, | |||
module_spec=__spec__, | |||
extra_objects={}, | |||
) |
@@ -0,0 +1,54 @@ | |||
import os.path as osp | |||
import torch | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base.base_torch_model import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
@MODELS.register_module( | |||
Tasks.image_segmentation, module_name=Models.panoptic_segmentation) | |||
class SwinLPanopticSegmentation(TorchModel): | |||
def __init__(self, model_dir: str, **kwargs): | |||
"""str -- model file root.""" | |||
super().__init__(model_dir, **kwargs) | |||
from mmcv.runner import load_checkpoint | |||
import mmcv | |||
from mmdet.models import build_detector | |||
config = osp.join(model_dir, 'config.py') | |||
cfg = mmcv.Config.fromfile(config) | |||
if 'pretrained' in cfg.model: | |||
cfg.model.pretrained = None | |||
elif 'init_cfg' in cfg.model.backbone: | |||
cfg.model.backbone.init_cfg = None | |||
# build model | |||
cfg.model.train_cfg = None | |||
self.model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) | |||
# load model | |||
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
checkpoint = load_checkpoint( | |||
self.model, model_path, map_location='cpu') | |||
self.CLASSES = checkpoint['meta']['CLASSES'] | |||
self.num_classes = len(self.CLASSES) | |||
self.cfg = cfg | |||
def inference(self, data): | |||
"""data is dict,contain img and img_metas,follow with mmdet.""" | |||
with torch.no_grad(): | |||
results = self.model(return_loss=False, rescale=True, **data) | |||
return results | |||
def forward(self, Inputs): | |||
return self.model(**Inputs) |
@@ -0,0 +1,22 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .pass_model import PASS | |||
else: | |||
_import_structure = { | |||
'pass_model': ['PASS'], | |||
} | |||
import sys | |||
sys.modules[__name__] = LazyImportModule( | |||
__name__, | |||
globals()['__file__'], | |||
_import_structure, | |||
module_spec=__spec__, | |||
extra_objects={}, | |||
) |
@@ -0,0 +1,136 @@ | |||
# The implementation is also open-sourced by the authors as PASS-reID, and is available publicly at | |||
# https://github.com/CASIA-IVA-Lab/PASS-reID | |||
import os | |||
from enum import Enum | |||
import torch | |||
import torch.nn as nn | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base.base_torch_model import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from .transreid_model import vit_base_patch16_224_TransReID | |||
class Fusions(Enum): | |||
CAT = 'cat' | |||
MEAN = 'mean' | |||
@MODELS.register_module( | |||
Tasks.image_reid_person, module_name=Models.image_reid_person) | |||
class PASS(TorchModel): | |||
def __init__(self, cfg: Config, model_dir: str, **kwargs): | |||
super(PASS, self).__init__(model_dir=model_dir) | |||
size_train = cfg.INPUT.SIZE_TRAIN | |||
sie_coe = cfg.MODEL.SIE_COE | |||
stride_size = cfg.MODEL.STRIDE_SIZE | |||
drop_path = cfg.MODEL.DROP_PATH | |||
drop_out = cfg.MODEL.DROP_OUT | |||
att_drop_rate = cfg.MODEL.ATT_DROP_RATE | |||
gem_pooling = cfg.MODEL.GEM_POOLING | |||
stem_conv = cfg.MODEL.STEM_CONV | |||
weight = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
self.neck_feat = cfg.TEST.NECK_FEAT | |||
self.dropout_rate = cfg.MODEL.DROPOUT_RATE | |||
self.num_classes = cfg.DATASETS.NUM_CLASSES | |||
self.multi_neck = cfg.MODEL.MULTI_NECK | |||
self.feat_fusion = cfg.MODEL.FEAT_FUSION | |||
self.base = vit_base_patch16_224_TransReID( | |||
img_size=size_train, | |||
sie_xishu=sie_coe, | |||
stride_size=stride_size, | |||
drop_path_rate=drop_path, | |||
drop_rate=drop_out, | |||
attn_drop_rate=att_drop_rate, | |||
gem_pool=gem_pooling, | |||
stem_conv=stem_conv) | |||
self.in_planes = self.base.in_planes | |||
if self.feat_fusion == Fusions.CAT.value: | |||
self.classifier = nn.Linear( | |||
self.in_planes * 2, self.num_classes, bias=False) | |||
elif self.feat_fusion == Fusions.MEAN.value: | |||
self.classifier = nn.Linear( | |||
self.in_planes, self.num_classes, bias=False) | |||
if self.multi_neck: | |||
self.bottleneck = nn.BatchNorm1d(self.in_planes) | |||
self.bottleneck.bias.requires_grad_(False) | |||
self.bottleneck_1 = nn.BatchNorm1d(self.in_planes) | |||
self.bottleneck_1.bias.requires_grad_(False) | |||
self.bottleneck_2 = nn.BatchNorm1d(self.in_planes) | |||
self.bottleneck_2.bias.requires_grad_(False) | |||
self.bottleneck_3 = nn.BatchNorm1d(self.in_planes) | |||
self.bottleneck_3.bias.requires_grad_(False) | |||
else: | |||
if self.feat_fusion == Fusions.CAT.value: | |||
self.bottleneck = nn.BatchNorm1d(self.in_planes * 2) | |||
self.bottleneck.bias.requires_grad_(False) | |||
elif self.feat_fusion == Fusions.MEAN.value: | |||
self.bottleneck = nn.BatchNorm1d(self.in_planes) | |||
self.bottleneck.bias.requires_grad_(False) | |||
self.dropout = nn.Dropout(self.dropout_rate) | |||
self.load_param(weight) | |||
def forward(self, input): | |||
global_feat, local_feat_1, local_feat_2, local_feat_3 = self.base( | |||
input) | |||
# single-neck, almost the same performance | |||
if not self.multi_neck: | |||
if self.feat_fusion == Fusions.MEAN.value: | |||
local_feat = local_feat_1 / 3. + local_feat_2 / 3. + local_feat_3 / 3. | |||
final_feat_before = (global_feat + local_feat) / 2 | |||
elif self.feat_fusion == Fusions.CAT.value: | |||
final_feat_before = torch.cat( | |||
(global_feat, local_feat_1 / 3. + local_feat_2 / 3. | |||
+ local_feat_3 / 3.), | |||
dim=1) | |||
final_feat_after = self.bottleneck(final_feat_before) | |||
# multi-neck | |||
else: | |||
feat = self.bottleneck(global_feat) | |||
local_feat_1_bn = self.bottleneck_1(local_feat_1) | |||
local_feat_2_bn = self.bottleneck_2(local_feat_2) | |||
local_feat_3_bn = self.bottleneck_3(local_feat_3) | |||
if self.feat_fusion == Fusions.MEAN.value: | |||
final_feat_before = ((global_feat + local_feat_1 / 3 | |||
+ local_feat_2 / 3 + local_feat_3 / 3) | |||
/ 2.) | |||
final_feat_after = (feat + local_feat_1_bn / 3 | |||
+ local_feat_2_bn / 3 | |||
+ local_feat_3_bn / 3) / 2. | |||
elif self.feat_fusion == Fusions.CAT.value: | |||
final_feat_before = torch.cat( | |||
(global_feat, local_feat_1 / 3. + local_feat_2 / 3. | |||
+ local_feat_3 / 3.), | |||
dim=1) | |||
final_feat_after = torch.cat( | |||
(feat, local_feat_1_bn / 3 + local_feat_2_bn / 3 | |||
+ local_feat_3_bn / 3), | |||
dim=1) | |||
if self.neck_feat == 'after': | |||
return final_feat_after | |||
else: | |||
return final_feat_before | |||
def load_param(self, trained_path): | |||
param_dict = torch.load(trained_path, map_location='cpu') | |||
for i in param_dict: | |||
try: | |||
self.state_dict()[i.replace('module.', | |||
'')].copy_(param_dict[i]) | |||
except Exception: | |||
continue |
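# Shape-only sketch (not from the diff) of the two fusion modes PASS supports above:
# 'mean' averages the global CLS feature with the mean of the three part features and
# stays 768-d, while 'cat' concatenates them into a 1536-d descriptor. Random tensors
# stand in for the TransReID outputs.
import torch

g, l1, l2, l3 = (torch.randn(2, 768) for _ in range(4))
mean_fused = (g + (l1 + l2 + l3) / 3.) / 2.
cat_fused = torch.cat((g, (l1 + l2 + l3) / 3.), dim=1)
print(mean_fused.shape, cat_fused.shape)   # torch.Size([2, 768]) torch.Size([2, 1536])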
@@ -0,0 +1,418 @@ | |||
# The implementation is also open-sourced by the authors as PASS-reID, and is available publicly at | |||
# https://github.com/CASIA-IVA-Lab/PASS-reID | |||
import collections.abc as container_abcs | |||
from functools import partial | |||
from itertools import repeat | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
# From PyTorch internals | |||
def _ntuple(n): | |||
def parse(x): | |||
if isinstance(x, container_abcs.Iterable): | |||
return x | |||
return tuple(repeat(x, n)) | |||
return parse | |||
to_2tuple = _ntuple(2) | |||
def vit_base_patch16_224_TransReID( | |||
img_size=(256, 128), | |||
stride_size=16, | |||
drop_path_rate=0.1, | |||
camera=0, | |||
view=0, | |||
local_feature=False, | |||
sie_xishu=1.5, | |||
**kwargs): | |||
model = TransReID( | |||
img_size=img_size, | |||
patch_size=16, | |||
stride_size=stride_size, | |||
embed_dim=768, | |||
depth=12, | |||
num_heads=12, | |||
mlp_ratio=4, | |||
qkv_bias=True, | |||
camera=camera, | |||
view=view, | |||
drop_path_rate=drop_path_rate, | |||
sie_xishu=sie_xishu, | |||
local_feature=local_feature, | |||
**kwargs) | |||
return model | |||
def drop_path(x, drop_prob: float = 0., training: bool = False): | |||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). | |||
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, | |||
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... | |||
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for | |||
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use | |||
'survival rate' as the argument. | |||
""" | |||
if drop_prob == 0. or not training: | |||
return x | |||
keep_prob = 1 - drop_prob | |||
shape = (x.shape[0], ) + (1, ) * ( | |||
x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets | |||
random_tensor = keep_prob + torch.rand( | |||
shape, dtype=x.dtype, device=x.device) | |||
random_tensor.floor_() # binarize | |||
output = x.div(keep_prob) * random_tensor | |||
return output | |||
class TransReID(nn.Module): | |||
"""Transformer-based Object Re-Identification | |||
""" | |||
def __init__(self, | |||
img_size=224, | |||
patch_size=16, | |||
stride_size=16, | |||
in_chans=3, | |||
num_classes=1000, | |||
embed_dim=768, | |||
depth=12, | |||
num_heads=12, | |||
mlp_ratio=4., | |||
qkv_bias=False, | |||
qk_scale=None, | |||
drop_rate=0., | |||
attn_drop_rate=0., | |||
camera=0, | |||
view=0, | |||
drop_path_rate=0., | |||
norm_layer=partial(nn.LayerNorm, eps=1e-6), | |||
local_feature=False, | |||
sie_xishu=1.0, | |||
hw_ratio=1, | |||
gem_pool=False, | |||
stem_conv=False): | |||
super().__init__() | |||
self.num_classes = num_classes | |||
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models | |||
self.local_feature = local_feature | |||
self.patch_embed = PatchEmbed( | |||
img_size=img_size, | |||
patch_size=patch_size, | |||
stride_size=stride_size, | |||
in_chans=in_chans, | |||
embed_dim=embed_dim, | |||
stem_conv=stem_conv) | |||
num_patches = self.patch_embed.num_patches | |||
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) | |||
self.part_token1 = nn.Parameter(torch.zeros(1, 1, embed_dim)) | |||
self.part_token2 = nn.Parameter(torch.zeros(1, 1, embed_dim)) | |||
self.part_token3 = nn.Parameter(torch.zeros(1, 1, embed_dim)) | |||
self.cls_pos = nn.Parameter(torch.zeros(1, 1, embed_dim)) | |||
self.part1_pos = nn.Parameter(torch.zeros(1, 1, embed_dim)) | |||
self.part2_pos = nn.Parameter(torch.zeros(1, 1, embed_dim)) | |||
self.part3_pos = nn.Parameter(torch.zeros(1, 1, embed_dim)) | |||
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) | |||
self.cam_num = camera | |||
self.view_num = view | |||
self.sie_xishu = sie_xishu | |||
self.in_planes = 768 | |||
self.gem_pool = gem_pool | |||
# Initialize SIE Embedding | |||
if camera > 1 and view > 1: | |||
self.sie_embed = nn.Parameter( | |||
torch.zeros(camera * view, 1, embed_dim)) | |||
elif camera > 1: | |||
self.sie_embed = nn.Parameter(torch.zeros(camera, 1, embed_dim)) | |||
elif view > 1: | |||
self.sie_embed = nn.Parameter(torch.zeros(view, 1, embed_dim)) | |||
self.pos_drop = nn.Dropout(p=drop_rate) | |||
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) | |||
] # stochastic depth decay rule | |||
self.blocks = nn.ModuleList([ | |||
Block( | |||
dim=embed_dim, | |||
num_heads=num_heads, | |||
mlp_ratio=mlp_ratio, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
drop=drop_rate, | |||
attn_drop=attn_drop_rate, | |||
drop_path=dpr[i], | |||
norm_layer=norm_layer) for i in range(depth) | |||
]) | |||
self.norm = norm_layer(embed_dim) | |||
# Classifier head | |||
self.fc = nn.Linear(embed_dim, | |||
num_classes) if num_classes > 0 else nn.Identity() | |||
self.gem = GeneralizedMeanPooling() | |||
def forward_features(self, x, camera_id, view_id): | |||
B = x.shape[0] | |||
x = self.patch_embed(x) | |||
cls_tokens = self.cls_token.expand( | |||
B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks | |||
part_tokens1 = self.part_token1.expand(B, -1, -1) | |||
part_tokens2 = self.part_token2.expand(B, -1, -1) | |||
part_tokens3 = self.part_token3.expand(B, -1, -1) | |||
x = torch.cat( | |||
(cls_tokens, part_tokens1, part_tokens2, part_tokens3, x), dim=1) | |||
if self.cam_num > 0 and self.view_num > 0: | |||
x = x + self.pos_embed + self.sie_xishu * self.sie_embed[ | |||
camera_id * self.view_num + view_id] | |||
elif self.cam_num > 0: | |||
x = x + self.pos_embed + self.sie_xishu * self.sie_embed[camera_id] | |||
elif self.view_num > 0: | |||
x = x + self.pos_embed + self.sie_xishu * self.sie_embed[view_id] | |||
else: | |||
x = x + torch.cat((self.cls_pos, self.part1_pos, self.part2_pos, | |||
self.part3_pos, self.pos_embed), | |||
dim=1) | |||
x = self.pos_drop(x) | |||
if self.local_feature: | |||
for blk in self.blocks[:-1]: | |||
x = blk(x) | |||
return x | |||
else: | |||
for blk in self.blocks: | |||
x = blk(x) | |||
x = self.norm(x) | |||
if self.gem_pool: | |||
gf = self.gem(x[:, 1:].permute(0, 2, 1)).squeeze() | |||
return x[:, 0] + gf | |||
return x[:, 0], x[:, 1], x[:, 2], x[:, 3] | |||
def forward(self, x, cam_label=None, view_label=None): | |||
global_feat, local_feat_1, local_feat_2, local_feat_3 = self.forward_features( | |||
x, cam_label, view_label) | |||
return global_feat, local_feat_1, local_feat_2, local_feat_3 | |||
class PatchEmbed(nn.Module): | |||
"""Image to Patch Embedding with overlapping patches | |||
""" | |||
def __init__(self, | |||
img_size=224, | |||
patch_size=16, | |||
stride_size=16, | |||
in_chans=3, | |||
embed_dim=768, | |||
stem_conv=False): | |||
super().__init__() | |||
img_size = to_2tuple(img_size) | |||
patch_size = to_2tuple(patch_size) | |||
stride_size_tuple = to_2tuple(stride_size) | |||
self.num_x = (img_size[1] - patch_size[1]) // stride_size_tuple[1] + 1 | |||
self.num_y = (img_size[0] - patch_size[0]) // stride_size_tuple[0] + 1 | |||
self.num_patches = self.num_x * self.num_y | |||
self.img_size = img_size | |||
self.patch_size = patch_size | |||
self.stem_conv = stem_conv | |||
if self.stem_conv: | |||
hidden_dim = 64 | |||
stem_stride = 2 | |||
stride_size = patch_size = patch_size[0] // stem_stride | |||
self.conv = nn.Sequential( | |||
nn.Conv2d( | |||
in_chans, | |||
hidden_dim, | |||
kernel_size=7, | |||
stride=stem_stride, | |||
padding=3, | |||
bias=False), | |||
IBN(hidden_dim), | |||
nn.ReLU(inplace=True), | |||
nn.Conv2d( | |||
hidden_dim, | |||
hidden_dim, | |||
kernel_size=3, | |||
stride=1, | |||
padding=1, | |||
bias=False), | |||
IBN(hidden_dim), | |||
nn.ReLU(inplace=True), | |||
nn.Conv2d( | |||
hidden_dim, | |||
hidden_dim, | |||
kernel_size=3, | |||
stride=1, | |||
padding=1, | |||
bias=False), | |||
nn.BatchNorm2d(hidden_dim), | |||
nn.ReLU(inplace=True), | |||
) | |||
in_chans = hidden_dim | |||
self.proj = nn.Conv2d( | |||
in_chans, embed_dim, kernel_size=patch_size, stride=stride_size) | |||
def forward(self, x): | |||
if self.stem_conv: | |||
x = self.conv(x) | |||
x = self.proj(x) | |||
x = x.flatten(2).transpose(1, 2) # [64, 8, 768] | |||
return x | |||
class GeneralizedMeanPooling(nn.Module): | |||
"""Applies a 2D power-average adaptive pooling over an input signal composed of several input planes. | |||
The function computed is: :math:`f(X) = pow(sum(pow(X, p)), 1/p)` | |||
- At p = infinity, one gets Max Pooling | |||
- At p = 1, one gets Average Pooling | |||
The output is of size H x W, for any input size. | |||
The number of output features is equal to the number of input planes. | |||
Args: | |||
output_size: the target output size of the image of the form H x W. | |||
Can be a tuple (H, W) or a single H for a square image H x H | |||
H and W can be either a ``int``, or ``None`` which means the size will | |||
be the same as that of the input. | |||
""" | |||
def __init__(self, norm=3, output_size=1, eps=1e-6): | |||
super(GeneralizedMeanPooling, self).__init__() | |||
assert norm > 0 | |||
self.p = float(norm) | |||
self.output_size = output_size | |||
self.eps = eps | |||
def forward(self, x): | |||
x = x.clamp(min=self.eps).pow(self.p) | |||
return F.adaptive_avg_pool1d(x, self.output_size).pow(1. / self.p) | |||
class Block(nn.Module): | |||
def __init__(self, | |||
dim, | |||
num_heads, | |||
mlp_ratio=4., | |||
qkv_bias=False, | |||
qk_scale=None, | |||
drop=0., | |||
attn_drop=0., | |||
drop_path=0., | |||
act_layer=nn.GELU, | |||
norm_layer=nn.LayerNorm): | |||
super().__init__() | |||
self.norm1 = norm_layer(dim) | |||
self.attn = Attention( | |||
dim, | |||
num_heads=num_heads, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
attn_drop=attn_drop, | |||
proj_drop=drop) | |||
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here | |||
self.drop_path = DropPath( | |||
drop_path) if drop_path > 0. else nn.Identity() | |||
self.norm2 = norm_layer(dim) | |||
mlp_hidden_dim = int(dim * mlp_ratio) | |||
self.mlp = Mlp( | |||
in_features=dim, | |||
hidden_features=mlp_hidden_dim, | |||
act_layer=act_layer, | |||
drop=drop) | |||
def forward(self, x): | |||
x = x + self.drop_path(self.attn(self.norm1(x))) | |||
x = x + self.drop_path(self.mlp(self.norm2(x))) | |||
return x | |||
class Attention(nn.Module): | |||
def __init__(self, | |||
dim, | |||
num_heads=8, | |||
qkv_bias=False, | |||
qk_scale=None, | |||
attn_drop=0., | |||
proj_drop=0.): | |||
super().__init__() | |||
self.num_heads = num_heads | |||
head_dim = dim // num_heads | |||
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights | |||
self.scale = qk_scale or head_dim**-0.5 | |||
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) | |||
self.attn_drop = nn.Dropout(attn_drop) | |||
self.proj = nn.Linear(dim, dim) | |||
self.proj_drop = nn.Dropout(proj_drop) | |||
def forward(self, x): | |||
B, N, C = x.shape | |||
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, | |||
C // self.num_heads).permute(2, 0, 3, 1, 4) | |||
q, k, v = qkv[0], qkv[1], qkv[ | |||
2] # make torchscript happy (cannot use tensor as tuple) | |||
attn = (q @ k.transpose(-2, -1)) * self.scale | |||
attn = attn.softmax(dim=-1) | |||
attn = self.attn_drop(attn) | |||
x = (attn @ v).transpose(1, 2).reshape(B, N, C) | |||
x = self.proj(x) | |||
x = self.proj_drop(x) | |||
return x | |||
class DropPath(nn.Module): | |||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). | |||
""" | |||
def __init__(self, drop_prob=None): | |||
super(DropPath, self).__init__() | |||
self.drop_prob = drop_prob | |||
def forward(self, x): | |||
return drop_path(x, self.drop_prob, self.training) | |||
class Mlp(nn.Module): | |||
def __init__(self, | |||
in_features, | |||
hidden_features=None, | |||
out_features=None, | |||
act_layer=nn.GELU, | |||
drop=0.): | |||
super().__init__() | |||
out_features = out_features or in_features | |||
hidden_features = hidden_features or in_features | |||
self.fc1 = nn.Linear(in_features, hidden_features) | |||
self.act = act_layer() | |||
self.fc2 = nn.Linear(hidden_features, out_features) | |||
self.drop = nn.Dropout(drop) | |||
def forward(self, x): | |||
x = self.fc1(x) | |||
x = self.act(x) | |||
x = self.drop(x) | |||
x = self.fc2(x) | |||
x = self.drop(x) | |||
return x |
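# Minimal smoke test (illustrative, not part of the diff) for the TransReID backbone
# defined in this file, using the default stem_conv=False path so the IBN block
# (defined elsewhere in the original repo) is not required. With a 256x128 input and
# stride 16 the model returns a global CLS feature plus three part features.
import torch

model = vit_base_patch16_224_TransReID(img_size=(256, 128), stride_size=16)
model.eval()
with torch.no_grad():
    g, p1, p2, p3 = model(torch.randn(2, 3, 256, 128))
print(g.shape, p1.shape)   # torch.Size([2, 768]) torch.Size([2, 768])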
@@ -0,0 +1,24 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .semantic_seg_model import SemanticSegmentation | |||
from .segformer import Segformer | |||
else: | |||
_import_structure = { | |||
'semantic_seg_model': ['SemanticSegmentation'], | |||
'segformer': ['Segformer'] | |||
} | |||
import sys | |||
sys.modules[__name__] = LazyImportModule( | |||
__name__, | |||
globals()['__file__'], | |||
_import_structure, | |||
module_spec=__spec__, | |||
extra_objects={}, | |||
) |
@@ -0,0 +1 @@ | |||
from .maskformer_semantic_head import MaskFormerSemanticHead |
@@ -0,0 +1,47 @@ | |||
# Copyright (c) OpenMMLab. All rights reserved. | |||
from abc import ABCMeta, abstractmethod | |||
from mmcv.runner import BaseModule | |||
from mmdet.models.builder import build_loss | |||
class BasePanopticFusionHead(BaseModule, metaclass=ABCMeta): | |||
"""Base class for panoptic heads.""" | |||
def __init__(self, | |||
num_things_classes=80, | |||
num_stuff_classes=53, | |||
test_cfg=None, | |||
loss_panoptic=None, | |||
init_cfg=None, | |||
**kwargs): | |||
super(BasePanopticFusionHead, self).__init__(init_cfg) | |||
self.num_things_classes = num_things_classes | |||
self.num_stuff_classes = num_stuff_classes | |||
self.num_classes = num_things_classes + num_stuff_classes | |||
self.test_cfg = test_cfg | |||
if loss_panoptic: | |||
self.loss_panoptic = build_loss(loss_panoptic) | |||
else: | |||
self.loss_panoptic = None | |||
@property | |||
def with_loss(self): | |||
"""bool: whether the panoptic head contains loss function.""" | |||
return self.loss_panoptic is not None | |||
@abstractmethod | |||
def forward_train(self, gt_masks=None, gt_semantic_seg=None, **kwargs): | |||
"""Forward function during training.""" | |||
@abstractmethod | |||
def simple_test(self, | |||
img_metas, | |||
det_labels, | |||
mask_preds, | |||
seg_preds, | |||
det_bboxes, | |||
cfg=None, | |||
**kwargs): | |||
"""Test without augmentation.""" |
@@ -0,0 +1,57 @@ | |||
import torch | |||
import torch.nn.functional as F | |||
from mmdet.models.builder import HEADS | |||
from .base_panoptic_fusion_head import BasePanopticFusionHead | |||
@HEADS.register_module() | |||
class MaskFormerSemanticHead(BasePanopticFusionHead): | |||
def __init__(self, | |||
num_things_classes=80, | |||
num_stuff_classes=53, | |||
test_cfg=None, | |||
loss_panoptic=None, | |||
init_cfg=None, | |||
**kwargs): | |||
super().__init__(num_things_classes, num_stuff_classes, test_cfg, | |||
loss_panoptic, init_cfg, **kwargs) | |||
def forward_train(self, **kwargs): | |||
"""MaskFormerFusionHead has no training loss.""" | |||
return dict() | |||
def simple_test(self, | |||
mask_cls_results, | |||
mask_pred_results, | |||
img_metas, | |||
rescale=False, | |||
**kwargs): | |||
results = [] | |||
for mask_cls_result, mask_pred_result, meta in zip( | |||
mask_cls_results, mask_pred_results, img_metas): | |||
# remove padding | |||
img_height, img_width = meta['img_shape'][:2] | |||
mask_pred_result = mask_pred_result[:, :img_height, :img_width] | |||
if rescale: | |||
# return result in original resolution | |||
ori_height, ori_width = meta['ori_shape'][:2] | |||
mask_pred_result = F.interpolate( | |||
mask_pred_result[:, None], | |||
size=(ori_height, ori_width), | |||
mode='bilinear', | |||
align_corners=False)[:, 0] | |||
# semantic inference | |||
cls_score = F.softmax(mask_cls_result, dim=-1)[..., :-1] | |||
mask_pred = mask_pred_result.sigmoid() | |||
seg_mask = torch.einsum('qc,qhw->chw', cls_score, mask_pred) | |||
# still need softmax and argmax | |||
seg_logit = F.softmax(seg_mask, dim=0) | |||
seg_pred = seg_logit.argmax(dim=0) | |||
seg_pred = seg_pred.cpu().numpy() | |||
results.append(seg_pred) | |||
return results |
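# Shape illustration (not part of the diff) of the semantic-inference einsum used in
# simple_test above: per-query class scores (Q, C) and per-query masks (Q, H, W) are
# combined into a per-class map (C, H, W), which is then softmaxed and argmaxed.
# Q=100 queries and C=150 classes are arbitrary stand-in sizes.
import torch
import torch.nn.functional as F

Q, C, H, W = 100, 150, 4, 4
cls_score = F.softmax(torch.randn(Q, C + 1), dim=-1)[..., :-1]   # drop the void class
mask_pred = torch.randn(Q, H, W).sigmoid()
seg_mask = torch.einsum('qc,qhw->chw', cls_score, mask_pred)
seg_pred = F.softmax(seg_mask, dim=0).argmax(dim=0)
print(seg_mask.shape, seg_pred.shape)   # torch.Size([150, 4, 4]) torch.Size([4, 4])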
@@ -0,0 +1,16 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from easycv.models.segmentation import EncoderDecoder | |||
from modelscope.metainfo import Models | |||
from modelscope.models.builder import MODELS | |||
from modelscope.models.cv.easycv_base import EasyCVBaseModel | |||
from modelscope.utils.constant import Tasks | |||
@MODELS.register_module( | |||
group_key=Tasks.image_segmentation, module_name=Models.segformer) | |||
class Segformer(EasyCVBaseModel, EncoderDecoder): | |||
def __init__(self, model_dir=None, *args, **kwargs): | |||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs) | |||
EncoderDecoder.__init__(self, *args, **kwargs) |
@@ -0,0 +1,76 @@ | |||
import os.path as osp | |||
import numpy as np | |||
import torch | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base.base_torch_model import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.models.cv.image_semantic_segmentation import (pan_merge, | |||
vit_adapter) | |||
from modelscope.outputs import OutputKeys | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
@MODELS.register_module( | |||
Tasks.image_segmentation, module_name=Models.swinL_semantic_segmentation) | |||
@MODELS.register_module( | |||
Tasks.image_segmentation, | |||
module_name=Models.vitadapter_semantic_segmentation) | |||
class SemanticSegmentation(TorchModel): | |||
def __init__(self, model_dir: str, **kwargs): | |||
"""str -- model file root.""" | |||
super().__init__(model_dir, **kwargs) | |||
from mmcv.runner import load_checkpoint | |||
import mmcv | |||
from mmdet.models import build_detector | |||
config = osp.join(model_dir, 'mmcv_config.py') | |||
cfg = mmcv.Config.fromfile(config) | |||
if 'pretrained' in cfg.model: | |||
cfg.model.pretrained = None | |||
elif 'init_cfg' in cfg.model.backbone: | |||
cfg.model.backbone.init_cfg = None | |||
# build model | |||
cfg.model.train_cfg = None | |||
self.model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) | |||
# load model | |||
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
_ = load_checkpoint(self.model, model_path, map_location='cpu') | |||
self.CLASSES = cfg['CLASSES'] # list | |||
self.PALETTE = cfg['PALETTE'] # list | |||
self.num_classes = len(self.CLASSES) | |||
self.cfg = cfg | |||
def forward(self, Inputs): | |||
return self.model(**Inputs) | |||
def postprocess(self, Inputs): | |||
semantic_result = Inputs[0] | |||
ids = np.unique(semantic_result)[::-1] | |||
legal_indices = ids != self.model.num_classes # for VOID label | |||
ids = ids[legal_indices] | |||
segms = (semantic_result[None] == ids[:, None, None]) | |||
masks = [it.astype(int) for it in segms] | |||
labels_txt = np.array(self.CLASSES)[ids].tolist() | |||
results = { | |||
OutputKeys.MASKS: masks, | |||
OutputKeys.LABELS: labels_txt, | |||
OutputKeys.SCORES: [0.999 for _ in range(len(labels_txt))] | |||
} | |||
return results | |||
def inference(self, data): | |||
with torch.no_grad(): | |||
results = self.model(return_loss=False, rescale=True, **data) | |||
return results |
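# Small illustration (not from the diff) of the mask-splitting step in postprocess
# above: a (H, W) label map is broadcast-compared against the present class ids to
# produce one boolean mask per class.
import numpy as np

semantic_result = np.array([[0, 0, 2],
                            [2, 1, 1]])
ids = np.unique(semantic_result)[::-1]                  # array([2, 1, 0])
segms = semantic_result[None] == ids[:, None, None]     # shape (3, 2, 3), one mask per id
print([m.astype(int).tolist() for m in segms])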
@@ -0,0 +1,3 @@ | |||
from .models import backbone, decode_heads, segmentors | |||
from .utils import (ResizeToMultiple, add_prefix, build_pixel_sampler, | |||
seg_resize) |
@@ -0,0 +1,3 @@ | |||
from .backbone import BASEBEiT, BEiTAdapter | |||
from .decode_heads import Mask2FormerHeadFromMMSeg | |||
from .segmentors import EncoderDecoderMask2Former |
@@ -0,0 +1,4 @@ | |||
from .base import BASEBEiT | |||
from .beit_adapter import BEiTAdapter | |||
__all__ = ['BEiTAdapter', 'BASEBEiT'] |
@@ -0,0 +1,523 @@ | |||
# The implementation refers to the VitAdapter | |||
# available at | |||
# https://github.com/czczup/ViT-Adapter.git | |||
import logging | |||
from functools import partial | |||
import torch | |||
import torch.nn as nn | |||
import torch.utils.checkpoint as cp | |||
from mmdet.models.utils.transformer import MultiScaleDeformableAttention | |||
from timm.models.layers import DropPath | |||
_logger = logging.getLogger(__name__) | |||
def get_reference_points(spatial_shapes, device): | |||
reference_points_list = [] | |||
for lvl, (H_, W_) in enumerate(spatial_shapes): | |||
ref_y, ref_x = torch.meshgrid( | |||
torch.linspace( | |||
0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), | |||
torch.linspace( | |||
0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)) | |||
ref_y = ref_y.reshape(-1)[None] / H_ | |||
ref_x = ref_x.reshape(-1)[None] / W_ | |||
ref = torch.stack((ref_x, ref_y), -1) | |||
reference_points_list.append(ref) | |||
reference_points = torch.cat(reference_points_list, 1) | |||
reference_points = reference_points[:, :, None] | |||
return reference_points | |||
def deform_inputs(x): | |||
bs, c, h, w = x.shape | |||
spatial_shapes = torch.as_tensor([(h // 8, w // 8), (h // 16, w // 16), | |||
(h // 32, w // 32)], | |||
dtype=torch.long, | |||
device=x.device) | |||
level_start_index = torch.cat((spatial_shapes.new_zeros( | |||
(1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) | |||
reference_points = get_reference_points([(h // 16, w // 16)], x.device) | |||
deform_inputs1 = [reference_points, spatial_shapes, level_start_index] | |||
spatial_shapes = torch.as_tensor([(h // 16, w // 16)], | |||
dtype=torch.long, | |||
device=x.device) | |||
level_start_index = torch.cat((spatial_shapes.new_zeros( | |||
(1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) | |||
reference_points = get_reference_points([(h // 8, w // 8), | |||
(h // 16, w // 16), | |||
(h // 32, w // 32)], x.device) | |||
deform_inputs2 = [reference_points, spatial_shapes, level_start_index] | |||
return deform_inputs1, deform_inputs2 | |||
class ConvFFN(nn.Module): | |||
def __init__(self, | |||
in_features, | |||
hidden_features=None, | |||
out_features=None, | |||
act_layer=nn.GELU, | |||
drop=0.): | |||
super().__init__() | |||
out_features = out_features or in_features | |||
hidden_features = hidden_features or in_features | |||
self.fc1 = nn.Linear(in_features, hidden_features) | |||
self.dwconv = DWConv(hidden_features) | |||
self.act = act_layer() | |||
self.fc2 = nn.Linear(hidden_features, out_features) | |||
self.drop = nn.Dropout(drop) | |||
def forward(self, x, H, W): | |||
x = self.fc1(x) | |||
x = self.dwconv(x, H, W) | |||
x = self.act(x) | |||
x = self.drop(x) | |||
x = self.fc2(x) | |||
x = self.drop(x) | |||
return x | |||
class DWConv(nn.Module): | |||
def __init__(self, dim=768): | |||
super().__init__() | |||
self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) | |||
def forward(self, x, H, W): | |||
B, N, C = x.shape | |||
n = N // 21 | |||
x1 = x[:, 0:16 * n, :].transpose(1, 2).view(B, C, H * 2, | |||
W * 2).contiguous() | |||
x2 = x[:, 16 * n:20 * n, :].transpose(1, 2).view(B, C, H, | |||
W).contiguous() | |||
x3 = x[:, 20 * n:, :].transpose(1, 2).view(B, C, H // 2, | |||
W // 2).contiguous() | |||
x1 = self.dwconv(x1).flatten(2).transpose(1, 2) | |||
x2 = self.dwconv(x2).flatten(2).transpose(1, 2) | |||
x3 = self.dwconv(x3).flatten(2).transpose(1, 2) | |||
x = torch.cat([x1, x2, x3], dim=1) | |||
return x | |||
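# Minimal shape check (illustrative): DWConv expects the concatenation of the three flattened SPM levels, | |||
# so N is a multiple of 21 -- 16n tokens from the stride-8 map, 4n from stride-16 and n from stride-32, | |||
# where n = H * W // 4 for the stride-16 grid (H, W). | |||
if __name__ == '__main__': | |||
    H = W = 32                                           # stride-16 token grid | |||
    N = (2 * H) * (2 * W) + H * W + (H // 2) * (W // 2)  # = 21 * (H * W // 4) | |||
    out = DWConv(dim=8)(torch.randn(2, N, 8), H, W) | |||
    assert out.shape == (2, N, 8) | |||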
class Extractor(nn.Module): | |||
def __init__(self, | |||
dim, | |||
num_heads=6, | |||
n_points=4, | |||
n_levels=1, | |||
deform_ratio=1.0, | |||
with_cffn=True, | |||
cffn_ratio=0.25, | |||
drop=0., | |||
drop_path=0., | |||
norm_layer=partial(nn.LayerNorm, eps=1e-6), | |||
with_cp=False): | |||
super().__init__() | |||
self.query_norm = norm_layer(dim) | |||
self.feat_norm = norm_layer(dim) | |||
self.attn = MultiScaleDeformableAttention( | |||
embed_dims=dim, | |||
num_heads=num_heads, | |||
num_levels=n_levels, | |||
num_points=n_points, | |||
batch_first=True) | |||
# rebuild value_proj/output_proj so the value dimension is scaled by deform_ratio | |||
value_proj_in_features = self.attn.value_proj.weight.shape[0] | |||
value_proj_out_features = int(value_proj_in_features * deform_ratio) | |||
self.attn.value_proj = nn.Linear(value_proj_in_features, | |||
value_proj_out_features) | |||
self.attn.output_proj = nn.Linear(value_proj_out_features, | |||
value_proj_in_features) | |||
self.with_cffn = with_cffn | |||
self.with_cp = with_cp | |||
if with_cffn: | |||
self.ffn = ConvFFN( | |||
in_features=dim, | |||
hidden_features=int(dim * cffn_ratio), | |||
drop=drop) | |||
self.ffn_norm = norm_layer(dim) | |||
self.drop_path = DropPath( | |||
drop_path) if drop_path > 0. else nn.Identity() | |||
def forward(self, query, reference_points, feat, spatial_shapes, | |||
level_start_index, H, W): | |||
def _inner_forward(query, feat): | |||
attn = self.attn( | |||
query=self.query_norm(query), | |||
key=None, | |||
value=self.feat_norm(feat), | |||
identity=None, | |||
query_pos=None, | |||
key_padding_mask=None, | |||
reference_points=reference_points, | |||
spatial_shapes=spatial_shapes, | |||
level_start_index=level_start_index) | |||
query = query + attn | |||
if self.with_cffn: | |||
query = query + self.drop_path( | |||
self.ffn(self.ffn_norm(query), H, W)) | |||
return query | |||
if self.with_cp and query.requires_grad: | |||
query = cp.checkpoint(_inner_forward, query, feat) | |||
else: | |||
query = _inner_forward(query, feat) | |||
return query | |||
class Injector(nn.Module): | |||
def __init__(self, | |||
dim, | |||
num_heads=6, | |||
n_points=4, | |||
n_levels=1, | |||
deform_ratio=1.0, | |||
norm_layer=partial(nn.LayerNorm, eps=1e-6), | |||
init_values=0., | |||
with_cp=False): | |||
super().__init__() | |||
self.with_cp = with_cp | |||
self.query_norm = norm_layer(dim) | |||
self.feat_norm = norm_layer(dim) | |||
self.attn = MultiScaleDeformableAttention( | |||
embed_dims=dim, | |||
num_heads=num_heads, | |||
num_levels=n_levels, | |||
num_points=n_points, | |||
batch_first=True) | |||
# rebuild value_proj/output_proj so the value dimension is scaled by deform_ratio | |||
value_proj_in_features = self.attn.value_proj.weight.shape[0] | |||
value_proj_out_features = int(value_proj_in_features * deform_ratio) | |||
self.attn.value_proj = nn.Linear(value_proj_in_features, | |||
value_proj_out_features) | |||
self.attn.output_proj = nn.Linear(value_proj_out_features, | |||
value_proj_in_features) | |||
self.gamma = nn.Parameter( | |||
init_values * torch.ones((dim)), requires_grad=True) | |||
def forward(self, query, reference_points, feat, spatial_shapes, | |||
level_start_index): | |||
def _inner_forward(query, feat): | |||
input_query = self.query_norm(query) | |||
input_value = self.feat_norm(feat) | |||
attn = self.attn( | |||
query=input_query, | |||
key=None, | |||
value=input_value, | |||
identity=None, | |||
query_pos=None, | |||
key_padding_mask=None, | |||
reference_points=reference_points, | |||
spatial_shapes=spatial_shapes, | |||
level_start_index=level_start_index) | |||
return query + self.gamma * attn | |||
if self.with_cp and query.requires_grad: | |||
query = cp.checkpoint(_inner_forward, query, feat) | |||
else: | |||
query = _inner_forward(query, feat) | |||
return query | |||
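# Illustrative note (not part of the original module): `gamma` acts as a per-channel layer scale on the | |||
# injected features. With the default init_values=0. it starts at zero, so `query + gamma * attn` is | |||
# initially an identity mapping of the ViT tokens and the SPM signal is blended in only as gamma is learned. | |||
if __name__ == '__main__': | |||
    gamma = torch.zeros(16)                       # init_values = 0. | |||
    query, attn = torch.randn(2, 10, 16), torch.randn(2, 10, 16) | |||
    assert torch.equal(query + gamma * attn, query) | |||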
class InteractionBlock(nn.Module): | |||
def __init__(self, | |||
dim, | |||
num_heads=6, | |||
n_points=4, | |||
norm_layer=partial(nn.LayerNorm, eps=1e-6), | |||
drop=0., | |||
drop_path=0., | |||
with_cffn=True, | |||
cffn_ratio=0.25, | |||
init_values=0., | |||
deform_ratio=1.0, | |||
extra_extractor=False, | |||
with_cp=False): | |||
super().__init__() | |||
self.injector = Injector( | |||
dim=dim, | |||
n_levels=3, | |||
num_heads=num_heads, | |||
init_values=init_values, | |||
n_points=n_points, | |||
norm_layer=norm_layer, | |||
deform_ratio=deform_ratio, | |||
with_cp=with_cp) | |||
self.extractor = Extractor( | |||
dim=dim, | |||
n_levels=1, | |||
num_heads=num_heads, | |||
n_points=n_points, | |||
norm_layer=norm_layer, | |||
deform_ratio=deform_ratio, | |||
with_cffn=with_cffn, | |||
cffn_ratio=cffn_ratio, | |||
drop=drop, | |||
drop_path=drop_path, | |||
with_cp=with_cp) | |||
if extra_extractor: | |||
self.extra_extractors = nn.Sequential(*[ | |||
Extractor( | |||
dim=dim, | |||
num_heads=num_heads, | |||
n_points=n_points, | |||
norm_layer=norm_layer, | |||
with_cffn=with_cffn, | |||
cffn_ratio=cffn_ratio, | |||
deform_ratio=deform_ratio, | |||
drop=drop, | |||
drop_path=drop_path, | |||
with_cp=with_cp) for _ in range(2) | |||
]) | |||
else: | |||
self.extra_extractors = None | |||
def forward(self, x, c, blocks, deform_inputs1, deform_inputs2, H, W): | |||
x = self.injector( | |||
query=x, | |||
reference_points=deform_inputs1[0], | |||
feat=c, | |||
spatial_shapes=deform_inputs1[1], | |||
level_start_index=deform_inputs1[2]) | |||
for idx, blk in enumerate(blocks): | |||
x = blk(x, H, W) | |||
c = self.extractor( | |||
query=c, | |||
reference_points=deform_inputs2[0], | |||
feat=x, | |||
spatial_shapes=deform_inputs2[1], | |||
level_start_index=deform_inputs2[2], | |||
H=H, | |||
W=W) | |||
if self.extra_extractors is not None: | |||
for extractor in self.extra_extractors: | |||
c = extractor( | |||
query=c, | |||
reference_points=deform_inputs2[0], | |||
feat=x, | |||
spatial_shapes=deform_inputs2[1], | |||
level_start_index=deform_inputs2[2], | |||
H=H, | |||
W=W) | |||
return x, c | |||
class InteractionBlockWithCls(nn.Module): | |||
def __init__(self, | |||
dim, | |||
num_heads=6, | |||
n_points=4, | |||
norm_layer=partial(nn.LayerNorm, eps=1e-6), | |||
drop=0., | |||
drop_path=0., | |||
with_cffn=True, | |||
cffn_ratio=0.25, | |||
init_values=0., | |||
deform_ratio=1.0, | |||
extra_extractor=False, | |||
with_cp=False): | |||
super().__init__() | |||
self.injector = Injector( | |||
dim=dim, | |||
n_levels=3, | |||
num_heads=num_heads, | |||
init_values=init_values, | |||
n_points=n_points, | |||
norm_layer=norm_layer, | |||
deform_ratio=deform_ratio, | |||
with_cp=with_cp) | |||
self.extractor = Extractor( | |||
dim=dim, | |||
n_levels=1, | |||
num_heads=num_heads, | |||
n_points=n_points, | |||
norm_layer=norm_layer, | |||
deform_ratio=deform_ratio, | |||
with_cffn=with_cffn, | |||
cffn_ratio=cffn_ratio, | |||
drop=drop, | |||
drop_path=drop_path, | |||
with_cp=with_cp) | |||
if extra_extractor: | |||
self.extra_extractors = nn.Sequential(*[ | |||
Extractor( | |||
dim=dim, | |||
num_heads=num_heads, | |||
n_points=n_points, | |||
norm_layer=norm_layer, | |||
with_cffn=with_cffn, | |||
cffn_ratio=cffn_ratio, | |||
deform_ratio=deform_ratio, | |||
drop=drop, | |||
drop_path=drop_path, | |||
with_cp=with_cp) for _ in range(2) | |||
]) | |||
else: | |||
self.extra_extractors = None | |||
def forward(self, x, c, cls, blocks, deform_inputs1, deform_inputs2, H, W): | |||
x = self.injector( | |||
query=x, | |||
reference_points=deform_inputs1[0], | |||
feat=c, | |||
spatial_shapes=deform_inputs1[1], | |||
level_start_index=deform_inputs1[2]) | |||
x = torch.cat((cls, x), dim=1) | |||
for idx, blk in enumerate(blocks): | |||
x = blk(x, H, W) | |||
cls, x = x[:, :1, ], x[:, 1:, ] | |||
c = self.extractor( | |||
query=c, | |||
reference_points=deform_inputs2[0], | |||
feat=x, | |||
spatial_shapes=deform_inputs2[1], | |||
level_start_index=deform_inputs2[2], | |||
H=H, | |||
W=W) | |||
if self.extra_extractors is not None: | |||
for extractor in self.extra_extractors: | |||
c = extractor( | |||
query=c, | |||
reference_points=deform_inputs2[0], | |||
feat=x, | |||
spatial_shapes=deform_inputs2[1], | |||
level_start_index=deform_inputs2[2], | |||
H=H, | |||
W=W) | |||
return x, c, cls | |||
class SpatialPriorModule(nn.Module): | |||
def __init__(self, inplanes=64, embed_dim=384, with_cp=False): | |||
super().__init__() | |||
self.with_cp = with_cp | |||
self.stem = nn.Sequential(*[ | |||
nn.Conv2d( | |||
3, inplanes, kernel_size=3, stride=2, padding=1, bias=False), | |||
nn.SyncBatchNorm(inplanes), | |||
nn.ReLU(inplace=True), | |||
nn.Conv2d( | |||
inplanes, | |||
inplanes, | |||
kernel_size=3, | |||
stride=1, | |||
padding=1, | |||
bias=False), | |||
nn.SyncBatchNorm(inplanes), | |||
nn.ReLU(inplace=True), | |||
nn.Conv2d( | |||
inplanes, | |||
inplanes, | |||
kernel_size=3, | |||
stride=1, | |||
padding=1, | |||
bias=False), | |||
nn.SyncBatchNorm(inplanes), | |||
nn.ReLU(inplace=True), | |||
nn.MaxPool2d(kernel_size=3, stride=2, padding=1) | |||
]) | |||
self.conv2 = nn.Sequential(*[ | |||
nn.Conv2d( | |||
inplanes, | |||
2 * inplanes, | |||
kernel_size=3, | |||
stride=2, | |||
padding=1, | |||
bias=False), | |||
nn.SyncBatchNorm(2 * inplanes), | |||
nn.ReLU(inplace=True) | |||
]) | |||
self.conv3 = nn.Sequential(*[ | |||
nn.Conv2d( | |||
2 * inplanes, | |||
4 * inplanes, | |||
kernel_size=3, | |||
stride=2, | |||
padding=1, | |||
bias=False), | |||
nn.SyncBatchNorm(4 * inplanes), | |||
nn.ReLU(inplace=True) | |||
]) | |||
self.conv4 = nn.Sequential(*[ | |||
nn.Conv2d( | |||
4 * inplanes, | |||
4 * inplanes, | |||
kernel_size=3, | |||
stride=2, | |||
padding=1, | |||
bias=False), | |||
nn.SyncBatchNorm(4 * inplanes), | |||
nn.ReLU(inplace=True) | |||
]) | |||
self.fc1 = nn.Conv2d( | |||
inplanes, embed_dim, kernel_size=1, stride=1, padding=0, bias=True) | |||
self.fc2 = nn.Conv2d( | |||
2 * inplanes, | |||
embed_dim, | |||
kernel_size=1, | |||
stride=1, | |||
padding=0, | |||
bias=True) | |||
self.fc3 = nn.Conv2d( | |||
4 * inplanes, | |||
embed_dim, | |||
kernel_size=1, | |||
stride=1, | |||
padding=0, | |||
bias=True) | |||
self.fc4 = nn.Conv2d( | |||
4 * inplanes, | |||
embed_dim, | |||
kernel_size=1, | |||
stride=1, | |||
padding=0, | |||
bias=True) | |||
def forward(self, x): | |||
def _inner_forward(x): | |||
c1 = self.stem(x) | |||
c2 = self.conv2(c1) | |||
c3 = self.conv3(c2) | |||
c4 = self.conv4(c3) | |||
c1 = self.fc1(c1) | |||
c2 = self.fc2(c2) | |||
c3 = self.fc3(c3) | |||
c4 = self.fc4(c4) | |||
bs, dim, _, _ = c1.shape | |||
c2 = c2.view(bs, dim, -1).transpose(1, 2) # 8s | |||
c3 = c3.view(bs, dim, -1).transpose(1, 2) # 16s | |||
c4 = c4.view(bs, dim, -1).transpose(1, 2) # 32s | |||
return c1, c2, c3, c4 | |||
if self.with_cp and x.requires_grad: | |||
outs = cp.checkpoint(_inner_forward, x) | |||
else: | |||
outs = _inner_forward(x) | |||
return outs |
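# Illustrative note: the SPM stem plus conv2-conv4 yield a pyramid at strides 4/8/16/32, all projected to | |||
# embed_dim by the 1x1 fc layers. After flattening, c2 + c3 + c4 contribute | |||
# (H/8)(W/8) + (H/16)(W/16) + (H/32)(W/32) = 21 * (H/32)(W/32) tokens, which is exactly the N // 21 split | |||
# DWConv above relies on; c1 keeps its 2D layout at stride 4. A quick arithmetic check for a 512x512 input: | |||
if __name__ == '__main__': | |||
    H = W = 512 | |||
    tokens = (H // 8) * (W // 8) + (H // 16) * (W // 16) + (H // 32) * (W // 32) | |||
    assert tokens == 21 * (H // 32) * (W // 32) | |||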
@@ -0,0 +1,3 @@ | |||
from .beit import BASEBEiT | |||
__all__ = ['BASEBEiT'] |
@@ -0,0 +1,476 @@ | |||
# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) | |||
# Github source: https://github.com/microsoft/unilm/tree/master/beit | |||
# This implementation refers to | |||
# https://github.com/czczup/ViT-Adapter.git | |||
import math | |||
from functools import partial | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import torch.utils.checkpoint as cp | |||
from mmcv.runner import _load_checkpoint | |||
from mmdet.models.builder import BACKBONES | |||
from mmdet.utils import get_root_logger | |||
from timm.models.layers import drop_path, to_2tuple, trunc_normal_ | |||
class DropPath(nn.Module): | |||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of | |||
residual blocks).""" | |||
def __init__(self, drop_prob=None): | |||
super(DropPath, self).__init__() | |||
self.drop_prob = drop_prob | |||
def forward(self, x): | |||
return drop_path(x, self.drop_prob, self.training) | |||
def extra_repr(self) -> str: | |||
return 'p={}'.format(self.drop_prob) | |||
class Mlp(nn.Module): | |||
def __init__(self, | |||
in_features, | |||
hidden_features=None, | |||
out_features=None, | |||
act_layer=nn.GELU, | |||
drop=0.): | |||
super().__init__() | |||
out_features = out_features or in_features | |||
hidden_features = hidden_features or in_features | |||
self.fc1 = nn.Linear(in_features, hidden_features) | |||
self.act = act_layer() | |||
self.fc2 = nn.Linear(hidden_features, out_features) | |||
self.drop = nn.Dropout(drop) | |||
def forward(self, x): | |||
x = self.fc1(x) | |||
x = self.act(x) | |||
# dropout after the activation is omitted here, following the original BERT implementation | |||
x = self.fc2(x) | |||
x = self.drop(x) | |||
return x | |||
class Attention(nn.Module): | |||
def __init__(self, | |||
dim, | |||
num_heads=8, | |||
qkv_bias=False, | |||
qk_scale=None, | |||
attn_drop=0., | |||
proj_drop=0., | |||
window_size=None, | |||
attn_head_dim=None): | |||
super().__init__() | |||
self.num_heads = num_heads | |||
head_dim = dim // num_heads | |||
if attn_head_dim is not None: | |||
head_dim = attn_head_dim | |||
all_head_dim = head_dim * self.num_heads | |||
# NOTE: qk_scale can be set manually to stay compatible with previously trained weights | |||
self.scale = qk_scale or head_dim**-0.5 | |||
self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) | |||
if qkv_bias: | |||
self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) | |||
self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) | |||
else: | |||
self.q_bias = None | |||
self.v_bias = None | |||
if window_size: | |||
self.window_size = window_size | |||
self.num_relative_distance = (2 * window_size[0] | |||
- 1) * (2 * window_size[1] - 1) + 3 | |||
self.relative_position_bias_table = nn.Parameter( | |||
torch.zeros(self.num_relative_distance, | |||
num_heads)) # 2*Wh-1 * 2*Ww-1, nH | |||
# cls to token, token to cls, and cls to cls | |||
# get pair-wise relative position index for each token inside the window | |||
coords_h = torch.arange(window_size[0]) | |||
coords_w = torch.arange(window_size[1]) | |||
coords = torch.stack(torch.meshgrid([coords_h, | |||
coords_w])) # 2, Wh, Ww | |||
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww | |||
relative_coords = coords_flatten[:, :, | |||
None] - coords_flatten[:, | |||
None, :] # 2, Wh*Ww, Wh*Ww | |||
relative_coords = relative_coords.permute( | |||
1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 | |||
relative_coords[:, :, | |||
0] += window_size[0] - 1 # shift to start from 0 | |||
relative_coords[:, :, 1] += window_size[1] - 1 | |||
relative_coords[:, :, 0] *= 2 * window_size[1] - 1 | |||
relative_position_index = \ | |||
torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) | |||
relative_position_index[1:, 1:] = relative_coords.sum( | |||
-1) # Wh*Ww, Wh*Ww | |||
relative_position_index[0, 0:] = self.num_relative_distance - 3 | |||
relative_position_index[0:, 0] = self.num_relative_distance - 2 | |||
relative_position_index[0, 0] = self.num_relative_distance - 1 | |||
self.register_buffer('relative_position_index', | |||
relative_position_index) | |||
else: | |||
self.window_size = None | |||
self.relative_position_bias_table = None | |||
self.relative_position_index = None | |||
self.attn_drop = nn.Dropout(attn_drop) | |||
self.proj = nn.Linear(all_head_dim, dim) | |||
self.proj_drop = nn.Dropout(proj_drop) | |||
def forward(self, x, rel_pos_bias=None): | |||
B, N, C = x.shape | |||
qkv_bias = None | |||
if self.q_bias is not None: | |||
qkv_bias = torch.cat( | |||
(self.q_bias, | |||
torch.zeros_like(self.v_bias, | |||
requires_grad=False), self.v_bias)) | |||
qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) | |||
qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) | |||
q, k, v = qkv[0], qkv[1], qkv[ | |||
2] # make torchscript happy (cannot use tensor as tuple) | |||
q = q * self.scale | |||
attn = (q @ k.transpose(-2, -1)) | |||
if self.relative_position_bias_table is not None: | |||
relative_position_bias = \ | |||
self.relative_position_bias_table[self.relative_position_index.view(-1)].view( | |||
self.window_size[0] * self.window_size[1] + 1, | |||
self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH | |||
relative_position_bias = relative_position_bias.permute( | |||
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww | |||
attn = attn + relative_position_bias.unsqueeze(0) | |||
if rel_pos_bias is not None: | |||
attn = attn + rel_pos_bias | |||
attn = attn.softmax(dim=-1) | |||
attn = self.attn_drop(attn) | |||
x = (attn @ v).transpose(1, 2).reshape(B, N, -1) | |||
x = self.proj(x) | |||
x = self.proj_drop(x) | |||
return x | |||
class Block(nn.Module): | |||
def __init__(self, | |||
dim, | |||
num_heads, | |||
mlp_ratio=4., | |||
qkv_bias=False, | |||
qk_scale=None, | |||
drop=0., | |||
attn_drop=0., | |||
drop_path=0., | |||
init_values=None, | |||
act_layer=nn.GELU, | |||
norm_layer=nn.LayerNorm, | |||
window_size=None, | |||
attn_head_dim=None, | |||
with_cp=False): | |||
super().__init__() | |||
self.with_cp = with_cp | |||
self.norm1 = norm_layer(dim) | |||
self.attn = Attention( | |||
dim, | |||
num_heads=num_heads, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
attn_drop=attn_drop, | |||
proj_drop=drop, | |||
window_size=window_size, | |||
attn_head_dim=attn_head_dim) | |||
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here | |||
self.drop_path = DropPath( | |||
drop_path) if drop_path > 0. else nn.Identity() | |||
self.norm2 = norm_layer(dim) | |||
mlp_hidden_dim = int(dim * mlp_ratio) | |||
self.mlp = Mlp( | |||
in_features=dim, | |||
hidden_features=mlp_hidden_dim, | |||
act_layer=act_layer, | |||
drop=drop) | |||
if init_values is not None: | |||
self.gamma_1 = nn.Parameter( | |||
init_values * torch.ones((dim)), requires_grad=True) | |||
self.gamma_2 = nn.Parameter( | |||
init_values * torch.ones((dim)), requires_grad=True) | |||
else: | |||
self.gamma_1, self.gamma_2 = None, None | |||
def forward(self, x, H, W, rel_pos_bias=None): | |||
def _inner_forward(x): | |||
if self.gamma_1 is None: | |||
x = x + self.drop_path( | |||
self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) | |||
x = x + self.drop_path(self.mlp(self.norm2(x))) | |||
else: | |||
x = x + self.drop_path(self.gamma_1 * self.attn( | |||
self.norm1(x), rel_pos_bias=rel_pos_bias)) | |||
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) | |||
return x | |||
if self.with_cp and x.requires_grad: | |||
x = cp.checkpoint(_inner_forward, x) | |||
else: | |||
x = _inner_forward(x) | |||
return x | |||
class PatchEmbed(nn.Module): | |||
""" Image to Patch Embedding | |||
""" | |||
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): | |||
super().__init__() | |||
img_size = to_2tuple(img_size) | |||
patch_size = to_2tuple(patch_size) | |||
num_patches = (img_size[1] // patch_size[1]) * ( | |||
img_size[0] // patch_size[0]) | |||
self.patch_shape = (img_size[0] // patch_size[0], | |||
img_size[1] // patch_size[1]) | |||
self.img_size = img_size | |||
self.patch_size = patch_size | |||
self.num_patches = num_patches | |||
self.proj = nn.Conv2d( | |||
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) | |||
def forward(self, x, **kwargs): | |||
B, C, H, W = x.shape | |||
# FIXME look at relaxing size constraints | |||
# assert H == self.img_size[0] and W == self.img_size[1], \ | |||
# f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." | |||
x = self.proj(x) | |||
Hp, Wp = x.shape[2], x.shape[3] | |||
x = x.flatten(2).transpose(1, 2) | |||
return x, Hp, Wp | |||
class HybridEmbed(nn.Module): | |||
""" CNN Feature Map Embedding | |||
Extract feature map from CNN, flatten, project to embedding dim. | |||
""" | |||
def __init__(self, | |||
backbone, | |||
img_size=224, | |||
feature_size=None, | |||
in_chans=3, | |||
embed_dim=768): | |||
super().__init__() | |||
assert isinstance(backbone, nn.Module) | |||
img_size = to_2tuple(img_size) | |||
self.img_size = img_size | |||
self.backbone = backbone | |||
if feature_size is None: | |||
with torch.no_grad(): | |||
# FIXME this is hacky, but most reliable way of determining the exact dim of the output feature | |||
# map for all networks, the feature metadata has reliable channel and stride info, but using | |||
# stride to calc feature dim requires info about padding of each stage that isn't captured. | |||
training = backbone.training | |||
if training: | |||
backbone.eval() | |||
o = self.backbone( | |||
torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1] | |||
feature_size = o.shape[-2:] | |||
feature_dim = o.shape[1] | |||
backbone.train(training) | |||
else: | |||
feature_size = to_2tuple(feature_size) | |||
feature_dim = self.backbone.feature_info.channels()[-1] | |||
self.num_patches = feature_size[0] * feature_size[1] | |||
self.proj = nn.Linear(feature_dim, embed_dim) | |||
def forward(self, x): | |||
x = self.backbone(x)[-1] | |||
x = x.flatten(2).transpose(1, 2) | |||
x = self.proj(x) | |||
return x | |||
class RelativePositionBias(nn.Module): | |||
def __init__(self, window_size, num_heads): | |||
super().__init__() | |||
self.window_size = window_size | |||
self.num_relative_distance = (2 * window_size[0] | |||
- 1) * (2 * window_size[1] - 1) + 3 | |||
self.relative_position_bias_table = nn.Parameter( | |||
torch.zeros(self.num_relative_distance, | |||
num_heads)) # 2*Wh-1 * 2*Ww-1, nH | |||
# cls to token, token to cls, and cls to cls | |||
# get pair-wise relative position index for each token inside the window | |||
coords_h = torch.arange(window_size[0]) | |||
coords_w = torch.arange(window_size[1]) | |||
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww | |||
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww | |||
relative_coords = coords_flatten[:, :, | |||
None] - coords_flatten[:, | |||
None, :] # 2, Wh*Ww, Wh*Ww | |||
relative_coords = relative_coords.permute( | |||
1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 | |||
relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 | |||
relative_coords[:, :, 1] += window_size[1] - 1 | |||
relative_coords[:, :, 0] *= 2 * window_size[1] - 1 | |||
relative_position_index = \ | |||
torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) | |||
relative_position_index[1:, | |||
1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww | |||
relative_position_index[0, 0:] = self.num_relative_distance - 3 | |||
relative_position_index[0:, 0] = self.num_relative_distance - 2 | |||
relative_position_index[0, 0] = self.num_relative_distance - 1 | |||
self.register_buffer('relative_position_index', | |||
relative_position_index) | |||
def forward(self): | |||
relative_position_bias = \ | |||
self.relative_position_bias_table[self.relative_position_index.view(-1)].view( | |||
self.window_size[0] * self.window_size[1] + 1, | |||
self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH | |||
return relative_position_bias.permute( | |||
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww | |||
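# Illustrative sketch (not part of the original module): for a tiny 2x2 window the shared relative position | |||
# bias has (2*2-1)*(2*2-1) + 3 = 12 learnable distances (the extra 3 cover cls-to-token, token-to-cls and | |||
# cls-to-cls), and forward() returns an additive (num_heads, N + 1, N + 1) bias for the attention logits. | |||
if __name__ == '__main__': | |||
    rpb = RelativePositionBias(window_size=(2, 2), num_heads=3) | |||
    assert rpb.relative_position_bias_table.shape == (12, 3) | |||
    assert rpb().shape == (3, 5, 5)   # 4 patch tokens + 1 cls token | |||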
@BACKBONES.register_module() | |||
class BASEBEiT(nn.Module): | |||
""" Vision Transformer with support for patch or hybrid CNN input stage | |||
""" | |||
def __init__(self, | |||
img_size=512, | |||
patch_size=16, | |||
in_chans=3, | |||
num_classes=80, | |||
embed_dim=768, | |||
depth=12, | |||
num_heads=12, | |||
mlp_ratio=4., | |||
qkv_bias=False, | |||
qk_scale=None, | |||
drop_rate=0., | |||
attn_drop_rate=0., | |||
drop_path_rate=0., | |||
hybrid_backbone=None, | |||
norm_layer=None, | |||
init_values=None, | |||
use_checkpoint=False, | |||
use_abs_pos_emb=False, | |||
use_rel_pos_bias=True, | |||
use_shared_rel_pos_bias=False, | |||
pretrained=None, | |||
with_cp=False): | |||
super().__init__() | |||
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) | |||
self.norm_layer = norm_layer | |||
self.num_classes = num_classes | |||
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models | |||
self.drop_path_rate = drop_path_rate | |||
if hybrid_backbone is not None: | |||
self.patch_embed = HybridEmbed( | |||
hybrid_backbone, | |||
img_size=img_size, | |||
in_chans=in_chans, | |||
embed_dim=embed_dim) | |||
else: | |||
self.patch_embed = PatchEmbed( | |||
img_size=img_size, | |||
patch_size=patch_size, | |||
in_chans=in_chans, | |||
embed_dim=embed_dim) | |||
num_patches = self.patch_embed.num_patches | |||
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) | |||
if use_abs_pos_emb: | |||
self.pos_embed = nn.Parameter( | |||
torch.zeros(1, num_patches + 1, embed_dim)) | |||
else: | |||
self.pos_embed = None | |||
self.pos_drop = nn.Dropout(p=drop_rate) | |||
if use_shared_rel_pos_bias: | |||
self.rel_pos_bias = RelativePositionBias( | |||
window_size=self.patch_embed.patch_shape, num_heads=num_heads) | |||
else: | |||
self.rel_pos_bias = None | |||
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) | |||
] # stochastic depth decay rule | |||
self.use_rel_pos_bias = use_rel_pos_bias | |||
self.use_checkpoint = use_checkpoint | |||
self.blocks = nn.ModuleList([ | |||
Block( | |||
dim=embed_dim, | |||
num_heads=num_heads, | |||
mlp_ratio=mlp_ratio, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
drop=drop_rate, | |||
attn_drop=attn_drop_rate, | |||
drop_path=dpr[i], | |||
norm_layer=norm_layer, | |||
with_cp=with_cp, | |||
init_values=init_values, | |||
window_size=self.patch_embed.patch_shape | |||
if use_rel_pos_bias else None) for i in range(depth) | |||
]) | |||
trunc_normal_(self.cls_token, std=.02) | |||
self.apply(self._init_weights) | |||
self.init_weights(pretrained) | |||
def init_weights(self, pretrained=None): | |||
"""Initialize the weights in backbone. | |||
Args: | |||
pretrained (str, optional): Path to pre-trained weights. | |||
Defaults to None. | |||
""" | |||
if isinstance(pretrained, str): | |||
logger = get_root_logger() | |||
init_cfg = dict(type='Pretrained', checkpoint=pretrained) | |||
checkpoint = _load_checkpoint( | |||
init_cfg['checkpoint'], logger=logger, map_location='cpu') | |||
state_dict = self.resize_rel_pos_embed(checkpoint) | |||
self.load_state_dict(state_dict, False) | |||
def fix_init_weight(self): | |||
def rescale(param, layer_id): | |||
param.div_(math.sqrt(2.0 * layer_id)) | |||
for layer_id, layer in enumerate(self.blocks): | |||
rescale(layer.attn.proj.weight.data, layer_id + 1) | |||
rescale(layer.mlp.fc2.weight.data, layer_id + 1) | |||
def _init_weights(self, m): | |||
if isinstance(m, nn.Linear): | |||
trunc_normal_(m.weight, std=.02) | |||
if isinstance(m, nn.Linear) and m.bias is not None: | |||
nn.init.constant_(m.bias, 0) | |||
elif isinstance(m, nn.LayerNorm): | |||
nn.init.constant_(m.bias, 0) | |||
nn.init.constant_(m.weight, 1.0) | |||
def get_num_layers(self): | |||
return len(self.blocks) |
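# Standalone illustration (not part of the file above): the `dpr` line in BASEBEiT.__init__ is the usual | |||
# stochastic-depth decay rule, i.e. the drop-path probability grows linearly from 0 at the first block to | |||
# drop_path_rate at the last one. | |||
import torch | |||
dpr = [x.item() for x in torch.linspace(0, 0.3, steps=4)] | |||
assert [round(p, 1) for p in dpr] == [0.0, 0.1, 0.2, 0.3] | |||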
@@ -0,0 +1,169 @@ | |||
# The implementation refers to ViT-Adapter | |||
# available at | |||
# https://github.com/czczup/ViT-Adapter.git | |||
import logging | |||
import math | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from mmdet.models.builder import BACKBONES | |||
from mmdet.models.utils.transformer import MultiScaleDeformableAttention | |||
from timm.models.layers import DropPath, trunc_normal_ | |||
from torch.nn.init import normal_ | |||
from .adapter_modules import InteractionBlockWithCls as InteractionBlock | |||
from .adapter_modules import SpatialPriorModule, deform_inputs | |||
from .base.beit import BASEBEiT | |||
_logger = logging.getLogger(__name__) | |||
@BACKBONES.register_module() | |||
class BEiTAdapter(BASEBEiT): | |||
def __init__(self, | |||
pretrain_size=224, | |||
conv_inplane=64, | |||
n_points=4, | |||
deform_num_heads=6, | |||
init_values=0., | |||
cffn_ratio=0.25, | |||
deform_ratio=1.0, | |||
with_cffn=True, | |||
interaction_indexes=None, | |||
add_vit_feature=True, | |||
with_cp=False, | |||
*args, | |||
**kwargs): | |||
super().__init__( | |||
init_values=init_values, with_cp=with_cp, *args, **kwargs) | |||
self.num_block = len(self.blocks) | |||
self.pretrain_size = (pretrain_size, pretrain_size) | |||
self.flags = [ | |||
i for i in range(-1, self.num_block, self.num_block // 4) | |||
][1:] | |||
self.interaction_indexes = interaction_indexes | |||
self.add_vit_feature = add_vit_feature | |||
embed_dim = self.embed_dim | |||
self.level_embed = nn.Parameter(torch.zeros(3, embed_dim)) | |||
self.spm = SpatialPriorModule( | |||
inplanes=conv_inplane, embed_dim=embed_dim, with_cp=False) | |||
self.interactions = nn.Sequential(*[ | |||
InteractionBlock( | |||
dim=embed_dim, | |||
num_heads=deform_num_heads, | |||
n_points=n_points, | |||
init_values=init_values, | |||
drop_path=self.drop_path_rate, | |||
norm_layer=self.norm_layer, | |||
with_cffn=with_cffn, | |||
cffn_ratio=cffn_ratio, | |||
deform_ratio=deform_ratio, | |||
extra_extractor=True if i == len(interaction_indexes) | |||
- 1 else False, | |||
with_cp=with_cp) for i in range(len(interaction_indexes)) | |||
]) | |||
self.up = nn.ConvTranspose2d(embed_dim, embed_dim, 2, 2) | |||
self.norm1 = nn.SyncBatchNorm(embed_dim) | |||
self.norm2 = nn.SyncBatchNorm(embed_dim) | |||
self.norm3 = nn.SyncBatchNorm(embed_dim) | |||
self.norm4 = nn.SyncBatchNorm(embed_dim) | |||
self.up.apply(self._init_weights) | |||
self.spm.apply(self._init_weights) | |||
self.interactions.apply(self._init_weights) | |||
self.apply(self._init_deform_weights) | |||
normal_(self.level_embed) | |||
def _init_weights(self, m): | |||
if isinstance(m, nn.Linear): | |||
trunc_normal_(m.weight, std=.02) | |||
if isinstance(m, nn.Linear) and m.bias is not None: | |||
nn.init.constant_(m.bias, 0) | |||
elif isinstance(m, nn.LayerNorm) or isinstance(m, nn.BatchNorm2d): | |||
nn.init.constant_(m.bias, 0) | |||
nn.init.constant_(m.weight, 1.0) | |||
elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d): | |||
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels | |||
fan_out //= m.groups | |||
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) | |||
if m.bias is not None: | |||
m.bias.data.zero_() | |||
def _get_pos_embed(self, pos_embed, H, W): | |||
pos_embed = pos_embed.reshape(1, self.pretrain_size[0] // 16, | |||
self.pretrain_size[1] // 16, | |||
-1).permute(0, 3, 1, 2) | |||
pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \ | |||
reshape(1, -1, H * W).permute(0, 2, 1) | |||
return pos_embed | |||
def _init_deform_weights(self, m): | |||
if isinstance(m, MultiScaleDeformableAttention): | |||
m.init_weights() | |||
def _add_level_embed(self, c2, c3, c4): | |||
c2 = c2 + self.level_embed[0] | |||
c3 = c3 + self.level_embed[1] | |||
c4 = c4 + self.level_embed[2] | |||
return c2, c3, c4 | |||
def forward(self, x): | |||
deform_inputs1, deform_inputs2 = deform_inputs(x) | |||
# SPM forward | |||
c1, c2, c3, c4 = self.spm(x) | |||
c2, c3, c4 = self._add_level_embed(c2, c3, c4) | |||
c = torch.cat([c2, c3, c4], dim=1) | |||
# Patch Embedding forward | |||
x, H, W = self.patch_embed(x) | |||
bs, n, dim = x.shape | |||
cls = self.cls_token.expand( | |||
bs, -1, -1) # stole cls_tokens impl from Phil Wang, thanks | |||
if self.pos_embed is not None: | |||
pos_embed = self._get_pos_embed(self.pos_embed, H, W) | |||
x = x + pos_embed | |||
x = self.pos_drop(x) | |||
# Interaction | |||
outs = list() | |||
for i, layer in enumerate(self.interactions): | |||
indexes = self.interaction_indexes[i] | |||
x, c, cls = layer(x, c, cls, | |||
self.blocks[indexes[0]:indexes[-1] + 1], | |||
deform_inputs1, deform_inputs2, H, W) | |||
outs.append(x.transpose(1, 2).view(bs, dim, H, W).contiguous()) | |||
# Split & Reshape | |||
c2 = c[:, 0:c2.size(1), :] | |||
c3 = c[:, c2.size(1):c2.size(1) + c3.size(1), :] | |||
c4 = c[:, c2.size(1) + c3.size(1):, :] | |||
c2 = c2.transpose(1, 2).view(bs, dim, H * 2, W * 2).contiguous() | |||
c3 = c3.transpose(1, 2).view(bs, dim, H, W).contiguous() | |||
c4 = c4.transpose(1, 2).view(bs, dim, H // 2, W // 2).contiguous() | |||
c1 = self.up(c2) + c1 | |||
if self.add_vit_feature: | |||
x1, x2, x3, x4 = outs | |||
x1 = F.interpolate( | |||
x1, scale_factor=4, mode='bilinear', align_corners=False) | |||
x2 = F.interpolate( | |||
x2, scale_factor=2, mode='bilinear', align_corners=False) | |||
x4 = F.interpolate( | |||
x4, scale_factor=0.5, mode='bilinear', align_corners=False) | |||
c1, c2, c3, c4 = c1 + x1, c2 + x2, c3 + x3, c4 + x4 | |||
# Final Norm | |||
f1 = self.norm1(c1) | |||
f2 = self.norm2(c2) | |||
f3 = self.norm3(c3) | |||
f4 = self.norm4(c4) | |||
return [f1, f2, f3, f4] |
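# Standalone illustration (the concrete numbers are assumptions, not taken from this diff): | |||
# `interaction_indexes` partitions the BEiT blocks among the interaction stages, and each stage runs | |||
# blocks[start:end + 1] between its injector and extractor. The adapter then returns [f1, f2, f3, f4], | |||
# a stride-4/8/16/32 pyramid with embed_dim channels at every level. | |||
interaction_indexes = [[0, 5], [6, 11], [12, 17], [18, 23]]   # hypothetical split of a 24-block BEiT | |||
blocks = list(range(24))                                      # stand-in for the nn.ModuleList of Blocks | |||
for start, end in interaction_indexes: | |||
    assert blocks[start:end + 1] == list(range(start, end + 1)) | |||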
@@ -0,0 +1,3 @@ | |||
from .mask2former_head_from_mmseg import Mask2FormerHeadFromMMSeg | |||
__all__ = ['Mask2FormerHeadFromMMSeg'] |
@@ -0,0 +1,267 @@ | |||
# The implementation refers to ViT-Adapter | |||
# available at | |||
# https://github.com/czczup/ViT-Adapter.git | |||
from abc import ABCMeta, abstractmethod | |||
import torch | |||
import torch.nn as nn | |||
from mmcv.runner import BaseModule, auto_fp16, force_fp32 | |||
from mmdet.models.builder import build_loss | |||
from mmdet.models.losses import accuracy | |||
from ...utils import build_pixel_sampler, seg_resize | |||
class BaseDecodeHead(BaseModule, metaclass=ABCMeta): | |||
"""Base class for BaseDecodeHead. | |||
Args: | |||
in_channels (int|Sequence[int]): Input channels. | |||
channels (int): Channels after modules, before conv_seg. | |||
num_classes (int): Number of classes. | |||
dropout_ratio (float): Ratio of dropout layer. Default: 0.1. | |||
conv_cfg (dict|None): Config of conv layers. Default: None. | |||
norm_cfg (dict|None): Config of norm layers. Default: None. | |||
act_cfg (dict): Config of activation layers. | |||
Default: dict(type='ReLU') | |||
in_index (int|Sequence[int]): Input feature index. Default: -1 | |||
input_transform (str|None): Transformation type of input features. | |||
Options: 'resize_concat', 'multiple_select', None. | |||
'resize_concat': Multiple feature maps will be resized to the | |||
same size as the first one and then concatenated together. | |||
Usually used in the FCN head of HRNet. | |||
'multiple_select': Multiple feature maps will be bundled into | |||
a list and passed into the decode head. | |||
None: Only a single selected feature map is allowed. | |||
Default: None. | |||
loss_decode (dict | Sequence[dict]): Config of decode loss. | |||
The `loss_name` is a property of the corresponding loss function | |||
and is shown in the training log. If you want this loss | |||
item to be included in the backward graph, `loss_` must be the | |||
prefix of its name. Defaults to 'loss_ce'. | |||
e.g. dict(type='CrossEntropyLoss'), | |||
[dict(type='CrossEntropyLoss', loss_name='loss_ce'), | |||
dict(type='DiceLoss', loss_name='loss_dice')] | |||
Default: dict(type='CrossEntropyLoss'). | |||
ignore_index (int | None): The label index to be ignored. When using | |||
masked BCE loss, ignore_index should be set to None. Default: 255. | |||
sampler (dict|None): The config of segmentation map sampler. | |||
Default: None. | |||
align_corners (bool): align_corners argument of F.interpolate. | |||
Default: False. | |||
init_cfg (dict or list[dict], optional): Initialization config dict. | |||
""" | |||
def __init__(self, | |||
in_channels, | |||
channels, | |||
*, | |||
num_classes, | |||
dropout_ratio=0.1, | |||
conv_cfg=None, | |||
norm_cfg=None, | |||
act_cfg=dict(type='ReLU'), | |||
in_index=-1, | |||
input_transform=None, | |||
loss_decode=dict( | |||
type='CrossEntropyLoss', | |||
use_sigmoid=False, | |||
loss_weight=1.0), | |||
ignore_index=255, | |||
sampler=None, | |||
align_corners=False, | |||
init_cfg=dict( | |||
type='Normal', std=0.01, override=dict(name='conv_seg'))): | |||
super(BaseDecodeHead, self).__init__(init_cfg) | |||
self._init_inputs(in_channels, in_index, input_transform) | |||
self.channels = channels | |||
self.num_classes = num_classes | |||
self.dropout_ratio = dropout_ratio | |||
self.conv_cfg = conv_cfg | |||
self.norm_cfg = norm_cfg | |||
self.act_cfg = act_cfg | |||
self.in_index = in_index | |||
self.ignore_index = ignore_index | |||
self.align_corners = align_corners | |||
if isinstance(loss_decode, dict): | |||
self.loss_decode = build_loss(loss_decode) | |||
elif isinstance(loss_decode, (list, tuple)): | |||
self.loss_decode = nn.ModuleList() | |||
for loss in loss_decode: | |||
self.loss_decode.append(build_loss(loss)) | |||
else: | |||
raise TypeError(f'loss_decode must be a dict or sequence of dict,\ | |||
but got {type(loss_decode)}') | |||
if sampler is not None: | |||
self.sampler = build_pixel_sampler(sampler, context=self) | |||
else: | |||
self.sampler = None | |||
self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) | |||
if dropout_ratio > 0: | |||
self.dropout = nn.Dropout2d(dropout_ratio) | |||
else: | |||
self.dropout = None | |||
self.fp16_enabled = False | |||
def extra_repr(self): | |||
"""Extra repr.""" | |||
s = f'input_transform={self.input_transform}, ' \ | |||
f'ignore_index={self.ignore_index}, ' \ | |||
f'align_corners={self.align_corners}' | |||
return s | |||
def _init_inputs(self, in_channels, in_index, input_transform): | |||
"""Check and initialize input transforms. | |||
The in_channels, in_index and input_transform must match. | |||
Specifically, when input_transform is None, only a single feature map | |||
will be selected, so in_channels and in_index must be of type int. | |||
When input_transform is not None, in_channels and in_index must be | |||
sequences (list or tuple) of the same length. | |||
Args: | |||
in_channels (int|Sequence[int]): Input channels. | |||
in_index (int|Sequence[int]): Input feature index. | |||
input_transform (str|None): Transformation type of input features. | |||
Options: 'resize_concat', 'multiple_select', None. | |||
'resize_concat': Multiple feature maps will be resized to the | |||
same size as the first one and then concatenated together. | |||
Usually used in the FCN head of HRNet. | |||
'multiple_select': Multiple feature maps will be bundled into | |||
a list and passed into the decode head. | |||
None: Only a single selected feature map is allowed. | |||
""" | |||
if input_transform is not None: | |||
assert input_transform in ['resize_concat', 'multiple_select'] | |||
self.input_transform = input_transform | |||
self.in_index = in_index | |||
if input_transform is not None: | |||
assert isinstance(in_channels, (list, tuple)) | |||
assert isinstance(in_index, (list, tuple)) | |||
assert len(in_channels) == len(in_index) | |||
if input_transform == 'resize_concat': | |||
self.in_channels = sum(in_channels) | |||
else: | |||
self.in_channels = in_channels | |||
else: | |||
assert isinstance(in_channels, int) | |||
assert isinstance(in_index, int) | |||
self.in_channels = in_channels | |||
def _transform_inputs(self, inputs): | |||
"""Transform inputs for decoder. | |||
Args: | |||
inputs (list[Tensor]): List of multi-level img features. | |||
Returns: | |||
Tensor: The transformed inputs | |||
""" | |||
if self.input_transform == 'resize_concat': | |||
inputs = [inputs[i] for i in self.in_index] | |||
upsampled_inputs = [ | |||
seg_resize( | |||
input=x, | |||
size=inputs[0].shape[2:], | |||
mode='bilinear', | |||
align_corners=self.align_corners) for x in inputs | |||
] | |||
inputs = torch.cat(upsampled_inputs, dim=1) | |||
elif self.input_transform == 'multiple_select': | |||
inputs = [inputs[i] for i in self.in_index] | |||
else: | |||
inputs = inputs[self.in_index] | |||
return inputs | |||
@auto_fp16() | |||
@abstractmethod | |||
def forward(self, inputs): | |||
"""Placeholder of forward function.""" | |||
pass | |||
def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg): | |||
"""Forward function for training. | |||
Args: | |||
inputs (list[Tensor]): List of multi-level img features. | |||
img_metas (list[dict]): List of image info dict where each dict | |||
has: 'img_shape', 'scale_factor', 'flip', and may also contain | |||
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
For details on the values of these keys see | |||
`mmseg/datasets/pipelines/formatting.py:Collect`. | |||
gt_semantic_seg (Tensor): Semantic segmentation masks | |||
used if the architecture supports semantic segmentation task. | |||
train_cfg (dict): The training config. | |||
Returns: | |||
dict[str, Tensor]: a dictionary of loss components | |||
""" | |||
seg_logits = self.forward(inputs) | |||
losses = self.losses(seg_logits, gt_semantic_seg) | |||
return losses | |||
def forward_test(self, inputs, img_metas, test_cfg): | |||
"""Forward function for testing. | |||
Args: | |||
inputs (list[Tensor]): List of multi-level img features. | |||
img_metas (list[dict]): List of image info dict where each dict | |||
has: 'img_shape', 'scale_factor', 'flip', and may also contain | |||
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
For details on the values of these keys see | |||
`mmseg/datasets/pipelines/formatting.py:Collect`. | |||
test_cfg (dict): The testing config. | |||
Returns: | |||
Tensor: Output segmentation map. | |||
""" | |||
return self.forward(inputs) | |||
def cls_seg(self, feat): | |||
"""Classify each pixel.""" | |||
if self.dropout is not None: | |||
feat = self.dropout(feat) | |||
output = self.conv_seg(feat) | |||
return output | |||
@force_fp32(apply_to=('seg_logit', )) | |||
def losses(self, seg_logit, seg_label): | |||
"""Compute segmentation loss.""" | |||
loss = dict() | |||
seg_logit = seg_resize( | |||
input=seg_logit, | |||
size=seg_label.shape[2:], | |||
mode='bilinear', | |||
align_corners=self.align_corners) | |||
if self.sampler is not None: | |||
seg_weight = self.sampler.sample(seg_logit, seg_label) | |||
else: | |||
seg_weight = None | |||
seg_label = seg_label.squeeze(1) | |||
if not isinstance(self.loss_decode, nn.ModuleList): | |||
losses_decode = [self.loss_decode] | |||
else: | |||
losses_decode = self.loss_decode | |||
for loss_decode in losses_decode: | |||
if loss_decode.loss_name not in loss: | |||
loss[loss_decode.loss_name] = loss_decode( | |||
seg_logit, | |||
seg_label, | |||
weight=seg_weight, | |||
ignore_index=self.ignore_index) | |||
else: | |||
loss[loss_decode.loss_name] += loss_decode( | |||
seg_logit, | |||
seg_label, | |||
weight=seg_weight, | |||
ignore_index=self.ignore_index) | |||
loss['acc_seg'] = accuracy( | |||
seg_logit, seg_label, ignore_index=self.ignore_index) | |||
return loss |
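# Illustrative configuration for a decode head built on BaseDecodeHead, following the docstring above. | |||
# The values are examples only, not taken from this repository's configs. With | |||
# input_transform='multiple_select', in_channels and in_index must be sequences of equal length, and each | |||
# entry in loss_decode carries a distinct, `loss_`-prefixed loss_name so it appears as its own loss term. | |||
decode_head_cfg = dict( | |||
    in_channels=[256, 256, 256, 256], | |||
    in_index=[0, 1, 2, 3], | |||
    input_transform='multiple_select', | |||
    channels=256, | |||
    num_classes=150, | |||
    dropout_ratio=0.1, | |||
    loss_decode=[ | |||
        dict(type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0, loss_name='loss_ce'), | |||
        dict(type='DiceLoss', loss_weight=1.0, loss_name='loss_dice'), | |||
    ], | |||
    ignore_index=255, | |||
    align_corners=False, | |||
) | |||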
@@ -0,0 +1,581 @@ | |||
# The implementation refers to ViT-Adapter | |||
# available at | |||
# https://github.com/czczup/ViT-Adapter.git | |||
import copy | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from mmcv.cnn import Conv2d, build_plugin_layer, caffe2_xavier_init | |||
from mmcv.cnn.bricks.transformer import (build_positional_encoding, | |||
build_transformer_layer_sequence) | |||
from mmcv.ops import point_sample | |||
from mmcv.runner import ModuleList, force_fp32 | |||
from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean | |||
from mmdet.models.builder import HEADS, build_loss | |||
from mmdet.models.utils import get_uncertain_point_coords_with_randomness | |||
from .base_decode_head import BaseDecodeHead | |||
@HEADS.register_module() | |||
class Mask2FormerHeadFromMMSeg(BaseDecodeHead): | |||
"""Implements the Mask2Former head. | |||
See `Masked-attention Mask Transformer for Universal Image | |||
Segmentation <https://arxiv.org/pdf/2112.01527>`_ for details. | |||
Args: | |||
in_channels (list[int]): Number of channels in the input feature map. | |||
feat_channels (int): Number of channels for features. | |||
out_channels (int): Number of channels for output. | |||
num_things_classes (int): Number of thing classes. | |||
num_stuff_classes (int): Number of stuff classes. | |||
num_queries (int): Number of queries in the Transformer decoder. | |||
pixel_decoder (:obj:`mmcv.ConfigDict` | dict): Config for pixel | |||
decoder. Defaults to None. | |||
enforce_decoder_input_project (bool, optional): Whether to add | |||
a layer to change the embed_dim of the transformer encoder in | |||
pixel decoder to the embed_dim of transformer decoder. | |||
Defaults to False. | |||
transformer_decoder (:obj:`mmcv.ConfigDict` | dict): Config for | |||
transformer decoder. Defaults to None. | |||
positional_encoding (:obj:`mmcv.ConfigDict` | dict): Config for | |||
transformer decoder position encoding. Defaults to None. | |||
loss_cls (:obj:`mmcv.ConfigDict` | dict): Config of the classification | |||
loss. Defaults to None. | |||
loss_mask (:obj:`mmcv.ConfigDict` | dict): Config of the mask loss. | |||
Defaults to None. | |||
loss_dice (:obj:`mmcv.ConfigDict` | dict): Config of the dice loss. | |||
Defaults to None. | |||
train_cfg (:obj:`mmcv.ConfigDict` | dict): Training config of | |||
Mask2Former head. | |||
test_cfg (:obj:`mmcv.ConfigDict` | dict): Testing config of | |||
Mask2Former head. | |||
init_cfg (dict or list[dict], optional): Initialization config dict. | |||
Defaults to None. | |||
""" | |||
def __init__(self, | |||
in_channels, | |||
feat_channels, | |||
out_channels, | |||
num_things_classes=80, | |||
num_stuff_classes=53, | |||
num_queries=100, | |||
num_transformer_feat_level=3, | |||
pixel_decoder=None, | |||
enforce_decoder_input_project=False, | |||
transformer_decoder=None, | |||
positional_encoding=None, | |||
loss_cls=None, | |||
loss_mask=None, | |||
loss_dice=None, | |||
train_cfg=None, | |||
test_cfg=None, | |||
init_cfg=None, | |||
**kwargs): | |||
super(Mask2FormerHeadFromMMSeg, self).__init__( | |||
in_channels=in_channels, | |||
channels=feat_channels, | |||
num_classes=(num_things_classes + num_stuff_classes), | |||
init_cfg=init_cfg, | |||
input_transform='multiple_select', | |||
**kwargs) | |||
self.num_things_classes = num_things_classes | |||
self.num_stuff_classes = num_stuff_classes | |||
self.num_classes = self.num_things_classes + self.num_stuff_classes | |||
self.num_queries = num_queries | |||
self.num_transformer_feat_level = num_transformer_feat_level | |||
self.num_heads = transformer_decoder.transformerlayers. \ | |||
attn_cfgs.num_heads | |||
self.num_transformer_decoder_layers = transformer_decoder.num_layers | |||
assert pixel_decoder.encoder.transformerlayers.attn_cfgs.num_levels == num_transformer_feat_level | |||
pixel_decoder_ = copy.deepcopy(pixel_decoder) | |||
pixel_decoder_.update( | |||
in_channels=in_channels, | |||
feat_channels=feat_channels, | |||
out_channels=out_channels) | |||
self.pixel_decoder = build_plugin_layer(pixel_decoder_)[1] | |||
self.transformer_decoder = build_transformer_layer_sequence( | |||
transformer_decoder) | |||
self.decoder_embed_dims = self.transformer_decoder.embed_dims | |||
self.decoder_input_projs = ModuleList() | |||
# from low resolution to high resolution | |||
for _ in range(num_transformer_feat_level): | |||
if (self.decoder_embed_dims != feat_channels | |||
or enforce_decoder_input_project): | |||
self.decoder_input_projs.append( | |||
Conv2d( | |||
feat_channels, self.decoder_embed_dims, kernel_size=1)) | |||
else: | |||
self.decoder_input_projs.append(nn.Identity()) | |||
self.decoder_positional_encoding = build_positional_encoding( | |||
positional_encoding) | |||
self.query_embed = nn.Embedding(self.num_queries, feat_channels) | |||
self.query_feat = nn.Embedding(self.num_queries, feat_channels) | |||
# from low resolution to high resolution | |||
self.level_embed = nn.Embedding(self.num_transformer_feat_level, | |||
feat_channels) | |||
self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) | |||
self.mask_embed = nn.Sequential( | |||
nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), | |||
nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), | |||
nn.Linear(feat_channels, out_channels)) | |||
self.conv_seg = None # fix a bug here (conv_seg is not used) | |||
self.test_cfg = test_cfg | |||
self.train_cfg = train_cfg | |||
if train_cfg: | |||
self.assigner = build_assigner(self.train_cfg.assigner) | |||
self.sampler = build_sampler(self.train_cfg.sampler, context=self) | |||
self.num_points = self.train_cfg.get('num_points', 12544) | |||
self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0) | |||
self.importance_sample_ratio = self.train_cfg.get( | |||
'importance_sample_ratio', 0.75) | |||
self.class_weight = loss_cls.class_weight | |||
self.loss_cls = build_loss(loss_cls) | |||
self.loss_mask = build_loss(loss_mask) | |||
self.loss_dice = build_loss(loss_dice) | |||
def init_weights(self): | |||
for m in self.decoder_input_projs: | |||
if isinstance(m, Conv2d): | |||
caffe2_xavier_init(m, bias=0) | |||
self.pixel_decoder.init_weights() | |||
for p in self.transformer_decoder.parameters(): | |||
if p.dim() > 1: | |||
nn.init.xavier_normal_(p) | |||
def get_targets(self, cls_scores_list, mask_preds_list, gt_labels_list, | |||
gt_masks_list, img_metas): | |||
"""Compute classification and mask targets for all images for a decoder | |||
layer. | |||
Args: | |||
cls_scores_list (list[Tensor]): Mask score logits from a single | |||
decoder layer for all images. Each with shape [num_queries, | |||
cls_out_channels]. | |||
mask_preds_list (list[Tensor]): Mask logits from a single decoder | |||
layer for all images. Each with shape [num_queries, h, w]. | |||
gt_labels_list (list[Tensor]): Ground truth class indices for all | |||
images. Each with shape (n, ), where n is the sum of the number of | |||
stuff types and the number of instances in an image. | |||
gt_masks_list (list[Tensor]): Ground truth mask for each image, | |||
each with shape (n, h, w). | |||
img_metas (list[dict]): List of image meta information. | |||
Returns: | |||
tuple[list[Tensor]]: a tuple containing the following targets. | |||
- labels_list (list[Tensor]): Labels of all images. | |||
Each with shape [num_queries, ]. | |||
- label_weights_list (list[Tensor]): Label weights of all | |||
images. Each with shape [num_queries, ]. | |||
- mask_targets_list (list[Tensor]): Mask targets of all images. | |||
Each with shape [num_queries, h, w]. | |||
- mask_weights_list (list[Tensor]): Mask weights of all images. | |||
Each with shape [num_queries, ]. | |||
- num_total_pos (int): Number of positive samples in all | |||
images. | |||
- num_total_neg (int): Number of negative samples in all | |||
images. | |||
""" | |||
(labels_list, label_weights_list, mask_targets_list, mask_weights_list, | |||
pos_inds_list, | |||
neg_inds_list) = multi_apply(self._get_target_single, cls_scores_list, | |||
mask_preds_list, gt_labels_list, | |||
gt_masks_list, img_metas) | |||
num_total_pos = sum((inds.numel() for inds in pos_inds_list)) | |||
num_total_neg = sum((inds.numel() for inds in neg_inds_list)) | |||
return (labels_list, label_weights_list, mask_targets_list, | |||
mask_weights_list, num_total_pos, num_total_neg) | |||
def _get_target_single(self, cls_score, mask_pred, gt_labels, gt_masks, | |||
img_metas): | |||
"""Compute classification and mask targets for one image. | |||
Args: | |||
cls_score (Tensor): Mask score logits from a single decoder layer | |||
for one image. Shape (num_queries, cls_out_channels). | |||
mask_pred (Tensor): Mask logits for a single decoder layer for one | |||
image. Shape (num_queries, h, w). | |||
gt_labels (Tensor): Ground truth class indices for one image with | |||
shape (num_gts, ). | |||
gt_masks (Tensor): Ground truth mask for each image, each with | |||
shape (num_gts, h, w). | |||
img_metas (dict): Image information. | |||
Returns: | |||
tuple[Tensor]: A tuple containing the following for one image. | |||
- labels (Tensor): Labels of each image. \ | |||
shape (num_queries, ). | |||
- label_weights (Tensor): Label weights of each image. \ | |||
shape (num_queries, ). | |||
- mask_targets (Tensor): Mask targets of each image. \ | |||
shape (num_queries, h, w). | |||
- mask_weights (Tensor): Mask weights of each image. \ | |||
shape (num_queries, ). | |||
- pos_inds (Tensor): Sampled positive indices for each \ | |||
image. | |||
- neg_inds (Tensor): Sampled negative indices for each \ | |||
image. | |||
""" | |||
# sample points | |||
num_queries = cls_score.shape[0] | |||
num_gts = gt_labels.shape[0] | |||
point_coords = torch.rand((1, self.num_points, 2), | |||
device=cls_score.device) | |||
# shape (num_queries, num_points) | |||
mask_points_pred = point_sample( | |||
mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1, | |||
1)).squeeze(1) | |||
# shape (num_gts, num_points) | |||
gt_points_masks = point_sample( | |||
gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1, | |||
1)).squeeze(1) | |||
# assign and sample | |||
assign_result = self.assigner.assign(cls_score, mask_points_pred, | |||
gt_labels, gt_points_masks, | |||
img_metas) | |||
sampling_result = self.sampler.sample(assign_result, mask_pred, | |||
gt_masks) | |||
pos_inds = sampling_result.pos_inds | |||
neg_inds = sampling_result.neg_inds | |||
# label target | |||
labels = gt_labels.new_full((self.num_queries, ), | |||
self.num_classes, | |||
dtype=torch.long) | |||
labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] | |||
label_weights = gt_labels.new_ones((self.num_queries, )) | |||
# mask target | |||
mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] | |||
mask_weights = mask_pred.new_zeros((self.num_queries, )) | |||
mask_weights[pos_inds] = 1.0 | |||
return (labels, label_weights, mask_targets, mask_weights, pos_inds, | |||
neg_inds) | |||
def loss_single(self, cls_scores, mask_preds, gt_labels_list, | |||
gt_masks_list, img_metas): | |||
"""Loss function for outputs from a single decoder layer. | |||
Args: | |||
cls_scores (Tensor): Mask score logits from a single decoder layer | |||
for all images. Shape (batch_size, num_queries, | |||
cls_out_channels). Note `cls_out_channels` should include the | |||
background. | |||
mask_preds (Tensor): Mask logits from a single decoder layer for all | |||
images. Shape (batch_size, num_queries, h, w). | |||
gt_labels_list (list[Tensor]): Ground truth class indices for each | |||
image, each with shape (num_gts, ). | |||
gt_masks_list (list[Tensor]): Ground truth mask for each image, | |||
each with shape (num_gts, h, w). | |||
img_metas (list[dict]): List of image meta information. | |||
Returns: | |||
tuple[Tensor]: Loss components for outputs from a single \ | |||
decoder layer. | |||
""" | |||
num_imgs = cls_scores.size(0) | |||
cls_scores_list = [cls_scores[i] for i in range(num_imgs)] | |||
mask_preds_list = [mask_preds[i] for i in range(num_imgs)] | |||
(labels_list, label_weights_list, mask_targets_list, mask_weights_list, | |||
num_total_pos, | |||
num_total_neg) = self.get_targets(cls_scores_list, mask_preds_list, | |||
gt_labels_list, gt_masks_list, | |||
img_metas) | |||
# shape (batch_size, num_queries) | |||
labels = torch.stack(labels_list, dim=0) | |||
# shape (batch_size, num_queries) | |||
label_weights = torch.stack(label_weights_list, dim=0) | |||
# shape (num_total_gts, h, w) | |||
mask_targets = torch.cat(mask_targets_list, dim=0) | |||
# shape (batch_size, num_queries) | |||
mask_weights = torch.stack(mask_weights_list, dim=0) | |||
# classification loss | |||
# shape (batch_size * num_queries, ) | |||
cls_scores = cls_scores.flatten(0, 1) | |||
labels = labels.flatten(0, 1) | |||
label_weights = label_weights.flatten(0, 1) | |||
class_weight = cls_scores.new_tensor(self.class_weight) | |||
loss_cls = self.loss_cls( | |||
cls_scores, | |||
labels, | |||
label_weights, | |||
avg_factor=class_weight[labels].sum()) | |||
num_total_masks = reduce_mean(cls_scores.new_tensor([num_total_pos])) | |||
num_total_masks = max(num_total_masks, 1) | |||
# extract positive ones | |||
# shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w) | |||
mask_preds = mask_preds[mask_weights > 0] | |||
if mask_targets.shape[0] == 0: | |||
# zero match | |||
loss_dice = mask_preds.sum() | |||
loss_mask = mask_preds.sum() | |||
return loss_cls, loss_mask, loss_dice | |||
with torch.no_grad(): | |||
points_coords = get_uncertain_point_coords_with_randomness( | |||
mask_preds.unsqueeze(1), None, self.num_points, | |||
self.oversample_ratio, self.importance_sample_ratio) | |||
# shape (num_total_gts, h, w) -> (num_total_gts, num_points) | |||
mask_point_targets = point_sample( | |||
mask_targets.unsqueeze(1).float(), points_coords).squeeze(1) | |||
# shape (num_queries, h, w) -> (num_queries, num_points) | |||
mask_point_preds = point_sample( | |||
mask_preds.unsqueeze(1), points_coords).squeeze(1) | |||
# dice loss | |||
loss_dice = self.loss_dice( | |||
mask_point_preds, mask_point_targets, avg_factor=num_total_masks) | |||
# mask loss | |||
# shape (num_queries, num_points) -> (num_queries * num_points, ) | |||
mask_point_preds = mask_point_preds.reshape(-1, 1) | |||
# shape (num_total_gts, num_points) -> (num_total_gts * num_points, ) | |||
mask_point_targets = mask_point_targets.reshape(-1) | |||
loss_mask = self.loss_mask( | |||
mask_point_preds, | |||
mask_point_targets, | |||
avg_factor=num_total_masks * self.num_points) | |||
return loss_cls, loss_mask, loss_dice | |||
@force_fp32(apply_to=('all_cls_scores', 'all_mask_preds')) | |||
def loss(self, all_cls_scores, all_mask_preds, gt_labels_list, | |||
gt_masks_list, img_metas): | |||
"""Loss function. | |||
Args: | |||
all_cls_scores (Tensor): Classification scores for all decoder | |||
layers with shape [num_decoder, batch_size, num_queries, | |||
cls_out_channels]. | |||
all_mask_preds (Tensor): Mask scores for all decoder layers with | |||
shape [num_decoder, batch_size, num_queries, h, w]. | |||
gt_labels_list (list[Tensor]): Ground truth class indices for each | |||
image with shape (n, ). n is the sum of the number of stuff types | |||
and the number of instances in an image. | |||
gt_masks_list (list[Tensor]): Ground truth mask for each image with | |||
shape (n, h, w). | |||
img_metas (list[dict]): List of image meta information. | |||
Returns: | |||
dict[str, Tensor]: A dictionary of loss components. | |||
""" | |||
num_dec_layers = len(all_cls_scores) | |||
all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] | |||
all_gt_masks_list = [gt_masks_list for _ in range(num_dec_layers)] | |||
img_metas_list = [img_metas for _ in range(num_dec_layers)] | |||
losses_cls, losses_mask, losses_dice = multi_apply( | |||
self.loss_single, all_cls_scores, all_mask_preds, | |||
all_gt_labels_list, all_gt_masks_list, img_metas_list) | |||
loss_dict = dict() | |||
# loss from the last decoder layer | |||
loss_dict['loss_cls'] = losses_cls[-1] | |||
loss_dict['loss_mask'] = losses_mask[-1] | |||
loss_dict['loss_dice'] = losses_dice[-1] | |||
# loss from other decoder layers | |||
num_dec_layer = 0 | |||
for loss_cls_i, loss_mask_i, loss_dice_i in zip( | |||
losses_cls[:-1], losses_mask[:-1], losses_dice[:-1]): | |||
loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i | |||
loss_dict[f'd{num_dec_layer}.loss_mask'] = loss_mask_i | |||
loss_dict[f'd{num_dec_layer}.loss_dice'] = loss_dice_i | |||
num_dec_layer += 1 | |||
return loss_dict | |||
def forward_head(self, decoder_out, mask_feature, attn_mask_target_size): | |||
"""Forward for head part which is called after every decoder layer. | |||
Args: | |||
decoder_out (Tensor): in shape (num_queries, batch_size, c). | |||
mask_feature (Tensor): in shape (batch_size, c, h, w). | |||
attn_mask_target_size (tuple[int, int]): target attention | |||
mask size. | |||
Returns: | |||
tuple: A tuple containing three elements. | |||
- cls_pred (Tensor): Classification scores in shape \ | |||
(batch_size, num_queries, cls_out_channels). \ | |||
Note `cls_out_channels` should include the background class. | |||
- mask_pred (Tensor): Mask scores in shape \ | |||
(batch_size, num_queries, h, w). | |||
- attn_mask (Tensor): Attention mask in shape \ | |||
(batch_size * num_heads, num_queries, h, w). | |||
""" | |||
decoder_out = self.transformer_decoder.post_norm(decoder_out) | |||
decoder_out = decoder_out.transpose(0, 1) | |||
# shape (batch_size, num_queries, c) | |||
cls_pred = self.cls_embed(decoder_out) | |||
# shape (batch_size, num_queries, c) | |||
mask_embed = self.mask_embed(decoder_out) | |||
# shape (batch_size, num_queries, h, w) | |||
mask_pred = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_feature) | |||
attn_mask = F.interpolate( | |||
mask_pred, | |||
attn_mask_target_size, | |||
mode='bilinear', | |||
align_corners=False) | |||
# shape (batch_size, num_queries, h, w) -> | |||
# (batch_size * num_heads, num_queries, h*w) | |||
attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat( | |||
(1, self.num_heads, 1, 1)).flatten(0, 1) | |||
attn_mask = attn_mask.sigmoid() < 0.5 | |||
attn_mask = attn_mask.detach() | |||
return cls_pred, mask_pred, attn_mask | |||
def forward(self, feats, img_metas): | |||
"""Forward function. | |||
Args: | |||
feats (list[Tensor]): Multi scale Features from the | |||
upstream network, each is a 4D-tensor. | |||
img_metas (list[dict]): List of image information. | |||
Returns: | |||
tuple: A tuple containing two elements. | |||
- cls_pred_list (list[Tensor]): Classification logits \ | |||
for each decoder layer. Each is a 3D-tensor with shape \ | |||
(batch_size, num_queries, cls_out_channels). \ | |||
Note `cls_out_channels` should include the background class. | |||
- mask_pred_list (list[Tensor]): Mask logits for each \ | |||
decoder layer. Each with shape (batch_size, num_queries, \ | |||
h, w). | |||
""" | |||
batch_size = len(img_metas) | |||
mask_features, multi_scale_memorys = self.pixel_decoder(feats) | |||
# multi_scale_memorys (from low resolution to high resolution) | |||
decoder_inputs = [] | |||
decoder_positional_encodings = [] | |||
for i in range(self.num_transformer_feat_level): | |||
decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i]) | |||
# shape (batch_size, c, h, w) -> (h*w, batch_size, c) | |||
decoder_input = decoder_input.flatten(2).permute(2, 0, 1) | |||
level_embed = self.level_embed.weight[i].view(1, 1, -1) | |||
decoder_input = decoder_input + level_embed | |||
# shape (batch_size, c, h, w) -> (h*w, batch_size, c) | |||
mask = decoder_input.new_zeros( | |||
(batch_size, ) + multi_scale_memorys[i].shape[-2:], | |||
dtype=torch.bool) | |||
decoder_positional_encoding = self.decoder_positional_encoding( | |||
mask) | |||
decoder_positional_encoding = decoder_positional_encoding.flatten( | |||
2).permute(2, 0, 1) | |||
decoder_inputs.append(decoder_input) | |||
decoder_positional_encodings.append(decoder_positional_encoding) | |||
# shape (num_queries, c) -> (num_queries, batch_size, c) | |||
query_feat = self.query_feat.weight.unsqueeze(1).repeat( | |||
(1, batch_size, 1)) | |||
query_embed = self.query_embed.weight.unsqueeze(1).repeat( | |||
(1, batch_size, 1)) | |||
cls_pred_list = [] | |||
mask_pred_list = [] | |||
cls_pred, mask_pred, attn_mask = self.forward_head( | |||
query_feat, mask_features, multi_scale_memorys[0].shape[-2:]) | |||
cls_pred_list.append(cls_pred) | |||
mask_pred_list.append(mask_pred) | |||
for i in range(self.num_transformer_decoder_layers): | |||
level_idx = i % self.num_transformer_feat_level | |||
# if a mask is all True(all background), then set it all False. | |||
attn_mask[torch.where( | |||
attn_mask.sum(-1) == attn_mask.shape[-1])] = False | |||
# cross_attn + self_attn | |||
layer = self.transformer_decoder.layers[i] | |||
attn_masks = [attn_mask, None] | |||
query_feat = layer( | |||
query=query_feat, | |||
key=decoder_inputs[level_idx], | |||
value=decoder_inputs[level_idx], | |||
query_pos=query_embed, | |||
key_pos=decoder_positional_encodings[level_idx], | |||
attn_masks=attn_masks, | |||
query_key_padding_mask=None, | |||
# here we do not apply masking on padded region | |||
key_padding_mask=None) | |||
cls_pred, mask_pred, attn_mask = self.forward_head( | |||
query_feat, mask_features, multi_scale_memorys[ | |||
(i + 1) % self.num_transformer_feat_level].shape[-2:]) | |||
cls_pred_list.append(cls_pred) | |||
mask_pred_list.append(mask_pred) | |||
return cls_pred_list, mask_pred_list | |||
def forward_train(self, x, img_metas, gt_semantic_seg, gt_labels, | |||
gt_masks): | |||
"""Forward function for training mode. | |||
Args: | |||
x (list[Tensor]): Multi-level features from the upstream network, | |||
each is a 4D-tensor. | |||
img_metas (list[Dict]): List of image information. | |||
gt_semantic_seg (list[Tensor]): Each element is the ground truth | |||
of semantic segmentation with the shape (N, H, W). | |||
train_cfg (dict): The training config, which is not used in | |||
MaskFormer. | |||
gt_labels (list[Tensor]): Each element is ground truth labels of | |||
each box, shape (num_gts,). | |||
gt_masks (list[BitmapMasks]): Each element is masks of instances | |||
of an image, shape (num_gts, h, w). | |||
Returns: | |||
dict[str, Tensor]: A dictionary of loss components. | |||
""" | |||
# forward | |||
all_cls_scores, all_mask_preds = self(x, img_metas) | |||
# loss | |||
losses = self.loss(all_cls_scores, all_mask_preds, gt_labels, gt_masks, | |||
img_metas) | |||
return losses | |||
def forward_test(self, inputs, img_metas, test_cfg): | |||
"""Test segment without test-time aumengtation. | |||
Only the output of last decoder layers was used. | |||
Args: | |||
inputs (list[Tensor]): Multi-level features from the | |||
upstream network, each is a 4D-tensor. | |||
img_metas (list[dict]): List of image information. | |||
test_cfg (dict): Testing config. | |||
Returns: | |||
seg_mask (Tensor): Predicted semantic segmentation logits. | |||
""" | |||
all_cls_scores, all_mask_preds = self(inputs, img_metas) | |||
cls_score, mask_pred = all_cls_scores[-1], all_mask_preds[-1] | |||
ori_h, ori_w, _ = img_metas[0]['ori_shape'] | |||
# semantic inference | |||
cls_score = F.softmax(cls_score, dim=-1)[..., :-1] | |||
mask_pred = mask_pred.sigmoid() | |||
seg_mask = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred) | |||
return seg_mask |
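For reference, a minimal sketch (not part of the patch; shapes and tensors are made up) of the semantic-inference step above: per-query class probabilities are combined with per-query mask probabilities via an einsum over the query dimension, producing per-class segmentation logits.

import torch
import torch.nn.functional as F

batch_size, num_queries, num_classes, h, w = 2, 100, 19, 64, 64
cls_score = torch.randn(batch_size, num_queries, num_classes + 1)  # last channel = background
mask_pred = torch.randn(batch_size, num_queries, h, w)

cls_prob = F.softmax(cls_score, dim=-1)[..., :-1]  # drop the background channel
mask_prob = mask_pred.sigmoid()
seg_mask = torch.einsum('bqc,bqhw->bchw', cls_prob, mask_prob)
assert seg_mask.shape == (batch_size, num_classes, h, w)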
@@ -0,0 +1,3 @@ | |||
from .encoder_decoder_mask2former import EncoderDecoderMask2Former | |||
__all__ = ['EncoderDecoderMask2Former'] |
@@ -0,0 +1,314 @@ | |||
# The implementation refers to the VitAdapter | |||
# available at | |||
# https://github.com/czczup/ViT-Adapter.git | |||
import warnings | |||
from abc import ABCMeta, abstractmethod | |||
from collections import OrderedDict | |||
import mmcv | |||
import numpy as np | |||
import torch | |||
import torch.distributed as dist | |||
from mmcv.runner import BaseModule, auto_fp16 | |||
class BaseSegmentor(BaseModule, metaclass=ABCMeta): | |||
"""Base class for segmentors.""" | |||
def __init__(self, init_cfg=None): | |||
super(BaseSegmentor, self).__init__(init_cfg) | |||
self.fp16_enabled = False | |||
@property | |||
def with_neck(self): | |||
"""bool: whether the segmentor has neck""" | |||
return hasattr(self, 'neck') and self.neck is not None | |||
@property | |||
def with_auxiliary_head(self): | |||
"""bool: whether the segmentor has auxiliary head""" | |||
return hasattr(self, | |||
'auxiliary_head') and self.auxiliary_head is not None | |||
@property | |||
def with_decode_head(self): | |||
"""bool: whether the segmentor has decode head""" | |||
return hasattr(self, 'decode_head') and self.decode_head is not None | |||
@abstractmethod | |||
def extract_feat(self, imgs): | |||
"""Placeholder for extract features from images.""" | |||
pass | |||
@abstractmethod | |||
def encode_decode(self, img, img_metas): | |||
"""Placeholder for encode images with backbone and decode into a | |||
semantic segmentation map of the same size as input.""" | |||
pass | |||
@abstractmethod | |||
def forward_train(self, imgs, img_metas, **kwargs): | |||
"""Placeholder for Forward function for training.""" | |||
pass | |||
@abstractmethod | |||
def simple_test(self, img, img_meta, **kwargs): | |||
"""Placeholder for single image test.""" | |||
pass | |||
@abstractmethod | |||
def aug_test(self, imgs, img_metas, **kwargs): | |||
"""Placeholder for augmentation test.""" | |||
pass | |||
def forward_test(self, imgs, img_metas, **kwargs): | |||
""" | |||
Args: | |||
imgs (List[Tensor]): the outer list indicates test-time | |||
augmentations and inner Tensor should have a shape NxCxHxW, | |||
which contains all images in the batch. | |||
img_metas (List[List[dict]]): the outer list indicates test-time | |||
augs (multiscale, flip, etc.) and the inner list indicates | |||
images in a batch. | |||
""" | |||
for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: | |||
if not isinstance(var, list): | |||
raise TypeError(f'{name} must be a list, but got ' | |||
f'{type(var)}') | |||
num_augs = len(imgs) | |||
if num_augs != len(img_metas): | |||
raise ValueError(f'num of augmentations ({len(imgs)}) != ' | |||
f'num of image meta ({len(img_metas)})') | |||
# all images in the same aug batch should share the same ori_shape, | |||
# img_shape and pad_shape | |||
def tensor_to_tuple(input_tensor): | |||
return tuple(input_tensor.cpu().numpy()) | |||
for img_meta in img_metas: | |||
ori_shapes = [_['ori_shape'] for _ in img_meta] | |||
if isinstance(ori_shapes[0], torch.Tensor): | |||
assert all( | |||
tensor_to_tuple(shape) == tensor_to_tuple(ori_shapes[0]) | |||
for shape in ori_shapes) | |||
else: | |||
assert all(shape == ori_shapes[0] for shape in ori_shapes) | |||
img_shapes = [_['img_shape'] for _ in img_meta] | |||
if isinstance(img_shapes[0], torch.Tensor): | |||
assert all( | |||
tensor_to_tuple(shape) == tensor_to_tuple(img_shapes[0]) | |||
for shape in img_shapes) | |||
else: | |||
assert all(shape == img_shapes[0] for shape in img_shapes) | |||
pad_shapes = [_['pad_shape'] for _ in img_meta] | |||
if isinstance(pad_shapes[0], torch.Tensor): | |||
assert all( | |||
tensor_to_tuple(shape) == tensor_to_tuple(pad_shapes[0]) | |||
for shape in pad_shapes) | |||
else: | |||
assert all(shape == pad_shapes[0] for shape in pad_shapes) | |||
if num_augs == 1: | |||
return self.simple_test(imgs[0], img_metas[0], **kwargs) | |||
else: | |||
return self.aug_test(imgs, img_metas, **kwargs) | |||
@auto_fp16(apply_to=('img', )) | |||
def forward(self, img, img_metas, return_loss=True, **kwargs): | |||
"""Calls either :func:`forward_train` or :func:`forward_test` depending | |||
on whether ``return_loss`` is ``True``. | |||
Note this setting will change the expected inputs. When | |||
``return_loss=True``, img and img_meta are single-nested (i.e. Tensor | |||
and List[dict]), and when ``return_loss=False``, img and img_meta | |||
should be double nested (i.e. List[Tensor], List[List[dict]]), with | |||
the outer list indicating test time augmentations. | |||
""" | |||
if return_loss: | |||
return self.forward_train(img, img_metas, **kwargs) | |||
else: | |||
return self.forward_test(img, img_metas, **kwargs) | |||
def train_step(self, data_batch, optimizer, **kwargs): | |||
"""The iteration step during training. | |||
This method defines an iteration step during training, except for the | |||
back propagation and optimizer updating, which are done in an optimizer | |||
hook. Note that in some complicated cases or models, the whole process | |||
including back propagation and optimizer updating is also defined in | |||
this method, such as GAN. | |||
Args: | |||
data_batch (dict): The output of dataloader. | |||
optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of | |||
runner is passed to ``train_step()``. This argument is unused | |||
and reserved. | |||
Returns: | |||
dict: It should contain at least 3 keys: ``loss``, ``log_vars``, | |||
``num_samples``. | |||
``loss`` is a tensor for back propagation, which can be a | |||
weighted sum of multiple losses. | |||
``log_vars`` contains all the variables to be sent to the | |||
logger. | |||
``num_samples`` indicates the batch size (when the model is | |||
DDP, it means the batch size on each GPU), which is used for | |||
averaging the logs. | |||
""" | |||
losses = self(**data_batch) | |||
loss, log_vars = self._parse_losses(losses) | |||
outputs = dict( | |||
loss=loss, | |||
log_vars=log_vars, | |||
num_samples=len(data_batch['img_metas'])) | |||
return outputs | |||
def val_step(self, data_batch, optimizer=None, **kwargs): | |||
"""The iteration step during validation. | |||
This method shares the same signature as :func:`train_step`, but used | |||
during val epochs. Note that the evaluation after training epochs is | |||
not implemented with this method, but an evaluation hook. | |||
""" | |||
losses = self(**data_batch) | |||
loss, log_vars = self._parse_losses(losses) | |||
log_vars_ = dict() | |||
for loss_name, loss_value in log_vars.items(): | |||
k = loss_name + '_val' | |||
log_vars_[k] = loss_value | |||
outputs = dict( | |||
loss=loss, | |||
log_vars=log_vars_, | |||
num_samples=len(data_batch['img_metas'])) | |||
return outputs | |||
@staticmethod | |||
def _parse_losses(losses): | |||
"""Parse the raw outputs (losses) of the network. | |||
Args: | |||
losses (dict): Raw output of the network, which usually contain | |||
losses and other necessary information. | |||
Returns: | |||
tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor | |||
which may be a weighted sum of all losses, log_vars contains | |||
all the variables to be sent to the logger. | |||
""" | |||
log_vars = OrderedDict() | |||
for loss_name, loss_value in losses.items(): | |||
if isinstance(loss_value, torch.Tensor): | |||
log_vars[loss_name] = loss_value.mean() | |||
elif isinstance(loss_value, list): | |||
log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) | |||
else: | |||
raise TypeError( | |||
f'{loss_name} is not a tensor or list of tensors') | |||
loss = sum(_value for _key, _value in log_vars.items() | |||
if 'loss' in _key) | |||
# If log_vars has different lengths across GPUs, raise an assertion | |||
# error to prevent GPUs from waiting on each other indefinitely. | |||
if dist.is_available() and dist.is_initialized(): | |||
log_var_length = torch.tensor(len(log_vars), device=loss.device) | |||
dist.all_reduce(log_var_length) | |||
message = (f'rank {dist.get_rank()}' | |||
+ f' len(log_vars): {len(log_vars)}' + ' keys: ' | |||
+ ','.join(log_vars.keys()) + '\n') | |||
assert log_var_length == len(log_vars) * dist.get_world_size(), \ | |||
'loss log variables are different across GPUs!\n' + message | |||
log_vars['loss'] = loss | |||
for loss_name, loss_value in log_vars.items(): | |||
# reduce loss when distributed training | |||
if dist.is_available() and dist.is_initialized(): | |||
loss_value = loss_value.data.clone() | |||
dist.all_reduce(loss_value.div_(dist.get_world_size())) | |||
log_vars[loss_name] = loss_value.item() | |||
return loss, log_vars | |||
def show_result(self, | |||
img, | |||
result, | |||
palette=None, | |||
win_name='', | |||
show=False, | |||
wait_time=0, | |||
out_file=None, | |||
opacity=0.5): | |||
"""Draw `result` over `img`. | |||
Args: | |||
img (str or Tensor): The image to be displayed. | |||
result (Tensor): The semantic segmentation results to draw over | |||
`img`. | |||
palette (list[list[int]] | np.ndarray | None): The palette of | |||
segmentation map. If None is given, random palette will be | |||
generated. Default: None | |||
win_name (str): The window name. | |||
wait_time (int): Value of waitKey param. | |||
Default: 0. | |||
show (bool): Whether to show the image. | |||
Default: False. | |||
out_file (str or None): The filename to write the image. | |||
Default: None. | |||
opacity(float): Opacity of painted segmentation map. | |||
Default 0.5. | |||
Must be in (0, 1] range. | |||
Returns: | |||
img (ndarray): The drawn image; only returned when neither `show` | |||
nor `out_file` is set. | |||
""" | |||
img = mmcv.imread(img) | |||
img = img.copy() | |||
seg = result[0] | |||
if palette is None: | |||
if self.PALETTE is None: | |||
# Get random state before set seed, | |||
# and restore random state later. | |||
# It will prevent loss of randomness, as the palette | |||
# may be different in each iteration if not specified. | |||
# See: https://github.com/open-mmlab/mmdetection/issues/5844 | |||
state = np.random.get_state() | |||
np.random.seed(42) | |||
# random palette | |||
palette = np.random.randint( | |||
0, 255, size=(len(self.CLASSES), 3)) | |||
np.random.set_state(state) | |||
else: | |||
palette = self.PALETTE | |||
palette = np.array(palette) | |||
assert palette.shape[0] == len(self.CLASSES) | |||
assert palette.shape[1] == 3 | |||
assert len(palette.shape) == 2 | |||
assert 0 < opacity <= 1.0 | |||
color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) | |||
for label, color in enumerate(palette): | |||
color_seg[seg == label, :] = color | |||
# convert to BGR | |||
color_seg = color_seg[..., ::-1] | |||
img = img * (1 - opacity) + color_seg * opacity | |||
img = img.astype(np.uint8) | |||
# if out_file specified, do not show image in window | |||
if out_file is not None: | |||
show = False | |||
if show: | |||
mmcv.imshow(img, win_name, wait_time) | |||
if out_file is not None: | |||
mmcv.imwrite(img, out_file) | |||
if not (show or out_file): | |||
warnings.warn('show==False and out_file is not specified, only ' | |||
'result image will be returned') | |||
return img |
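An illustrative sketch (single-process case, no torch.distributed; the loss values are invented) of what `_parse_losses` does with a raw loss dict: tensors are mean-reduced, lists are summed over their members' means, keys containing 'loss' are summed into the total, and everything is logged as scalars.

from collections import OrderedDict

import torch

raw_losses = {
    'decode.loss_cls': torch.tensor([0.8, 1.2]),                 # mean -> 1.0
    'decode.loss_mask': [torch.tensor(0.5), torch.tensor(0.3)],  # sum of means -> 0.8
    'decode.acc_seg': torch.tensor(92.0),                        # logged, not summed
}
log_vars = OrderedDict(
    (k, v.mean() if isinstance(v, torch.Tensor) else sum(x.mean() for x in v))
    for k, v in raw_losses.items())
total_loss = sum(v for k, v in log_vars.items() if 'loss' in k)  # 1.0 + 0.8 = 1.8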
@@ -0,0 +1,303 @@ | |||
# The implementation refers to the VitAdapter | |||
# available at | |||
# https://github.com/czczup/ViT-Adapter.git | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from mmdet.models import builder | |||
from mmdet.models.builder import DETECTORS | |||
from ...utils import add_prefix, seg_resize | |||
from .base_segmentor import BaseSegmentor | |||
@DETECTORS.register_module() | |||
class EncoderDecoderMask2Former(BaseSegmentor): | |||
"""Encoder Decoder segmentors. | |||
EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. | |||
Note that auxiliary_head is only used for deep supervision during training, | |||
which could be dumped during inference. | |||
""" | |||
def __init__(self, | |||
backbone, | |||
decode_head, | |||
neck=None, | |||
auxiliary_head=None, | |||
train_cfg=None, | |||
test_cfg=None, | |||
pretrained=None, | |||
init_cfg=None): | |||
super(EncoderDecoderMask2Former, self).__init__(init_cfg) | |||
if pretrained is not None: | |||
assert backbone.get('pretrained') is None, \ | |||
'both backbone and segmentor set pretrained weight' | |||
backbone.pretrained = pretrained | |||
self.backbone = builder.build_backbone(backbone) | |||
if neck is not None: | |||
self.neck = builder.build_neck(neck) | |||
decode_head.update(train_cfg=train_cfg) | |||
decode_head.update(test_cfg=test_cfg) | |||
self._init_decode_head(decode_head) | |||
self._init_auxiliary_head(auxiliary_head) | |||
self.train_cfg = train_cfg | |||
self.test_cfg = test_cfg | |||
assert self.with_decode_head | |||
def _init_decode_head(self, decode_head): | |||
"""Initialize ``decode_head``""" | |||
self.decode_head = builder.build_head(decode_head) | |||
self.align_corners = self.decode_head.align_corners | |||
self.num_classes = self.decode_head.num_classes | |||
def _init_auxiliary_head(self, auxiliary_head): | |||
"""Initialize ``auxiliary_head``""" | |||
if auxiliary_head is not None: | |||
if isinstance(auxiliary_head, list): | |||
self.auxiliary_head = nn.ModuleList() | |||
for head_cfg in auxiliary_head: | |||
self.auxiliary_head.append(builder.build_head(head_cfg)) | |||
else: | |||
self.auxiliary_head = builder.build_head(auxiliary_head) | |||
def extract_feat(self, img): | |||
"""Extract features from images.""" | |||
x = self.backbone(img) | |||
if self.with_neck: | |||
x = self.neck(x) | |||
return x | |||
def encode_decode(self, img, img_metas): | |||
"""Encode images with backbone and decode into a semantic segmentation | |||
map of the same size as input.""" | |||
x = self.extract_feat(img) | |||
out = self._decode_head_forward_test(x, img_metas) | |||
out = seg_resize( | |||
input=out, | |||
size=img.shape[2:], | |||
mode='bilinear', | |||
align_corners=self.align_corners) | |||
return out | |||
def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg, | |||
**kwargs): | |||
"""Run forward function and calculate loss for decode head in | |||
training.""" | |||
losses = dict() | |||
loss_decode = self.decode_head.forward_train(x, img_metas, | |||
gt_semantic_seg, **kwargs) | |||
losses.update(add_prefix(loss_decode, 'decode')) | |||
return losses | |||
def _decode_head_forward_test(self, x, img_metas): | |||
"""Run forward function and calculate loss for decode head in | |||
inference.""" | |||
seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg) | |||
return seg_logits | |||
def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg): | |||
"""Run forward function and calculate loss for auxiliary head in | |||
training.""" | |||
losses = dict() | |||
if isinstance(self.auxiliary_head, nn.ModuleList): | |||
for idx, aux_head in enumerate(self.auxiliary_head): | |||
loss_aux = aux_head.forward_train(x, img_metas, | |||
gt_semantic_seg, | |||
self.train_cfg) | |||
losses.update(add_prefix(loss_aux, f'aux_{idx}')) | |||
else: | |||
loss_aux = self.auxiliary_head.forward_train( | |||
x, img_metas, gt_semantic_seg, self.train_cfg) | |||
losses.update(add_prefix(loss_aux, 'aux')) | |||
return losses | |||
def forward_dummy(self, img): | |||
"""Dummy forward function.""" | |||
seg_logit = self.encode_decode(img, None) | |||
return seg_logit | |||
def forward_train(self, img, img_metas, gt_semantic_seg, **kwargs): | |||
"""Forward function for training. | |||
Args: | |||
img (Tensor): Input images. | |||
img_metas (list[dict]): List of image info dict where each dict | |||
has: 'img_shape', 'scale_factor', 'flip', and may also contain | |||
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
For details on the values of these keys see | |||
`mmseg/datasets/pipelines/formatting.py:Collect`. | |||
gt_semantic_seg (Tensor): Semantic segmentation masks | |||
used if the architecture supports the semantic segmentation task. | |||
Returns: | |||
dict[str, Tensor]: a dictionary of loss components | |||
""" | |||
x = self.extract_feat(img) | |||
losses = dict() | |||
loss_decode = self._decode_head_forward_train(x, img_metas, | |||
gt_semantic_seg, | |||
**kwargs) | |||
losses.update(loss_decode) | |||
if self.with_auxiliary_head: | |||
loss_aux = self._auxiliary_head_forward_train( | |||
x, img_metas, gt_semantic_seg) | |||
losses.update(loss_aux) | |||
return losses | |||
# TODO refactor | |||
def slide_inference(self, img, img_meta, rescale): | |||
"""Inference by sliding-window with overlap. | |||
If h_crop > h_img or w_crop > w_img, the small patch will be used to | |||
decode without padding. | |||
""" | |||
h_stride, w_stride = self.test_cfg.stride | |||
h_crop, w_crop = self.test_cfg.crop_size | |||
batch_size, _, h_img, w_img = img.size() | |||
num_classes = self.num_classes | |||
h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 | |||
w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 | |||
preds = img.new_zeros((batch_size, num_classes, h_img, w_img)) | |||
count_mat = img.new_zeros((batch_size, 1, h_img, w_img)) | |||
for h_idx in range(h_grids): | |||
for w_idx in range(w_grids): | |||
y1 = h_idx * h_stride | |||
x1 = w_idx * w_stride | |||
y2 = min(y1 + h_crop, h_img) | |||
x2 = min(x1 + w_crop, w_img) | |||
y1 = max(y2 - h_crop, 0) | |||
x1 = max(x2 - w_crop, 0) | |||
crop_img = img[:, :, y1:y2, x1:x2] | |||
crop_seg_logit = self.encode_decode(crop_img, img_meta) | |||
preds += F.pad(crop_seg_logit, | |||
(int(x1), int(preds.shape[3] - x2), int(y1), | |||
int(preds.shape[2] - y2))) | |||
count_mat[:, :, y1:y2, x1:x2] += 1 | |||
assert (count_mat == 0).sum() == 0 | |||
if torch.onnx.is_in_onnx_export(): | |||
# cast count_mat to constant while exporting to ONNX | |||
count_mat = torch.from_numpy( | |||
count_mat.cpu().detach().numpy()).to(device=img.device) | |||
preds = preds / count_mat | |||
def tensor_to_tuple(input_tensor): | |||
return tuple(input_tensor.cpu().numpy()) | |||
if rescale: | |||
preds = seg_resize( | |||
preds, | |||
size=tensor_to_tuple(img_meta[0]['ori_shape'])[:2] | |||
if isinstance(img_meta[0]['ori_shape'], torch.Tensor) else | |||
img_meta[0]['ori_shape'], | |||
mode='bilinear', | |||
align_corners=self.align_corners, | |||
warning=False) | |||
return preds | |||
def whole_inference(self, img, img_meta, rescale): | |||
"""Inference with full image.""" | |||
seg_logit = self.encode_decode(img, img_meta) | |||
if rescale: | |||
# support dynamic shape for onnx | |||
if torch.onnx.is_in_onnx_export(): | |||
size = img.shape[2:] | |||
else: | |||
size = img_meta[0]['ori_shape'][:2] | |||
seg_logit = seg_resize( | |||
seg_logit, | |||
size=size, | |||
mode='bilinear', | |||
align_corners=self.align_corners, | |||
warning=False) | |||
return seg_logit | |||
def inference(self, img, img_meta, rescale): | |||
"""Inference with slide/whole style. | |||
Args: | |||
img (Tensor): The input image of shape (N, 3, H, W). | |||
img_meta (dict): Image info dict where each dict has: 'img_shape', | |||
'scale_factor', 'flip', and may also contain | |||
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
For details on the values of these keys see | |||
`mmseg/datasets/pipelines/formatting.py:Collect`. | |||
rescale (bool): Whether rescale back to original shape. | |||
Returns: | |||
Tensor: The output segmentation map. | |||
""" | |||
assert self.test_cfg.mode in ['slide', 'whole'] | |||
ori_shape = img_meta[0]['ori_shape'] | |||
def tensor_to_tuple(input_tensor): | |||
return tuple(input_tensor.cpu().numpy()) | |||
if isinstance(ori_shape, torch.Tensor): | |||
assert all( | |||
tensor_to_tuple(_['ori_shape']) == tensor_to_tuple(ori_shape) | |||
for _ in img_meta) | |||
else: | |||
assert all(_['ori_shape'] == ori_shape for _ in img_meta) | |||
if self.test_cfg.mode == 'slide': | |||
seg_logit = self.slide_inference(img, img_meta, rescale) | |||
else: | |||
seg_logit = self.whole_inference(img, img_meta, rescale) | |||
output = F.softmax(seg_logit, dim=1) | |||
flip = img_meta[0]['flip'] | |||
if flip: | |||
flip_direction = img_meta[0]['flip_direction'] | |||
assert flip_direction in ['horizontal', 'vertical'] | |||
if flip_direction == 'horizontal': | |||
output = output.flip(dims=(3, )) | |||
elif flip_direction == 'vertical': | |||
output = output.flip(dims=(2, )) | |||
return output | |||
def simple_test(self, img, img_meta, rescale=True): | |||
"""Simple test with single image.""" | |||
seg_logit = self.inference(img, img_meta, rescale) | |||
seg_pred = seg_logit.argmax(dim=1) | |||
if torch.onnx.is_in_onnx_export(): | |||
# our inference backend only supports 4D output | |||
seg_pred = seg_pred.unsqueeze(0) | |||
return seg_pred | |||
seg_pred = seg_pred.cpu().numpy() | |||
# unravel batch dim | |||
seg_pred = list(seg_pred) | |||
return seg_pred | |||
def aug_test(self, imgs, img_metas, rescale=True): | |||
"""Test with augmentations. | |||
Only rescale=True is supported. | |||
""" | |||
# aug_test rescale all imgs back to ori_shape for now | |||
assert rescale | |||
# to save memory, we get augmented seg logit inplace | |||
seg_logit = self.inference(imgs[0], img_metas[0], rescale) | |||
for i in range(1, len(imgs)): | |||
cur_seg_logit = self.inference(imgs[i], img_metas[i], rescale) | |||
seg_logit += cur_seg_logit | |||
seg_logit /= len(imgs) | |||
seg_pred = seg_logit.argmax(dim=1) | |||
seg_pred = seg_pred.cpu().numpy() | |||
# unravel batch dim | |||
seg_pred = list(seg_pred) | |||
return seg_pred |
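A worked example of the sliding-window bookkeeping in `slide_inference` (image size, crop size and stride below are assumed for illustration): grids are counted with ceil-style arithmetic and border windows are shifted back inside the image, so overlapping regions are later averaged via `count_mat`.

h_img, w_img = 512, 683
h_crop, w_crop = 512, 512
h_stride, w_stride = 341, 341

h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1  # 1
w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1  # 2

windows = []
for h_idx in range(h_grids):
    for w_idx in range(w_grids):
        y1, x1 = h_idx * h_stride, w_idx * w_stride
        y2, x2 = min(y1 + h_crop, h_img), min(x1 + w_crop, w_img)
        y1, x1 = max(y2 - h_crop, 0), max(x2 - w_crop, 0)  # shift back at borders
        windows.append((y1, y2, x1, x2))
# windows == [(0, 512, 0, 512), (0, 512, 171, 683)]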
@@ -0,0 +1,7 @@ | |||
from .builder import build_pixel_sampler | |||
from .data_process_func import ResizeToMultiple | |||
from .seg_func import add_prefix, seg_resize | |||
__all__ = [ | |||
'seg_resize', 'add_prefix', 'build_pixel_sampler', 'ResizeToMultiple' | |||
] |
@@ -0,0 +1,11 @@ | |||
# The implementation refers to the VitAdapter | |||
# available at | |||
# https://github.com/czczup/ViT-Adapter.git | |||
from mmcv.utils import Registry, build_from_cfg | |||
PIXEL_SAMPLERS = Registry('pixel sampler') | |||
def build_pixel_sampler(cfg, **default_args): | |||
"""Build pixel sampler for segmentation map.""" | |||
return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args) |
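A hypothetical usage sketch, assuming `PIXEL_SAMPLERS` and `build_pixel_sampler` from the file above are in scope; the sampler class and its arguments are made up for illustration only.

@PIXEL_SAMPLERS.register_module()
class RandomPixelSampler:
    """Toy sampler that keeps a random fraction of pixels."""

    def __init__(self, keep_ratio=0.5):
        self.keep_ratio = keep_ratio

sampler = build_pixel_sampler(dict(type='RandomPixelSampler', keep_ratio=0.25))
assert isinstance(sampler, RandomPixelSampler) and sampler.keep_ratio == 0.25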
@@ -0,0 +1,60 @@ | |||
# Copyright (c) OpenMMLab. All rights reserved. | |||
import mmcv | |||
from mmdet.datasets.builder import PIPELINES | |||
@PIPELINES.register_module() | |||
class ResizeToMultiple(object): | |||
"""Resize images & seg to multiple of divisor. | |||
Args: | |||
size_divisor (int): images and gt seg maps need to resize to multiple | |||
of size_divisor. Default: 32. | |||
interpolation (str, optional): The interpolation mode of image resize. | |||
Default: None | |||
""" | |||
def __init__(self, size_divisor=32, interpolation=None): | |||
self.size_divisor = size_divisor | |||
self.interpolation = interpolation | |||
def __call__(self, results): | |||
"""Call function to resize images, semantic segmentation map to | |||
multiple of size divisor. | |||
Args: | |||
results (dict): Result dict from loading pipeline. | |||
Returns: | |||
dict: Resized results, 'img_shape', 'pad_shape' keys are updated. | |||
""" | |||
# Align image to multiple of size divisor. | |||
img = results['img'] | |||
img = mmcv.imresize_to_multiple( | |||
img, | |||
self.size_divisor, | |||
scale_factor=1, | |||
interpolation=self.interpolation | |||
if self.interpolation else 'bilinear') | |||
results['img'] = img | |||
results['img_shape'] = img.shape | |||
results['pad_shape'] = img.shape | |||
# Align segmentation map to multiple of size divisor. | |||
for key in results.get('seg_fields', []): | |||
gt_seg = results[key] | |||
gt_seg = mmcv.imresize_to_multiple( | |||
gt_seg, | |||
self.size_divisor, | |||
scale_factor=1, | |||
interpolation='nearest') | |||
results[key] = gt_seg | |||
return results | |||
def __repr__(self): | |||
repr_str = self.__class__.__name__ | |||
repr_str += (f'(size_divisor={self.size_divisor}, ' | |||
f'interpolation={self.interpolation})') | |||
return repr_str |
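A quick sketch (assuming `ResizeToMultiple` above is in scope) of the transform's effect: a 375x500 image is resized so both sides become multiples of 32, and the shape keys are updated accordingly.

import numpy as np

results = {
    'img': np.zeros((375, 500, 3), dtype=np.uint8),
    'seg_fields': [],
}
results = ResizeToMultiple(size_divisor=32)(results)
print(results['img_shape'])  # (384, 512, 3)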
@@ -0,0 +1,48 @@ | |||
# The implementation refers to the VitAdapter | |||
# available at | |||
# https://github.com/czczup/ViT-Adapter.git | |||
import warnings | |||
import torch.nn.functional as F | |||
def seg_resize(input, | |||
size=None, | |||
scale_factor=None, | |||
mode='nearest', | |||
align_corners=None, | |||
warning=True): | |||
if warning: | |||
if size is not None and align_corners: | |||
input_h, input_w = tuple(int(x) for x in input.shape[2:]) | |||
output_h, output_w = tuple(int(x) for x in size) | |||
if output_h > input_h or output_w > input_w: | |||
if ((output_h > 1 and output_w > 1 and input_h > 1 | |||
and input_w > 1) and (output_h - 1) % (input_h - 1) | |||
and (output_w - 1) % (input_w - 1)): | |||
warnings.warn( | |||
f'When align_corners={align_corners}, ' | |||
'the output would be more aligned if ' | |||
f'input size {(input_h, input_w)} is `x+1` and ' | |||
f'out size {(output_h, output_w)} is `nx+1`') | |||
return F.interpolate(input, size, scale_factor, mode, align_corners) | |||
def add_prefix(inputs, prefix): | |||
"""Add prefix for dict. | |||
Args: | |||
inputs (dict): The input dict with str keys. | |||
prefix (str): The prefix to add. | |||
Returns: | |||
dict: The dict with keys updated with ``prefix``. | |||
""" | |||
outputs = dict() | |||
for name, value in inputs.items(): | |||
outputs[f'{prefix}.{name}'] = value | |||
return outputs |
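A small usage sketch for the two helpers above (tensors are dummies), assuming they are in scope.

import torch

losses = {'loss_cls': torch.tensor(1.0), 'loss_mask': torch.tensor(2.0)}
print(add_prefix(losses, 'decode'))
# {'decode.loss_cls': tensor(1.), 'decode.loss_mask': tensor(2.)}

logits = torch.randn(1, 19, 128, 128)
up = seg_resize(logits, size=(512, 512), mode='bilinear', align_corners=False)
print(up.shape)  # torch.Size([1, 19, 512, 512])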
@@ -0,0 +1,25 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .model import MovieSceneSegmentationModel | |||
from .datasets import MovieSceneSegmentationDataset | |||
else: | |||
_import_structure = { | |||
'model': ['MovieSceneSegmentationModel'], | |||
'datasets': ['MovieSceneSegmentationDataset'], | |||
} | |||
import sys | |||
sys.modules[__name__] = LazyImportModule( | |||
__name__, | |||
globals()['__file__'], | |||
_import_structure, | |||
module_spec=__spec__, | |||
extra_objects={}, | |||
) |
@@ -0,0 +1,45 @@ | |||
# ------------------------------------------------------------------------------------ | |||
# BaSSL | |||
# Copyright (c) 2021 KakaoBrain. All Rights Reserved. | |||
# Licensed under the Apache License, Version 2.0 [see LICENSE for details] | |||
# Github: https://github.com/kakaobrain/bassl | |||
# ------------------------------------------------------------------------------------ | |||
from .utils.shot_encoder import resnet50 | |||
from .utils.trn import TransformerCRN | |||
def get_shot_encoder(cfg): | |||
name = cfg['model']['shot_encoder']['name'] | |||
shot_encoder_args = cfg['model']['shot_encoder'][name] | |||
if name == 'resnet': | |||
depth = shot_encoder_args['depth'] | |||
if depth == 50: | |||
shot_encoder = resnet50(**shot_encoder_args['params'], ) | |||
else: | |||
raise NotImplementedError | |||
else: | |||
raise NotImplementedError | |||
return shot_encoder | |||
def get_contextual_relation_network(cfg): | |||
crn = None | |||
if cfg['model']['contextual_relation_network']['enabled']: | |||
name = cfg['model']['contextual_relation_network']['name'] | |||
crn_args = cfg['model']['contextual_relation_network']['params'][name] | |||
if name == 'trn': | |||
sampling_name = cfg['model']['loss']['sampling_method']['name'] | |||
crn_args['neighbor_size'] = ( | |||
2 * cfg['model']['loss']['sampling_method']['params'] | |||
[sampling_name]['neighbor_size']) | |||
crn = TransformerCRN(crn_args) | |||
else: | |||
raise NotImplementedError | |||
return crn | |||
__all__ = ['get_shot_encoder', 'get_contextual_relation_network'] |
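A minimal config sketch (keys and values are illustrative, not the repository's real configuration) of the nesting `get_shot_encoder` expects; `get_contextual_relation_network` reads a similar block under 'contextual_relation_network' and doubles the sampling method's `neighbor_size` before constructing `TransformerCRN`.

cfg = {
    'model': {
        'shot_encoder': {
            'name': 'resnet',
            'resnet': {'depth': 50, 'params': {'in_channel_dim': 3}},
        },
    },
}
shot_encoder = get_shot_encoder(cfg)  # a torchvision-style ResNet-50 trunk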
@@ -0,0 +1,192 @@ | |||
import os | |||
import os.path as osp | |||
from typing import Any, Dict | |||
import einops | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import torchvision.transforms as TF | |||
from PIL import Image | |||
from shotdetect_scenedetect_lgss import shot_detect | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base.base_torch_model import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from modelscope.utils.logger import get_logger | |||
from .get_model import get_contextual_relation_network, get_shot_encoder | |||
from .utils.save_op import get_pred_boundary, pred2scene, scene2video | |||
logger = get_logger() | |||
@MODELS.register_module( | |||
Tasks.movie_scene_segmentation, module_name=Models.resnet50_bert) | |||
class MovieSceneSegmentationModel(TorchModel): | |||
def __init__(self, model_dir: str, *args, **kwargs): | |||
"""str -- model file root.""" | |||
super().__init__(model_dir, *args, **kwargs) | |||
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
params = torch.load(model_path, map_location='cpu') | |||
config_path = osp.join(model_dir, ModelFile.CONFIGURATION) | |||
self.cfg = Config.from_file(config_path) | |||
def load_param_with_prefix(prefix, model, src_params): | |||
own_state = model.state_dict() | |||
for name, param in own_state.items(): | |||
src_name = prefix + '.' + name | |||
own_state[name] = src_params[src_name] | |||
model.load_state_dict(own_state) | |||
self.shot_encoder = get_shot_encoder(self.cfg) | |||
load_param_with_prefix('shot_encoder', self.shot_encoder, params) | |||
self.crn = get_contextual_relation_network(self.cfg) | |||
load_param_with_prefix('crn', self.crn, params) | |||
crn_name = self.cfg.model.contextual_relation_network.name | |||
hdim = self.cfg.model.contextual_relation_network.params[crn_name][ | |||
'hidden_size'] | |||
self.head_sbd = nn.Linear(hdim, 2) | |||
load_param_with_prefix('head_sbd', self.head_sbd, params) | |||
self.test_transform = TF.Compose([ | |||
TF.Resize(size=256, interpolation=Image.BICUBIC), | |||
TF.CenterCrop(224), | |||
TF.ToTensor(), | |||
TF.Normalize( | |||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |||
]) | |||
self.infer_result = {'vid': [], 'sid': [], 'pred': []} | |||
sampling_method = self.cfg.dataset.sampling_method.name | |||
self.neighbor_size = self.cfg.dataset.sampling_method.params[ | |||
sampling_method].neighbor_size | |||
self.eps = 1e-5 | |||
def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]: | |||
data = inputs['video'] | |||
labels = inputs['label'] | |||
outputs = self.shared_step(data) | |||
loss = F.cross_entropy( | |||
outputs.squeeze(), labels.squeeze(), reduction='none') | |||
lpos = labels == 1 | |||
lneg = labels == 0 | |||
# use names that do not shadow the `torch.nn` alias imported above | |||
pos_coef, neg_coef = 1, 1 | |||
wp = (pos_coef / float(pos_coef + neg_coef)) * lpos / (lpos.sum() + self.eps) | |||
wn = (neg_coef / float(pos_coef + neg_coef)) * lneg / (lneg.sum() + self.eps) | |||
w = wp + wn | |||
loss = (w * loss).sum() | |||
probs = torch.argmax(outputs, dim=1) | |||
re = dict(pred=probs, loss=loss) | |||
return re | |||
def inference(self, batch): | |||
logger.info('Begin scene detect ......') | |||
bs = self.cfg.pipeline.batch_size_per_gpu | |||
sids = batch['sid'] | |||
inputs = batch['shot_feat'] | |||
shot_num = len(sids) | |||
cnt = (shot_num + bs - 1) // bs  # ceil division avoids an empty trailing batch | |||
for i in range(cnt): | |||
start = i * bs | |||
end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num | |||
input_ = inputs[start:end] | |||
sid_ = sids[start:end] | |||
input_ = torch.stack(input_) | |||
outputs = self.shared_step(input_) # shape [b,2] | |||
prob = F.softmax(outputs, dim=1) | |||
self.infer_result['sid'].extend(sid_.cpu().detach().numpy()) | |||
self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy()) | |||
self.infer_result['pred'] = np.stack(self.infer_result['pred']) | |||
assert len(self.infer_result['sid']) == len(sids) | |||
assert len(self.infer_result['pred']) == len(inputs) | |||
return self.infer_result | |||
def shared_step(self, inputs): | |||
with torch.no_grad(): | |||
# infer shot encoder | |||
shot_repr = self.extract_shot_representation(inputs) | |||
assert len(shot_repr.shape) == 3 | |||
# infer CRN | |||
_, pooled = self.crn(shot_repr, mask=None) | |||
# infer boundary score | |||
pred = self.head_sbd(pooled) | |||
return pred | |||
def save_shot_feat(self, _repr): | |||
feat = _repr.float().cpu().numpy() | |||
pth = self.cfg.dataset.img_path + '/features' | |||
os.makedirs(pth, exist_ok=True) | |||
for idx in range(_repr.shape[0]): | |||
name = f'shot_{str(idx).zfill(4)}.npy' | |||
name = osp.join(pth, name) | |||
np.save(name, feat[idx]) | |||
def extract_shot_representation(self, | |||
inputs: torch.Tensor) -> torch.Tensor: | |||
""" inputs [b s k c h w] -> output [b d] """ | |||
assert len(inputs.shape) == 6 # (B Shot Keyframe C H W) | |||
b, s, k, c, h, w = inputs.shape | |||
inputs = einops.rearrange(inputs, 'b s k c h w -> (b s) k c h w', s=s) | |||
keyframe_repr = [self.shot_encoder(inputs[:, _k]) for _k in range(k)] | |||
# [k (b s) d] -> [(b s) d] | |||
shot_repr = torch.stack(keyframe_repr).mean(dim=0) | |||
shot_repr = einops.rearrange(shot_repr, '(b s) d -> b s d', s=s) | |||
return shot_repr | |||
def postprocess(self, inputs: Dict[str, Any], **kwargs): | |||
logger.info('Generate scene .......') | |||
pred_dict = inputs['feat'] | |||
thres = self.cfg.pipeline.save_threshold | |||
anno_dict = get_pred_boundary(pred_dict, thres) | |||
scene_dict, scene_list = pred2scene(self.shot2keyf, anno_dict) | |||
if self.cfg.pipeline.save_split_scene: | |||
re_dir = scene2video(inputs['input_video_pth'], scene_list, thres) | |||
print(f'Split scene video saved to {re_dir}') | |||
return len(scene_list), scene_dict | |||
def preprocess(self, inputs): | |||
logger.info('Begin shot detect......') | |||
shot_keyf_lst, anno, shot2keyf = shot_detect( | |||
inputs, **self.cfg.preprocessor.shot_detect) | |||
logger.info('Shot detect done!') | |||
single_shot_feat, sid = [], [] | |||
for idx, one_shot in enumerate(shot_keyf_lst): | |||
one_shot = [ | |||
self.test_transform(one_frame) for one_frame in one_shot | |||
] | |||
one_shot = torch.stack(one_shot, dim=0) | |||
single_shot_feat.append(one_shot) | |||
sid.append(idx) | |||
single_shot_feat = torch.stack(single_shot_feat, dim=0) | |||
shot_feat = [] | |||
for idx, one_shot in enumerate(anno): | |||
shot_idx = int(one_shot['shot_id']) + np.arange( | |||
-self.neighbor_size, self.neighbor_size + 1) | |||
shot_idx = np.clip(shot_idx, 0, one_shot['num_shot']) | |||
_one_shot = single_shot_feat[shot_idx] | |||
shot_feat.append(_one_shot) | |||
self.shot2keyf = shot2keyf | |||
self.anno = anno | |||
return shot_feat, sid |
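A worked example (numbers assumed for illustration) of the neighbor-window indexing used in `preprocess`: each shot gathers itself plus `neighbor_size` shots on each side, with indices clipped at the sequence boundaries.

import numpy as np

neighbor_size, num_shot, shot_id = 2, 10, 1
shot_idx = shot_id + np.arange(-neighbor_size, neighbor_size + 1)  # [-1  0  1  2  3]
shot_idx = np.clip(shot_idx, 0, num_shot)                          # [ 0  0  1  2  3]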
@@ -0,0 +1,3 @@ | |||
from .save_op import get_pred_boundary, pred2scene, scene2video | |||
from .shot_encoder import resnet50 | |||
from .trn import TransformerCRN |
@@ -0,0 +1,29 @@ | |||
# ------------------------------------------------------------------------------------ | |||
# BaSSL | |||
# Copyright (c) 2021 KakaoBrain. All Rights Reserved. | |||
# Licensed under the Apache License, Version 2.0 [see LICENSE for details] | |||
# Github: https://github.com/kakaobrain/bassl | |||
# ------------------------------------------------------------------------------------ | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
class MlpHead(nn.Module): | |||
def __init__(self, input_dim=2048, hidden_dim=2048, output_dim=128): | |||
super().__init__() | |||
self.output_dim = output_dim | |||
self.input_dim = input_dim | |||
self.hidden_dim = hidden_dim | |||
self.model = nn.Sequential( | |||
nn.Linear(self.input_dim, self.hidden_dim, bias=True), | |||
nn.ReLU(), | |||
nn.Linear(self.hidden_dim, self.output_dim, bias=True), | |||
) | |||
def forward(self, x): | |||
# x shape: [b t d] where t means the number of views | |||
x = self.model(x) | |||
return F.normalize(x, dim=-1) |
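A tiny usage sketch (dummy input) of the head above: it projects 2048-d shot features to L2-normalized 128-d embeddings.

import torch

head = MlpHead(input_dim=2048, hidden_dim=2048, output_dim=128)
z = head(torch.randn(4, 2, 2048))  # [batch, views, dim]
print(z.shape)                     # torch.Size([4, 2, 128]); each vector has unit norm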
@@ -0,0 +1,118 @@ | |||
# ---------------------------------------------------------------------------------- | |||
# The codes below partially refer to the SceneSeg LGSS. | |||
# Github: https://github.com/AnyiRao/SceneSeg | |||
# ---------------------------------------------------------------------------------- | |||
import os | |||
import os.path as osp | |||
import subprocess | |||
import cv2 | |||
import numpy as np | |||
from tqdm import tqdm | |||
def get_pred_boundary(pred_dict, threshold=0.5): | |||
pred = pred_dict['pred'] | |||
tmp = (pred > threshold).astype(np.int32) | |||
anno_dict = {} | |||
for idx in range(len(tmp)): | |||
anno_dict.update({str(pred_dict['sid'][idx]).zfill(4): int(tmp[idx])}) | |||
return anno_dict | |||
def pred2scene(shot2keyf, anno_dict): | |||
scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict) | |||
scene_dict = {} | |||
assert len(scene_list) == len(pair_list) | |||
for scene_ind, scene_item in enumerate(scene_list): | |||
scene_dict.update( | |||
{scene_ind: { | |||
'shot': pair_list[scene_ind], | |||
'frame': scene_item | |||
}}) | |||
return scene_dict, scene_list | |||
def scene2video(source_movie_fn, scene_list, thres): | |||
vcap = cv2.VideoCapture(source_movie_fn) | |||
fps = vcap.get(cv2.CAP_PROP_FPS) # video.fps | |||
out_video_dir_fn = os.path.join(os.getcwd(), | |||
f'pred_result/scene_video_{thres}') | |||
os.makedirs(out_video_dir_fn, exist_ok=True) | |||
for scene_ind, scene_item in tqdm(enumerate(scene_list)): | |||
scene = str(scene_ind).zfill(4) | |||
start_frame = int(scene_item[0]) | |||
end_frame = int(scene_item[1]) | |||
start_time, end_time = start_frame / fps, end_frame / fps | |||
duration_time = end_time - start_time | |||
out_video_fn = os.path.join(out_video_dir_fn, | |||
'scene_{}.mp4'.format(scene)) | |||
if os.path.exists(out_video_fn): | |||
continue | |||
call_list = ['ffmpeg'] | |||
call_list += ['-v', 'quiet'] | |||
call_list += [ | |||
'-y', '-ss', | |||
str(start_time), '-t', | |||
str(duration_time), '-i', source_movie_fn | |||
] | |||
call_list += ['-map_chapters', '-1'] | |||
call_list += [out_video_fn] | |||
subprocess.call(call_list) | |||
return osp.join(os.getcwd(), 'pred_result') | |||
def get_demo_scene_list(shot2keyf, anno_dict): | |||
pair_list = get_pair_list(anno_dict) | |||
scene_list = [] | |||
for pair in pair_list: | |||
start_shot, end_shot = int(pair[0]), int(pair[-1]) | |||
start_frame = shot2keyf[start_shot].split(' ')[0] | |||
end_frame = shot2keyf[end_shot].split(' ')[1] | |||
scene_list.append((start_frame, end_frame)) | |||
return scene_list, pair_list | |||
def get_pair_list(anno_dict): | |||
sort_anno_dict_key = sorted(anno_dict.keys()) | |||
tmp = 0 | |||
tmp_list = [] | |||
tmp_label_list = [] | |||
anno_list = [] | |||
anno_label_list = [] | |||
for key in sort_anno_dict_key: | |||
value = anno_dict.get(key) | |||
tmp += value | |||
tmp_list.append(key) | |||
tmp_label_list.append(value) | |||
if tmp == 1: | |||
anno_list.append(tmp_list) | |||
anno_label_list.append(tmp_label_list) | |||
tmp = 0 | |||
tmp_list = [] | |||
tmp_label_list = [] | |||
continue | |||
if key == sort_anno_dict_key[-1]: | |||
if len(tmp_list) > 0: | |||
anno_list.append(tmp_list) | |||
anno_label_list.append(tmp_label_list) | |||
if len(anno_list) == 0: | |||
return None | |||
while [] in anno_list: | |||
anno_list.remove([]) | |||
tmp_anno_list = [anno_list[0]] | |||
pair_list = [] | |||
for ind in range(len(anno_list) - 1): | |||
cont_count = int(anno_list[ind + 1][0]) - int(anno_list[ind][-1]) | |||
if cont_count > 1: | |||
pair_list.extend(tmp_anno_list) | |||
tmp_anno_list = [anno_list[ind + 1]] | |||
continue | |||
tmp_anno_list.append(anno_list[ind + 1]) | |||
pair_list.extend(tmp_anno_list) | |||
return pair_list |
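A worked example (scores are made up; assumes the helpers above are in scope) of the boundary post-processing: shots whose boundary probability exceeds the threshold close a scene, and `get_pair_list` groups consecutive shot ids accordingly.

import numpy as np

pred_dict = {'sid': [0, 1, 2, 3, 4],
             'pred': np.array([0.1, 0.9, 0.2, 0.8, 0.3])}
anno_dict = get_pred_boundary(pred_dict, threshold=0.5)
# {'0000': 0, '0001': 1, '0002': 0, '0003': 1, '0004': 0}
pairs = get_pair_list(anno_dict)
# [['0000', '0001'], ['0002', '0003'], ['0004']]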
@@ -0,0 +1,331 @@ | |||
""" | |||
Modified from original implementation in torchvision | |||
""" | |||
from typing import Any, Callable, List, Optional, Type, Union | |||
import torch | |||
import torch.nn as nn | |||
from torch import Tensor | |||
def conv3x3(in_planes: int, | |||
out_planes: int, | |||
stride: int = 1, | |||
groups: int = 1, | |||
dilation: int = 1) -> nn.Conv2d: | |||
"""3x3 convolution with padding""" | |||
return nn.Conv2d( | |||
in_planes, | |||
out_planes, | |||
kernel_size=3, | |||
stride=stride, | |||
padding=dilation, | |||
groups=groups, | |||
bias=False, | |||
dilation=dilation, | |||
) | |||
def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: | |||
"""1x1 convolution""" | |||
return nn.Conv2d( | |||
in_planes, out_planes, kernel_size=1, stride=stride, bias=False) | |||
class BasicBlock(nn.Module): | |||
expansion: int = 1 | |||
def __init__( | |||
self, | |||
inplanes: int, | |||
planes: int, | |||
stride: int = 1, | |||
downsample: Optional[nn.Module] = None, | |||
groups: int = 1, | |||
base_width: int = 64, | |||
dilation: int = 1, | |||
norm_layer: Optional[Callable[..., nn.Module]] = None, | |||
) -> None: | |||
super(BasicBlock, self).__init__() | |||
if norm_layer is None: | |||
norm_layer = nn.BatchNorm2d | |||
if groups != 1 or base_width != 64: | |||
raise ValueError( | |||
'BasicBlock only supports groups=1 and base_width=64') | |||
if dilation > 1: | |||
raise NotImplementedError( | |||
'Dilation > 1 not supported in BasicBlock') | |||
# Both self.conv1 and self.downsample layers downsample the input when stride != 1 | |||
self.conv1 = conv3x3(inplanes, planes, stride) | |||
self.bn1 = norm_layer(planes) | |||
self.relu = nn.ReLU(inplace=True) | |||
self.conv2 = conv3x3(planes, planes) | |||
self.bn2 = norm_layer(planes) | |||
self.downsample = downsample | |||
self.stride = stride | |||
def forward(self, x: Tensor) -> Tensor: | |||
identity = x | |||
out = self.conv1(x) | |||
out = self.bn1(out) | |||
out = self.relu(out) | |||
out = self.conv2(out) | |||
out = self.bn2(out) | |||
if self.downsample is not None: | |||
identity = self.downsample(x) | |||
out += identity | |||
out = self.relu(out) | |||
return out | |||
class Bottleneck(nn.Module): | |||
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) | |||
# while original implementation places the stride at the first 1x1 convolution(self.conv1) | |||
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. | |||
# This variant is also known as ResNet V1.5 and improves accuracy according to | |||
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. | |||
expansion: int = 4 | |||
def __init__( | |||
self, | |||
inplanes: int, | |||
planes: int, | |||
stride: int = 1, | |||
downsample: Optional[nn.Module] = None, | |||
groups: int = 1, | |||
base_width: int = 64, | |||
dilation: int = 1, | |||
norm_layer: Optional[Callable[..., nn.Module]] = None, | |||
) -> None: | |||
super(Bottleneck, self).__init__() | |||
if norm_layer is None: | |||
norm_layer = nn.BatchNorm2d | |||
width = int(planes * (base_width / 64.0)) * groups | |||
# Both self.conv2 and self.downsample layers downsample the input when stride != 1 | |||
self.conv1 = conv1x1(inplanes, width) | |||
self.bn1 = norm_layer(width) | |||
self.conv2 = conv3x3(width, width, stride, groups, dilation) | |||
self.bn2 = norm_layer(width) | |||
self.conv3 = conv1x1(width, planes * self.expansion) | |||
self.bn3 = norm_layer(planes * self.expansion) | |||
self.relu = nn.ReLU(inplace=True) | |||
self.downsample = downsample | |||
self.stride = stride | |||
def forward(self, x: Tensor) -> Tensor: | |||
identity = x | |||
out = self.conv1(x) | |||
out = self.bn1(out) | |||
out = self.relu(out) | |||
out = self.conv2(out) | |||
out = self.bn2(out) | |||
out = self.relu(out) | |||
out = self.conv3(out) | |||
out = self.bn3(out) | |||
if self.downsample is not None: | |||
identity = self.downsample(x) | |||
out += identity | |||
out = self.relu(out) | |||
return out | |||
class ResNet(nn.Module): | |||
def __init__( | |||
self, | |||
block: Type[Union[BasicBlock, Bottleneck]], | |||
layers: List[int], | |||
in_channel_dim: int = 3, | |||
zero_init_residual: bool = False, | |||
use_last_block_grid: bool = False, | |||
groups: int = 1, | |||
width_per_group: int = 64, | |||
replace_stride_with_dilation: Optional[List[bool]] = None, | |||
norm_layer: Optional[Callable[..., nn.Module]] = None, | |||
) -> None: | |||
super(ResNet, self).__init__() | |||
if norm_layer is None: | |||
norm_layer = nn.BatchNorm2d | |||
self._norm_layer = norm_layer | |||
self.use_last_block_grid = use_last_block_grid | |||
self.inplanes = 64 | |||
self.dilation = 1 | |||
if replace_stride_with_dilation is None: | |||
# each element in the tuple indicates if we should replace | |||
# the 2x2 stride with a dilated convolution instead | |||
replace_stride_with_dilation = [False, False, False] | |||
if len(replace_stride_with_dilation) != 3: | |||
raise ValueError('replace_stride_with_dilation should be None ' | |||
'or a 3-element tuple, got {}'.format( | |||
replace_stride_with_dilation)) | |||
self.groups = groups | |||
self.base_width = width_per_group | |||
self.conv1 = nn.Conv2d( | |||
in_channel_dim, | |||
self.inplanes, | |||
kernel_size=7, | |||
stride=2, | |||
padding=3, | |||
bias=False, | |||
) | |||
self.bn1 = norm_layer(self.inplanes) | |||
self.relu = nn.ReLU(inplace=True) | |||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) | |||
self.layer1 = self._make_layer(block, 64, layers[0]) | |||
self.layer2 = self._make_layer( | |||
block, | |||
128, | |||
layers[1], | |||
stride=2, | |||
dilate=replace_stride_with_dilation[0]) | |||
self.layer3 = self._make_layer( | |||
block, | |||
256, | |||
layers[2], | |||
stride=2, | |||
dilate=replace_stride_with_dilation[1]) | |||
self.layer4 = self._make_layer( | |||
block, | |||
512, | |||
layers[3], | |||
stride=2, | |||
dilate=replace_stride_with_dilation[2]) | |||
self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) | |||
for m in self.modules(): | |||
if isinstance(m, nn.Conv2d): | |||
nn.init.kaiming_normal_( | |||
m.weight, mode='fan_out', nonlinearity='relu') | |||
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): | |||
nn.init.constant_(m.weight, 1) | |||
nn.init.constant_(m.bias, 0) | |||
# Zero-initialize the last BN in each residual branch, | |||
# so that the residual branch starts with zeros, and each residual block behaves like an identity. | |||
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 | |||
if zero_init_residual: | |||
for m in self.modules(): | |||
if isinstance(m, Bottleneck): | |||
nn.init.constant_(m.bn3.weight, | |||
0) # type: ignore[arg-type] | |||
elif isinstance(m, BasicBlock): | |||
nn.init.constant_(m.bn2.weight, | |||
0) # type: ignore[arg-type] | |||
def _make_layer( | |||
self, | |||
block: Type[Union[BasicBlock, Bottleneck]], | |||
planes: int, | |||
blocks: int, | |||
stride: int = 1, | |||
dilate: bool = False, | |||
) -> nn.Sequential: | |||
norm_layer = self._norm_layer | |||
downsample = None | |||
previous_dilation = self.dilation | |||
if dilate: | |||
self.dilation *= stride | |||
stride = 1 | |||
if stride != 1 or self.inplanes != planes * block.expansion: | |||
downsample = nn.Sequential( | |||
conv1x1(self.inplanes, planes * block.expansion, stride), | |||
norm_layer(planes * block.expansion), | |||
) | |||
layers = [] | |||
layers.append( | |||
block( | |||
self.inplanes, | |||
planes, | |||
stride, | |||
downsample, | |||
self.groups, | |||
self.base_width, | |||
previous_dilation, | |||
norm_layer, | |||
)) | |||
self.inplanes = planes * block.expansion | |||
for _ in range(1, blocks): | |||
layers.append( | |||
block( | |||
self.inplanes, | |||
planes, | |||
groups=self.groups, | |||
base_width=self.base_width, | |||
dilation=self.dilation, | |||
norm_layer=norm_layer, | |||
)) | |||
return nn.Sequential(*layers) | |||
def _forward_impl(self, x: Tensor, grid: bool, level: List, both: bool, | |||
grid_only: bool) -> Tensor: | |||
# See note [TorchScript super()] | |||
x = self.conv1(x) | |||
x = self.bn1(x) | |||
x = self.relu(x) | |||
x = self.maxpool(x) | |||
x = self.layer1(x) | |||
x = self.layer2(x) | |||
x = self.layer3(x) | |||
if grid: | |||
x_grid = [] | |||
if 3 in level: | |||
x_grid.append(x.detach().clone()) | |||
if not both and len(level) == 1: | |||
return x_grid | |||
x = self.layer4(x) | |||
if 4 in level: | |||
x_grid.append(x.detach().clone()) | |||
if not both and len(level) == 1: | |||
return x_grid | |||
x = self.avgpool(x) | |||
x = torch.flatten(x, 1) | |||
if not grid or len(level) == 0: | |||
return x | |||
if grid_only: | |||
return x_grid | |||
if both: | |||
return x, x_grid | |||
return x | |||
def forward( | |||
self, | |||
x: Tensor, | |||
grid: bool = False, | |||
level: List = [], | |||
both: bool = False, | |||
grid_only: bool = False, | |||
) -> Tensor: | |||
return self._forward_impl(x, grid, level, both, grid_only) | |||
def resnet50(**kwargs: Any) -> ResNet: | |||
r"""ResNet-50 model from | |||
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_. | |||
""" | |||
return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) |
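A minimal usage sketch of the grid/level arguments of forward(); the 224x224 input size and the shape comments are assumptions based on the resnet50 defaults above.

import torch

model = resnet50()
x = torch.randn(2, 3, 224, 224)  # assumed 224x224 RGB input
pooled = model(x)  # [2, 2048] pooled embedding
layer3_grid = model(x, grid=True, level=[3])  # list holding one [2, 1024, 14, 14] feature map
emb, grids = model(x, grid=True, level=[3, 4], both=True)
# emb: [2, 2048]; grids: [layer3 map [2, 1024, 14, 14], layer4 map [2, 2048, 7, 7]]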
@@ -0,0 +1,132 @@ | |||
# ------------------------------------------------------------------------------------ | |||
# BaSSL | |||
# Copyright (c) 2021 KakaoBrain. All Rights Reserved. | |||
# Licensed under the Apache License, Version 2.0 [see LICENSE for details] | |||
# Github: https://github.com/kakaobrain/bassl | |||
# ------------------------------------------------------------------------------------ | |||
import torch | |||
import torch.nn as nn | |||
from transformers.models.bert.modeling_bert import BertEncoder | |||
class ShotEmbedding(nn.Module): | |||
def __init__(self, cfg): | |||
super().__init__() | |||
nn_size = cfg.neighbor_size + 2 # +1 for center shot, +1 for cls | |||
self.shot_embedding = nn.Linear(cfg.input_dim, cfg.hidden_size) | |||
self.position_embedding = nn.Embedding(nn_size, cfg.hidden_size) | |||
self.mask_embedding = nn.Embedding(2, cfg.input_dim, padding_idx=0) | |||
# tf naming convention for layer norm | |||
self.LayerNorm = nn.LayerNorm(cfg.hidden_size, eps=1e-12) | |||
self.dropout = nn.Dropout(cfg.hidden_dropout_prob) | |||
self.register_buffer('pos_ids', | |||
torch.arange(nn_size, dtype=torch.long)) | |||
def forward( | |||
self, | |||
shot_emb: torch.Tensor, | |||
mask: torch.Tensor = None, | |||
pos_ids: torch.Tensor = None, | |||
) -> torch.Tensor: | |||
assert len(shot_emb.size()) == 3 | |||
if pos_ids is None: | |||
pos_ids = self.pos_ids | |||
        # apply the mask embedding (un-masked positions remain unchanged)
if mask is not None: | |||
self.mask_embedding.weight.data[0, :].fill_(0) | |||
mask_emb = self.mask_embedding(mask.long()) | |||
shot_emb = (shot_emb * (1 - mask).float()[:, :, None]) + mask_emb | |||
# we set [CLS] token to averaged feature | |||
cls_emb = shot_emb.mean(dim=1) | |||
# embedding shots | |||
shot_emb = torch.cat([cls_emb[:, None, :], shot_emb], dim=1) | |||
shot_emb = self.shot_embedding(shot_emb) | |||
pos_emb = self.position_embedding(pos_ids) | |||
embeddings = shot_emb + pos_emb[None, :] | |||
embeddings = self.dropout(self.LayerNorm(embeddings)) | |||
return embeddings | |||
class TransformerCRN(nn.Module): | |||
def __init__(self, cfg): | |||
super().__init__() | |||
self.pooling_method = cfg.pooling_method | |||
self.shot_embedding = ShotEmbedding(cfg) | |||
self.encoder = BertEncoder(cfg) | |||
nn_size = cfg.neighbor_size + 2 # +1 for center shot, +1 for cls | |||
self.register_buffer( | |||
'attention_mask', | |||
self._get_extended_attention_mask( | |||
torch.ones((1, nn_size)).float()), | |||
) | |||
def forward( | |||
self, | |||
shot: torch.Tensor, | |||
mask: torch.Tensor = None, | |||
pos_ids: torch.Tensor = None, | |||
pooling_method: str = None, | |||
): | |||
        # compare against the sequence length (the last dim of the cached
        # 4-D extended mask); its shape[1] is always 1 after broadcasting
        if self.attention_mask.shape[-1] != (shot.shape[1] + 1):
            n_shot = shot.shape[1] + 1  # +1 for CLS token
attention_mask = self._get_extended_attention_mask( | |||
torch.ones((1, n_shot), dtype=torch.float, device=shot.device)) | |||
else: | |||
attention_mask = self.attention_mask | |||
shot_emb = self.shot_embedding(shot, mask=mask, pos_ids=pos_ids) | |||
encoded_emb = self.encoder( | |||
shot_emb, attention_mask=attention_mask).last_hidden_state | |||
return encoded_emb, self.pooler( | |||
encoded_emb, pooling_method=pooling_method) | |||
def pooler(self, sequence_output, pooling_method=None): | |||
if pooling_method is None: | |||
pooling_method = self.pooling_method | |||
if pooling_method == 'cls': | |||
return sequence_output[:, 0, :] | |||
elif pooling_method == 'avg': | |||
return sequence_output[:, 1:].mean(dim=1) | |||
elif pooling_method == 'max': | |||
return sequence_output[:, 1:].max(dim=1)[0] | |||
elif pooling_method == 'center': | |||
cidx = sequence_output.shape[1] // 2 | |||
return sequence_output[:, cidx, :] | |||
        else:
            raise ValueError(
                f'Unsupported pooling_method: {pooling_method}')
def _get_extended_attention_mask(self, attention_mask): | |||
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] | |||
# ourselves in which case we just need to make it broadcastable to all heads. | |||
if attention_mask.dim() == 3: | |||
extended_attention_mask = attention_mask[:, None, :, :] | |||
elif attention_mask.dim() == 2: | |||
extended_attention_mask = attention_mask[:, None, None, :] | |||
else: | |||
raise ValueError( | |||
f'Wrong shape for attention_mask (shape {attention_mask.shape})' | |||
) | |||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for | |||
# masked positions, this operation will create a tensor which is 0.0 for | |||
# positions we want to attend and -10000.0 for masked positions. | |||
# Since we are adding it to the raw scores before the softmax, this is | |||
# effectively the same as removing these entirely. | |||
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 | |||
return extended_attention_mask |
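TransformerCRN reads a few extra fields off a BertConfig (neighbor_size, input_dim, pooling_method); the values below are illustrative assumptions, chosen only to show the expected tensor shapes.

from transformers import BertConfig

cfg = BertConfig(
    hidden_size=256,
    num_hidden_layers=2,
    num_attention_heads=4,
    intermediate_size=512)
cfg.neighbor_size = 8  # assumed number of neighbor shots around the center shot
cfg.input_dim = 2048  # assumed per-shot visual feature dimension
cfg.pooling_method = 'cls'

crn = TransformerCRN(cfg)
shots = torch.randn(4, cfg.neighbor_size + 1, cfg.input_dim)  # center shot + neighbors
encoded, pooled = crn(shots)
# encoded: [4, neighbor_size + 2, 256] with the CLS token prepended; pooled: [4, 256]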
@@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .mmdet_model import DetectionModel | |||
from .yolox_pai import YOLOX | |||
else: | |||
_import_structure = { | |||
'mmdet_model': ['DetectionModel'], | |||
'yolox_pai': ['YOLOX'] | |||
} | |||
import sys | |||
@@ -0,0 +1,16 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from easycv.models.detection.detectors import YOLOX as _YOLOX | |||
from modelscope.metainfo import Models | |||
from modelscope.models.builder import MODELS | |||
from modelscope.models.cv.easycv_base import EasyCVBaseModel | |||
from modelscope.utils.constant import Tasks | |||
@MODELS.register_module( | |||
group_key=Tasks.image_object_detection, module_name=Models.yolox) | |||
class YOLOX(EasyCVBaseModel, _YOLOX): | |||
def __init__(self, model_dir=None, *args, **kwargs): | |||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs) | |||
_YOLOX.__init__(self, *args, **kwargs) |
@@ -13,8 +13,8 @@ from modelscope.models.cv.product_retrieval_embedding.item_embedding import ( | |||
preprocess, resnet50_embed) | |||
from modelscope.outputs import OutputKeys | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from modelscope.utils.device import create_device | |||
from modelscope.utils.logger import get_logger | |||
from modelscope.utils.torch_utils import create_device | |||
logger = get_logger() | |||
@@ -48,9 +48,8 @@ class ProductRetrievalEmbedding(TorchModel): | |||
filter_param(src_params, own_state) | |||
model.load_state_dict(own_state) | |||
cpu_flag = device == 'cpu' | |||
self.device = create_device( | |||
cpu_flag) # device.type == "cpu" or device.type == "cuda" | |||
device) # device.type == "cpu" or device.type == "cuda" | |||
self.use_gpu = self.device.type == 'cuda' | |||
# config the model path | |||
@@ -0,0 +1,21 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .realtime_detector import RealtimeDetector | |||
else: | |||
_import_structure = { | |||
'realtime_detector': ['RealtimeDetector'], | |||
} | |||
import sys | |||
sys.modules[__name__] = LazyImportModule( | |||
__name__, | |||
globals()['__file__'], | |||
_import_structure, | |||
module_spec=__spec__, | |||
extra_objects={}, | |||
) |
@@ -0,0 +1,85 @@ | |||
import os
import os.path as osp
import torch
from modelscope.metainfo import Models | |||
from modelscope.models.base.base_torch_model import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.preprocessors import LoadImage | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from .yolox.data.data_augment import ValTransform | |||
from .yolox.exp import get_exp_by_name | |||
from .yolox.utils import postprocess | |||
@MODELS.register_module( | |||
group_key=Tasks.image_object_detection, | |||
module_name=Models.realtime_object_detection) | |||
class RealtimeDetector(TorchModel): | |||
def __init__(self, model_dir: str, *args, **kwargs): | |||
super().__init__(model_dir, *args, **kwargs) | |||
self.config = Config.from_file( | |||
os.path.join(self.model_dir, ModelFile.CONFIGURATION)) | |||
# model type | |||
self.exp = get_exp_by_name(self.config.model_type) | |||
# build model | |||
self.model = self.exp.get_model() | |||
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) | |||
ckpt = torch.load(model_path, map_location='cpu') | |||
# load the model state dict | |||
self.model.load_state_dict(ckpt['model']) | |||
self.model.eval() | |||
# params setting | |||
self.exp.num_classes = self.config.num_classes | |||
self.confthre = self.config.conf_thr | |||
self.num_classes = self.exp.num_classes | |||
self.nmsthre = self.exp.nmsthre | |||
self.test_size = self.exp.test_size | |||
self.preproc = ValTransform(legacy=False) | |||
def inference(self, img): | |||
with torch.no_grad(): | |||
outputs = self.model(img) | |||
return outputs | |||
def forward(self, inputs): | |||
return self.inference(inputs) | |||
def preprocess(self, img): | |||
img = LoadImage.convert_to_ndarray(img) | |||
height, width = img.shape[:2] | |||
self.ratio = min(self.test_size[0] / img.shape[0], | |||
self.test_size[1] / img.shape[1]) | |||
img, _ = self.preproc(img, None, self.test_size) | |||
img = torch.from_numpy(img).unsqueeze(0) | |||
img = img.float() | |||
return img | |||
def postprocess(self, input): | |||
outputs = postprocess( | |||
input, | |||
self.num_classes, | |||
self.confthre, | |||
self.nmsthre, | |||
class_agnostic=True) | |||
        # postprocess returns one entry per image; guard so bboxes/scores/labels
        # are always defined (None when there is no detection) before returning
        if len(outputs) != 1 or outputs[0] is None:
            return None, None, None
        bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio
        scores = outputs[0][:, 5].cpu().numpy()
        labels = outputs[0][:, 6].cpu().int().numpy()
        return bboxes, scores, labels
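The three stages are meant to be chained by the caller; a hedged sketch below, where '/path/to/model' (a directory holding the configuration and checkpoint files) and 'test.jpg' are placeholders.

detector = RealtimeDetector('/path/to/model')
img = detector.preprocess('test.jpg')  # letterboxed to exp.test_size, returned as an NCHW float tensor
outputs = detector.forward(img)  # raw YOLOX head outputs
bboxes, scores, labels = detector.postprocess(outputs)
# boxes are mapped back to the original resolution via self.ratio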
@@ -0,0 +1,69 @@ | |||
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX | |||
""" | |||
Data augmentation functionality. Passed as callable transformations to | |||
Dataset classes. | |||
The data augmentation procedures were adapted from @weiliu89's SSD paper
http://arxiv.org/abs/1512.02325 | |||
""" | |||
import math | |||
import random | |||
import cv2 | |||
import numpy as np | |||
from ..utils import xyxy2cxcywh | |||
def preproc(img, input_size, swap=(2, 0, 1)): | |||
if len(img.shape) == 3: | |||
padded_img = np.ones( | |||
(input_size[0], input_size[1], 3), dtype=np.uint8) * 114 | |||
else: | |||
padded_img = np.ones(input_size, dtype=np.uint8) * 114 | |||
r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) | |||
resized_img = cv2.resize( | |||
img, | |||
(int(img.shape[1] * r), int(img.shape[0] * r)), | |||
interpolation=cv2.INTER_LINEAR, | |||
).astype(np.uint8) | |||
padded_img[:int(img.shape[0] * r), :int(img.shape[1] * r)] = resized_img | |||
padded_img = padded_img.transpose(swap) | |||
padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) | |||
return padded_img, r | |||
class ValTransform: | |||
""" | |||
    Defines the transformations applied to a test image (cv2 ndarray)
    before it goes into the network:
    letterbox resize/pad -> channel transpose -> (optional) legacy normalization
    Arguments:
        swap ((int,int,int)): final order of channels
        legacy (bool): if True, convert BGR to RGB and apply ImageNet
            mean/std normalization (for checkpoints trained with the old
            preprocessing)
    Returns:
        transform (transform): callable transform to be applied to test/val
        data
""" | |||
def __init__(self, swap=(2, 0, 1), legacy=False): | |||
self.swap = swap | |||
self.legacy = legacy | |||
# assume input is cv2 img for now | |||
def __call__(self, img, res, input_size): | |||
img, _ = preproc(img, input_size, self.swap) | |||
if self.legacy: | |||
img = img[::-1, :, :].copy() | |||
img /= 255.0 | |||
img -= np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1) | |||
img /= np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1) | |||
return img, np.zeros((1, 5)) |
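preproc letterboxes the image to input_size with value-114 padding and returns the resize ratio used later to rescale boxes; a small sketch with an assumed 640x640 target size.

import numpy as np

img = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)  # stand-in HWC BGR image
padded, r = preproc(img, (640, 640))
# r == min(640 / 480, 640 / 640) == 1.0; padded has shape (3, 640, 640), dtype float32

transform = ValTransform(legacy=False)
tensor_ready, _ = transform(img, None, (640, 640))  # same layout, ready for torch.from_numpy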
@@ -0,0 +1,5 @@ | |||
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX | |||
from .base_exp import BaseExp | |||
from .build import get_exp_by_name | |||
from .yolox_base import Exp |
@@ -0,0 +1,12 @@ | |||
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX | |||
from abc import ABCMeta, abstractmethod | |||
from torch.nn import Module | |||
class BaseExp(metaclass=ABCMeta): | |||
@abstractmethod | |||
def get_model(self) -> Module: | |||
pass |
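A concrete experiment only needs to implement get_model; the subclass below is a hypothetical minimal example, not part of the repo (real experiments such as yolox_base.Exp build the full YOLOX network).

import torch.nn as nn

class DummyExp(BaseExp):

    def get_model(self) -> nn.Module:
        # any nn.Module satisfies the abstract contract
        return nn.Identity()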
@@ -0,0 +1,18 @@ | |||
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX | |||
def get_exp_by_name(exp_name): | |||
exp = exp_name.replace('-', | |||
'_') # convert string like "yolox-s" to "yolox_s" | |||
if exp == 'yolox_s': | |||
from .default import YoloXSExp as YoloXExp | |||
elif exp == 'yolox_nano': | |||
from .default import YoloXNanoExp as YoloXExp | |||
elif exp == 'yolox_tiny': | |||
from .default import YoloXTinyExp as YoloXExp | |||
    else:
        raise ValueError(f'Unsupported exp name: {exp_name}')
return YoloXExp() |
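Usage sketch of the factory; it relies on the experiment classes in .default, which are not shown in this diff.

exp = get_exp_by_name('yolox-s')  # dashes are normalized to underscores
model = exp.get_model()  # torch.nn.Module built by the selected experiment
model.eval()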