
merge github

master
Yingda Chen 2 years ago
parent commit 534dd810ae
100 changed files with 6295 additions and 770 deletions
  1. +1 -0 .dev_scripts/ci_container_test.sh
  2. +1 -1 .dev_scripts/dockerci.sh
  3. +30 -0 .github/workflows/publish.yaml
  4. +16 -1 README.md
  5. +3 -0 data/test/audios/3ch_nihaomiya10.wav
  6. +3 -0 data/test/audios/asr_example_ofa.wav
  7. +3 -0 data/test/audios/farend_speech1.wav
  8. +3 -0 data/test/audios/nearend_mic1.wav
  9. +3 -0 data/test/audios/speech_with_noise1.wav
  10. +3 -0 data/test/images/image_camouflag_detection.jpg
  11. +3 -0 data/test/images/image_depth_estimation.jpg
  12. +3 -0 data/test/images/license_plate_detection.jpg
  13. +3 -0 data/test/images/mask_face_recognition_1.jpg
  14. +3 -0 data/test/images/mask_face_recognition_2.jpg
  15. +3 -0 data/test/images/table_recognition.jpg
  16. +2 -2 data/test/regression/sbert_ws_zh.bin
  17. +3 -0 data/test/videos/video_matting_test.mp4
  18. +1 -1 docker/scripts/modelscope_env_init.sh
  19. +3 -3 docs/source/develop.md
  20. +2 -2 docs/source/faq.md
  21. +7 -5 modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py
  22. +1 -1 modelscope/exporters/torch_model_exporter.py
  23. +2 -2 modelscope/fileio/file.py
  24. +38 -24 modelscope/hub/api.py
  25. +5 -0 modelscope/hub/constants.py
  26. +2 -2 modelscope/hub/deploy.py
  27. +44 -28 modelscope/hub/file_download.py
  28. +1 -1 modelscope/hub/git.py
  29. +1 -0 modelscope/hub/repository.py
  30. +41 -0 modelscope/metainfo.py
  31. +2 -2 modelscope/metrics/image_denoise_metric.py
  32. +14 -5 modelscope/metrics/sequence_classification_metric.py
  33. +10 -4 modelscope/metrics/text_generation_metric.py
  34. +16 -10 modelscope/metrics/token_classification_metric.py
  35. +38 -0 modelscope/models/audio/asr/wenet_automatic_speech_recognition.py
  36. +27 -18 modelscope/models/base/base_model.py
  37. +1 -1 modelscope/models/base/base_torch_head.py
  38. +2 -1 modelscope/models/base/base_torch_model.py
  39. +42 -32 modelscope/models/cv/action_detection/action_detection_onnx.py
  40. +2 -2 modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
  41. +20 -0 modelscope/models/cv/face_attribute_recognition/__init__.py
  42. +2 -0 modelscope/models/cv/face_attribute_recognition/fair_face/__init__.py
  43. +79 -0 modelscope/models/cv/face_attribute_recognition/fair_face/face_attribute_recognition.py
  44. +2 -1 modelscope/models/cv/face_detection/__init__.py
  45. +1 -0 modelscope/models/cv/face_detection/scrfd/__init__.py
  46. +2 -1 modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/__init__.py
  47. +99 -0 modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/mobilenet.py
  48. +2 -1 modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/__init__.py
  49. +148 -0 modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/tinymog.py
  50. +67 -0 modelscope/models/cv/face_detection/scrfd/tinymog_detect.py
  51. +200 -0 modelscope/models/cv/face_recognition/torchkit/backbone/arcface_backbone.py
  52. +213 -0 modelscope/models/cv/face_recognition/torchkit/backbone/facemask_backbone.py
  53. +20 -0 modelscope/models/cv/facial_landmark_confidence/__init__.py
  54. +2 -0 modelscope/models/cv/facial_landmark_confidence/flc/__init__.py
  55. +94 -0 modelscope/models/cv/facial_landmark_confidence/flc/facial_landmark_confidence.py
  56. +152 -0 modelscope/models/cv/facial_landmark_confidence/flc/manual_landmark_net.py
  57. +2 -0 modelscope/models/cv/image_classification/backbones/__init__.py
  58. +541 -0 modelscope/models/cv/image_classification/backbones/nextvit.py
  59. +24 -8 modelscope/models/cv/image_classification/mmcls_model.py
  60. +100 -0 modelscope/models/cv/image_classification/utils.py
  61. +1 -0 modelscope/models/cv/image_depth_estimation/__init__.py
  62. +1 -0 modelscope/models/cv/image_depth_estimation/networks/__init__.py
  63. +215 -0 modelscope/models/cv/image_depth_estimation/networks/newcrf_depth.py
  64. +504 -0 modelscope/models/cv/image_depth_estimation/networks/newcrf_layers.py
  65. +272 -0 modelscope/models/cv/image_depth_estimation/networks/newcrf_utils.py
  66. +706 -0 modelscope/models/cv/image_depth_estimation/networks/swin_transformer.py
  67. +365 -0 modelscope/models/cv/image_depth_estimation/networks/uper_crf_head.py
  68. +53 -0 modelscope/models/cv/image_depth_estimation/newcrfs_model.py
  69. +8 -1 modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py
  70. +1 -0 modelscope/models/cv/salient_detection/models/__init__.py
  71. +187 -0 modelscope/models/cv/salient_detection/models/backbone/Res2Net_v1b.py
  72. +6 -0 modelscope/models/cv/salient_detection/models/backbone/__init__.py
  73. +178 -0 modelscope/models/cv/salient_detection/models/modules.py
  74. +74 -0 modelscope/models/cv/salient_detection/models/senet.py
  75. +105 -0 modelscope/models/cv/salient_detection/models/utils.py
  76. +18 -6 modelscope/models/cv/salient_detection/salient_model.py
  77. +1 -1 modelscope/models/cv/tinynas_detection/__init__.py
  78. +7 -4 modelscope/models/cv/tinynas_detection/backbone/__init__.py
  79. +2 -3 modelscope/models/cv/tinynas_detection/backbone/darknet.py
  80. +0 -359 modelscope/models/cv/tinynas_detection/backbone/tinynas.py
  81. +295 -0 modelscope/models/cv/tinynas_detection/backbone/tinynas_csp.py
  82. +238 -0 modelscope/models/cv/tinynas_detection/backbone/tinynas_res.py
  83. +1 -1 modelscope/models/cv/tinynas_detection/core/__init__.py
  84. +1 -1 modelscope/models/cv/tinynas_detection/core/base_ops.py
  85. +1 -1 modelscope/models/cv/tinynas_detection/core/neck_ops.py
  86. +435 -0 modelscope/models/cv/tinynas_detection/core/ops.py
  87. +1 -1 modelscope/models/cv/tinynas_detection/core/repvgg_block.py
  88. +1 -1 modelscope/models/cv/tinynas_detection/core/utils.py
  89. +2 -2 modelscope/models/cv/tinynas_detection/detector.py
  90. +4 -1 modelscope/models/cv/tinynas_detection/head/__init__.py
  91. +3 -2 modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py
  92. +288 -0 modelscope/models/cv/tinynas_detection/head/zero_head.py
  93. +2 -2 modelscope/models/cv/tinynas_detection/neck/__init__.py
  94. +1 -1 modelscope/models/cv/tinynas_detection/neck/giraffe_config.py
  95. +3 -2 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py
  96. +132 -0 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_btn.py
  97. +0 -200 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py
  98. +1 -1 modelscope/models/cv/tinynas_detection/tinynas_damoyolo.py
  99. +1 -1 modelscope/models/cv/tinynas_detection/tinynas_detector.py
  100. +23 -20 modelscope/models/cv/tinynas_detection/utils.py

+1 -0 .dev_scripts/ci_container_test.sh

@@ -1,4 +1,5 @@
if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip install -r requirements/tests.txt
git config --global --add safe.directory /Maas-lib
git config --global user.email tmp


+1 -1 .dev_scripts/dockerci.sh

@@ -7,7 +7,7 @@ gpus='0,1 2,3 4,5 6,7'
cpu_sets='45-58 31-44 16-30 0-15'
cpu_sets_arr=($cpu_sets)
is_get_file_lock=false
CI_COMMAND='bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml'
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml}
echo "ci command: $CI_COMMAND"
idx=0
for gpu in $gpus


+30 -0 .github/workflows/publish.yaml

@@ -0,0 +1,30 @@
name: release

on:
push:
tags:
- 'v**'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-publish
cancel-in-progress: true

jobs:
build-n-publish:
runs-on: ubuntu-20.04
#if: startsWith(github.event.ref, 'refs/tags')
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.7
uses: actions/setup-python@v2
with:
python-version: '3.7'
- name: Install wheel
run: pip install wheel
- name: Build ModelScope
run: python setup.py sdist bdist_wheel
- name: Publish package to PyPI
run: |
echo "I got run"
#pip install twine
#twine upload package/dist/* --skip-existing -u __token__ -p ${{ secrets.PYPI_API_TOKEN }}

+16 -1 README.md

@@ -1,6 +1,21 @@
# ModelScope

=======

<div align="center">

[![PyPI](https://img.shields.io/pypi/v/modelscope)](https://pypi.org/project/modelscope/)
<!-- [![Documentation Status](https://readthedocs.org/projects/easy-cv/badge/?version=latest)](https://easy-cv.readthedocs.io/en/latest/) -->
[![license](https://img.shields.io/github/license/modelscope/modelscope.svg)](https://github.com/modelscope/modelscope/blob/master/LICENSE)
[![open issues](https://isitmaintained.com/badge/open/modelscope/modelscope.svg)](https://github.com/modelscope/modelscope/issues)
[![GitHub pull-requests](https://img.shields.io/github/issues-pr/modelscope/modelscope.svg)](https://GitHub.com/modelscope/modelscope/pull/)
[![GitHub latest commit](https://badgen.net/github/last-commit/modelscope/modelscope)](https://GitHub.com/modelscope/modelscope/commit/)
<!-- [![GitHub contributors](https://img.shields.io/github/contributors/modelscope/modelscope.svg)](https://GitHub.com/modelscope/modelscope/graphs/contributors/) -->
<!-- [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square)](http://makeapullrequest.com) -->


</div>

>>>>>>> github_remote/master
# Introduction

[ModelScope]( https://www.modelscope.cn) is a “Model-as-a-Service” (MaaS) platform that seeks to bring together most advanced machine learning models from the AI community, and to streamline the process of leveraging AI models in real applications. The core ModelScope library enables developers to perform inference, training and evaluation, through rich layers of API designs that facilitate a unified experience across state-of-the-art models from different AI domains.


+3 -0 data/test/audios/3ch_nihaomiya10.wav

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8ce83bf2a8e6056aba3b3cdc92d2e04d23bdf15a2c1fde814cb091444d59a10b
size 3180872

+3 -0 data/test/audios/asr_example_ofa.wav

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:46dbc998c9d1d48111267c40741dd3200f2e5bcf4075f8c4c97f4451160dce50
size 134570

+3 -0 data/test/audios/farend_speech1.wav

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a8cf9fc5abc119f5b5e246143206c22f488c63e86e47f762585b9edd84e081ad
size 618160

+3 -0 data/test/audios/nearend_mic1.wav

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2bc50ef70bbbc46132710b69efa683cf0bf64aeb0990bb3ff411930831bbc17d
size 619034

+3 -0 data/test/audios/speech_with_noise1.wav

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0b2882d3bcd9e8f8f9531ac34ac09c0208d86500b910d3e1ca34c022caa9be62
size 155874

+3 -0 data/test/images/image_camouflag_detection.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4c713215f7fb4da5382c9137347ee52956a7a44d5979c4cffd3c9b6d1d7e878f
size 19445

+3 -0 data/test/images/image_depth_estimation.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3b230497f6ca10be42aed92b86db435d74fd7306746a059b4ad1e0d6b0652806
size 35694

+3 -0 data/test/images/license_plate_detection.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:209f6ba7f15c9c34a02801b4c6ef33a979f3086702b5229d2e7975eb403c3e15
size 45819

+3 -0 data/test/images/mask_face_recognition_1.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e37106cf024efd1886b870fa45f69905fcea202db8a848debc4ccd359ea3b21c
size 116248

+3 -0 data/test/images/mask_face_recognition_2.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:700f7cb3c958fb710d6b863b3c9aa0549f6ab837dfbe3382f8f750f73cec46e3
size 116868

+3 -0 data/test/images/table_recognition.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f4b7e23f02a35136707ac7862e0a8468797f239e89497351847cfacb2a9c24f6
size 202112

+2 -2 data/test/regression/sbert_ws_zh.bin

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7d98ac11a4e9e2744a7402a5cc912da991a41938bbc5dd60f15ee5c6b3196030
size 63349
oid sha256:dc16ad72e753f751360dab82878ec0a31190fb5125632d8f4698f6537fae79cb
size 40819

+3 -0 data/test/videos/video_matting_test.mp4

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8e4ade7a6b119e20e82a641246199b4b530759166acc1f813d7cefee65b3e1e0
size 63944943

+1 -1 docker/scripts/modelscope_env_init.sh

@@ -1,7 +1,7 @@
#!/bin/bash
set -e
set -o pipefail
# chieck git is install
# check git is install
git --version >/dev/null 2>&1 || { echo 'git not installed' ; exit 0; }

if [ -z "$MODELSCOPE_USERNAME" ] || [ -z "$MODELSCOPE_GITLAB_ACCESS_TOKEN" ]; then


+3 -3 docs/source/develop.md

@@ -104,9 +104,9 @@ git lfs install
```

for centos, please download rpm from git-lfs github release [website](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0)
and then execute
```bash
wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm
sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm
sudo rpm -ivh your_rpm_file_name.rpm
git lfs install
```

@@ -144,7 +144,7 @@ git pull origin branch_name
1. Get the latest master code and checkout a new branch for local development.
```shell
git pull origin master --rebase
git checout -b dev/my-dev-branch
git checkout -b dev/my-dev-branch
```
note: replace "dev/my-dev-branch" with a meaningful branch name. We recommend using a new dev branch for every change.
2. Make your local changes.


+2 -2 docs/source/faq.md

@@ -18,9 +18,9 @@
```shell
source $HOME/.cargo/env
```
3. 安装tokenziers
3. 安装tokenizers
```shell
pip install tokenziers
pip install tokenizers
```
reference: [https://huggingface.co/docs/tokenizers/installation#installation-from-sources](https://huggingface.co/docs/tokenizers/installation#installation-from-sources)



+7 -5 modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py

@@ -7,7 +7,8 @@ from torch.utils.data.dataloader import default_collate
from modelscope.exporters.builder import EXPORTERS
from modelscope.exporters.torch_model_exporter import TorchModelExporter
from modelscope.metainfo import Models
from modelscope.preprocessors import Preprocessor, build_preprocessor
from modelscope.preprocessors import (
TextClassificationTransformersPreprocessor, build_preprocessor)
from modelscope.utils.config import Config
from modelscope.utils.constant import ModeKeys, Tasks

@@ -59,12 +60,13 @@ class SbertForSequenceClassificationExporter(TorchModelExporter):
'mode': ModeKeys.TRAIN,
**sequence_length
})
preprocessor: Preprocessor = build_preprocessor(cfg, field_name)
preprocessor: TextClassificationTransformersPreprocessor = build_preprocessor(
cfg, field_name)
if pair:
first_sequence = preprocessor.tokenizer.unk_token
second_sequence = preprocessor.tokenizer.unk_token
first_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token
second_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token
else:
first_sequence = preprocessor.tokenizer.unk_token
first_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token
second_sequence = None

batched = []


+1 -1 modelscope/exporters/torch_model_exporter.py

@@ -17,7 +17,7 @@ from modelscope.utils.regress_test_utils import (compare_arguments_nested,
numpify_tensor_nested)
from .base import Exporter

logger = get_logger(__name__)
logger = get_logger()


class TorchModelExporter(Exporter):


+2 -2 modelscope/fileio/file.py

@@ -138,7 +138,7 @@ class HTTPStorage(Storage):
self, filepath: str) -> Generator[Union[str, Path], None, None]:
"""Download a file from ``filepath``.

``as_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It
``as_local_path`` is decorated by :meth:`contextlib.contextmanager`. It
can be called with ``with`` statement, and when exists from the
``with`` statement, the temporary path will be released.

@@ -192,7 +192,7 @@ class OSSStorage(Storage):
self, filepath: str) -> Generator[Union[str, Path], None, None]:
"""Download a file from ``filepath``.

``as_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It
``as_local_path`` is decorated by :meth:`contextlib.contextmanager`. It
can be called with ``with`` statement, and when exists from the
``with`` statement, the temporary path will be released.



+38 -24 modelscope/hub/api.py

@@ -2,6 +2,7 @@

# yapf: disable
import datetime
import functools
import os
import pickle
import platform
@@ -14,10 +15,12 @@ from http.cookiejar import CookieJar
from os.path import expanduser
from typing import Dict, List, Optional, Tuple, Union

import requests
from requests import Session
from requests.adapters import HTTPAdapter, Retry

from modelscope import __version__
from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,
from modelscope.hub.constants import (API_HTTP_CLIENT_TIMEOUT,
API_RESPONSE_FIELD_DATA,
API_RESPONSE_FIELD_EMAIL,
API_RESPONSE_FIELD_GIT_ACCESS_TOKEN,
API_RESPONSE_FIELD_MESSAGE,
@@ -25,7 +28,8 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,
DEFAULT_CREDENTIALS_PATH,
MODELSCOPE_CLOUD_ENVIRONMENT,
MODELSCOPE_CLOUD_USERNAME,
ONE_YEAR_SECONDS, Licenses,
ONE_YEAR_SECONDS,
REQUESTS_API_HTTP_METHOD, Licenses,
ModelVisibility)
from modelscope.hub.errors import (InvalidParameter, NotExistError,
NotLoginException, NoValidRevisionError,
@@ -54,6 +58,17 @@ class HubApi:
def __init__(self, endpoint=None):
self.endpoint = endpoint if endpoint is not None else get_endpoint()
self.headers = {'user-agent': ModelScopeConfig.get_user_agent()}
self.session = Session()
retry = Retry(total=2, read=2, connect=2, backoff_factor=1,
status_forcelist=(500, 502, 503, 504),)
adapter = HTTPAdapter(max_retries=retry)
self.session.mount('http://', adapter)
self.session.mount('https://', adapter)
# set http timeout
for method in REQUESTS_API_HTTP_METHOD:
setattr(self.session,
method,
functools.partial(getattr(self.session, method), timeout=API_HTTP_CLIENT_TIMEOUT))

def login(
self,
@@ -73,7 +88,7 @@ class HubApi:
</Tip>
"""
path = f'{self.endpoint}/api/v1/login'
r = requests.post(
r = self.session.post(
path, json={'AccessToken': access_token}, headers=self.headers)
raise_for_http_status(r)
d = r.json()
@@ -129,7 +144,7 @@ class HubApi:
'Visibility': visibility, # server check
'License': license
}
r = requests.post(
r = self.session.post(
path, json=body, cookies=cookies, headers=self.headers)
handle_http_post_error(r, path, body)
raise_on_error(r.json())
@@ -150,7 +165,7 @@ class HubApi:
raise ValueError('Token does not exist, please login first.')
path = f'{self.endpoint}/api/v1/models/{model_id}'

r = requests.delete(path, cookies=cookies, headers=self.headers)
r = self.session.delete(path, cookies=cookies, headers=self.headers)
raise_for_http_status(r)
raise_on_error(r.json())

@@ -183,7 +198,7 @@ class HubApi:
else:
path = f'{self.endpoint}/api/v1/models/{owner_or_group}/{name}'

r = requests.get(path, cookies=cookies, headers=self.headers)
r = self.session.get(path, cookies=cookies, headers=self.headers)
handle_http_response(r, logger, cookies, model_id)
if r.status_code == HTTPStatus.OK:
if is_ok(r.json()):
@@ -311,7 +326,7 @@ class HubApi:
"""
cookies = ModelScopeConfig.get_cookies()
path = f'{self.endpoint}/api/v1/models/'
r = requests.put(
r = self.session.put(
path,
data='{"Path":"%s", "PageNumber":%s, "PageSize": %s}' %
(owner_or_group, page_number, page_size),
@@ -360,7 +375,7 @@ class HubApi:
if cutoff_timestamp is None:
cutoff_timestamp = get_release_datetime()
path = f'{self.endpoint}/api/v1/models/{model_id}/revisions?EndTime=%s' % cutoff_timestamp
r = requests.get(path, cookies=cookies, headers=self.headers)
r = self.session.get(path, cookies=cookies, headers=self.headers)
handle_http_response(r, logger, cookies, model_id)
d = r.json()
raise_on_error(d)
@@ -422,7 +437,7 @@ class HubApi:
cookies = self._check_cookie(use_cookies)

path = f'{self.endpoint}/api/v1/models/{model_id}/revisions'
r = requests.get(path, cookies=cookies, headers=self.headers)
r = self.session.get(path, cookies=cookies, headers=self.headers)
handle_http_response(r, logger, cookies, model_id)
d = r.json()
raise_on_error(d)
@@ -467,7 +482,7 @@ class HubApi:
if root is not None:
path = path + f'&Root={root}'

r = requests.get(
r = self.session.get(
path, cookies=cookies, headers={
**headers,
**self.headers
@@ -488,7 +503,7 @@ class HubApi:
def list_datasets(self):
path = f'{self.endpoint}/api/v1/datasets'
params = {}
r = requests.get(path, params=params, headers=self.headers)
r = self.session.get(path, params=params, headers=self.headers)
raise_for_http_status(r)
dataset_list = r.json()[API_RESPONSE_FIELD_DATA]
return [x['Name'] for x in dataset_list]
@@ -514,13 +529,13 @@ class HubApi:
os.makedirs(cache_dir, exist_ok=True)
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
cookies = ModelScopeConfig.get_cookies()
r = requests.get(datahub_url, cookies=cookies)
r = self.session.get(datahub_url, cookies=cookies)
resp = r.json()
datahub_raise_on_error(datahub_url, resp)
dataset_id = resp['Data']['Id']
dataset_type = resp['Data']['Type']
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
r = requests.get(datahub_url, cookies=cookies, headers=self.headers)
r = self.session.get(datahub_url, cookies=cookies, headers=self.headers)
resp = r.json()
datahub_raise_on_error(datahub_url, resp)
file_list = resp['Data']
@@ -539,7 +554,7 @@ class HubApi:
if extension in dataset_meta_format:
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
f'Revision={revision}&FilePath={file_path}'
r = requests.get(datahub_url, cookies=cookies)
r = self.session.get(datahub_url, cookies=cookies)
raise_for_http_status(r)
local_path = os.path.join(cache_dir, file_path)
if os.path.exists(local_path):
@@ -584,7 +599,7 @@ class HubApi:
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
f'ststoken?Revision={revision}'

r = requests.get(url=datahub_url, cookies=cookies, headers=self.headers)
r = self.session.get(url=datahub_url, cookies=cookies, headers=self.headers)
resp = r.json()
raise_on_error(resp)
return resp['Data']
@@ -595,7 +610,7 @@ class HubApi:
f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}'

cookies = ModelScopeConfig.get_cookies()
resp = requests.get(url=url, cookies=cookies)
resp = self.session.get(url=url, cookies=cookies)
resp = resp.json()
raise_on_error(resp)
resp = resp['Data']
@@ -604,7 +619,7 @@ class HubApi:
def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
cookies = ModelScopeConfig.get_cookies()
r = requests.post(url, cookies=cookies, headers=self.headers)
r = self.session.post(url, cookies=cookies, headers=self.headers)
raise_for_http_status(r)

def delete_oss_dataset_object(self, object_name: str, dataset_name: str,
@@ -615,7 +630,7 @@ class HubApi:
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss?Path={object_name}&Revision={revision}'

cookies = self.check_local_cookies(use_cookies=True)
resp = requests.delete(url=url, cookies=cookies)
resp = self.session.delete(url=url, cookies=cookies)
resp = resp.json()
raise_on_error(resp)
resp = resp['Message']
@@ -630,16 +645,15 @@ class HubApi:
f'&Revision={revision}'

cookies = self.check_local_cookies(use_cookies=True)
resp = requests.delete(url=url, cookies=cookies)
resp = self.session.delete(url=url, cookies=cookies)
resp = resp.json()
raise_on_error(resp)
resp = resp['Message']
return resp

@staticmethod
def datahub_remote_call(url):
def datahub_remote_call(self, url):
cookies = ModelScopeConfig.get_cookies()
r = requests.get(url, cookies=cookies, headers={'user-agent': ModelScopeConfig.get_user_agent()})
r = self.session.get(url, cookies=cookies, headers={'user-agent': ModelScopeConfig.get_user_agent()})
resp = r.json()
datahub_raise_on_error(url, resp)
return resp['Data']
@@ -661,7 +675,7 @@ class HubApi:

url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}?user={user_name}'
cookies = ModelScopeConfig.get_cookies()
r = requests.post(url, cookies=cookies, headers=self.headers)
r = self.session.post(url, cookies=cookies, headers=self.headers)
resp = r.json()
raise_on_error(resp)
return resp['Message']
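
The hub/api.py change above routes every request through a shared Session with a retrying adapter and a default per-call timeout. Below is a minimal standalone sketch of that pattern, not the actual HubApi class; the constants are copied from the diff and the commented URL is only a placeholder.

```python
# Standalone sketch of the Session pattern adopted in hub/api.py above.
import functools

from requests import Session
from requests.adapters import HTTPAdapter, Retry

API_HTTP_CLIENT_TIMEOUT = 60  # seconds, as in modelscope/hub/constants.py
REQUESTS_API_HTTP_METHOD = ['get', 'head', 'post', 'put', 'patch', 'delete']

session = Session()
retry = Retry(total=2, read=2, connect=2, backoff_factor=1,
              status_forcelist=(500, 502, 503, 504))
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Bind a default timeout to every HTTP verb so that e.g. session.get(url)
# can no longer hang indefinitely; callers may still pass timeout= explicitly.
for method in REQUESTS_API_HTTP_METHOD:
    setattr(session, method,
            functools.partial(getattr(session, method),
                              timeout=API_HTTP_CLIENT_TIMEOUT))

# r = session.get('https://www.modelscope.cn/api/v1/models/')  # placeholder call
```

Because the adapter is mounted for both http:// and https://, every call made through the session, model and dataset endpoints alike, inherits the same retry policy.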


+5 -0 modelscope/hub/constants.py

@@ -11,7 +11,12 @@ MODEL_ID_SEPARATOR = '/'
FILE_HASH = 'Sha256'
LOGGER_NAME = 'ModelScopeHub'
DEFAULT_CREDENTIALS_PATH = Path.home().joinpath('.modelscope', 'credentials')
REQUESTS_API_HTTP_METHOD = ['get', 'head', 'post', 'put', 'patch', 'delete']
API_HTTP_CLIENT_TIMEOUT = 60
API_RESPONSE_FIELD_DATA = 'Data'
API_FILE_DOWNLOAD_RETRY_TIMES = 5
API_FILE_DOWNLOAD_TIMEOUT = 60 * 5
API_FILE_DOWNLOAD_CHUNK_SIZE = 4096
API_RESPONSE_FIELD_GIT_ACCESS_TOKEN = 'AccessToken'
API_RESPONSE_FIELD_USERNAME = 'Username'
API_RESPONSE_FIELD_EMAIL = 'Email'


+2 -2 modelscope/hub/deploy.py

@@ -36,7 +36,7 @@ class EASRegion(object):


class EASCpuInstanceType(object):
"""EAS Cpu Instance TYpe, ref(https://help.aliyun.com/document_detail/144261.html)
"""EAS Cpu Instance Type, ref(https://help.aliyun.com/document_detail/144261.html)
"""
tiny = 'ecs.c6.2xlarge'
small = 'ecs.c6.4xlarge'
@@ -45,7 +45,7 @@ class EASCpuInstanceType(object):


class EASGpuInstanceType(object):
"""EAS Cpu Instance TYpe, ref(https://help.aliyun.com/document_detail/144261.html)
"""EAS Gpu Instance Type, ref(https://help.aliyun.com/document_detail/144261.html)
"""
tiny = 'ecs.gn5-c28g1.7xlarge'
small = 'ecs.gn5-c8g1.4xlarge'


+44 -28 modelscope/hub/file_download.py

@@ -9,13 +9,16 @@ from pathlib import Path
from typing import Dict, Optional, Union

import requests
from requests.adapters import Retry
from tqdm import tqdm

from modelscope import __version__
from modelscope.hub.api import HubApi, ModelScopeConfig
from modelscope.hub.constants import (API_FILE_DOWNLOAD_CHUNK_SIZE,
API_FILE_DOWNLOAD_RETRY_TIMES,
API_FILE_DOWNLOAD_TIMEOUT, FILE_HASH)
from modelscope.utils.constant import DEFAULT_MODEL_REVISION
from modelscope.utils.logger import get_logger
from .constants import FILE_HASH
from .errors import FileDownloadError, NotExistError
from .utils.caching import ModelFileSystemCache
from .utils.utils import (file_integrity_validation, get_cache_dir,
@@ -165,7 +168,7 @@ def get_file_download_url(model_id: str, file_path: str, revision: str):
"""
Format file download url according to `model_id`, `revision` and `file_path`.
e.g., Given `model_id=john/bert`, `revision=master`, `file_path=README.md`,
the resulted download url is: https://modelscope.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
the resulted download url is: https://modelscope.cn/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
"""
download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}'
return download_url_template.format(
@@ -184,10 +187,7 @@ def http_get_file(
headers: Optional[Dict[str, str]] = None,
):
"""
Download remote file. Do not gobble up errors.
This method is only used by snapshot_download, since the behavior is quite different with single file download
TODO: consolidate with http_get_file() to avoild duplicate code

Download remote file, will retry 5 times before giving up on errors.
Args:
url(`str`):
actual download url of the file
@@ -204,30 +204,46 @@ def http_get_file(
total = -1
temp_file_manager = partial(
tempfile.NamedTemporaryFile, mode='wb', dir=local_dir, delete=False)
get_headers = {} if headers is None else copy.deepcopy(headers)
with temp_file_manager() as temp_file:
logger.info('downloading %s to %s', url, temp_file.name)
headers = copy.deepcopy(headers)

r = requests.get(url, stream=True, headers=headers, cookies=cookies)
r.raise_for_status()

content_length = r.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None

progress = tqdm(
unit='B',
unit_scale=True,
unit_divisor=1024,
total=total,
initial=0,
desc='Downloading',
)
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
# retry sleep 0.5s, 1s, 2s, 4s
retry = Retry(
total=API_FILE_DOWNLOAD_RETRY_TIMES,
backoff_factor=1,
allowed_methods=['GET'])
while True:
try:
downloaded_size = temp_file.tell()
get_headers['Range'] = 'bytes=%d-' % downloaded_size
r = requests.get(
url,
stream=True,
headers=get_headers,
cookies=cookies,
timeout=API_FILE_DOWNLOAD_TIMEOUT)
r.raise_for_status()
content_length = r.headers.get('Content-Length')
total = int(
content_length) if content_length is not None else None
progress = tqdm(
unit='B',
unit_scale=True,
unit_divisor=1024,
total=total,
initial=downloaded_size,
desc='Downloading',
)
for chunk in r.iter_content(
chunk_size=API_FILE_DOWNLOAD_CHUNK_SIZE):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
break
except (Exception) as e: # no matter what happen, we will retry.
retry = retry.increment('GET', url, error=e)
retry.sleep()

logger.info('storing %s in cache at %s', url, local_dir)
downloaded_length = os.path.getsize(temp_file.name)
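
The reworked http_get_file above retries with a Range header so an interrupted transfer resumes instead of restarting. Here is a condensed standalone sketch of that loop under placeholder URL/path names, reusing the constants added in hub/constants.py.

```python
# Standalone sketch of the resumable download loop above; url and target_path
# are placeholders, the constants mirror modelscope/hub/constants.py.
import os
import tempfile

import requests
from requests.adapters import Retry

API_FILE_DOWNLOAD_CHUNK_SIZE = 4096
API_FILE_DOWNLOAD_TIMEOUT = 60 * 5
API_FILE_DOWNLOAD_RETRY_TIMES = 5


def download_with_resume(url: str, target_path: str) -> None:
    retry = Retry(total=API_FILE_DOWNLOAD_RETRY_TIMES, backoff_factor=1,
                  allowed_methods=['GET'])
    with tempfile.NamedTemporaryFile(
            mode='wb', delete=False,
            dir=os.path.dirname(target_path) or '.') as temp_file:
        while True:
            try:
                # Ask the server to continue from what is already on disk.
                headers = {'Range': 'bytes=%d-' % temp_file.tell()}
                r = requests.get(url, stream=True, headers=headers,
                                 timeout=API_FILE_DOWNLOAD_TIMEOUT)
                r.raise_for_status()
                for chunk in r.iter_content(
                        chunk_size=API_FILE_DOWNLOAD_CHUNK_SIZE):
                    if chunk:  # skip keep-alive chunks
                        temp_file.write(chunk)
                break
            except Exception as e:
                # Exponential backoff; gives up once the retry budget is spent.
                retry = retry.increment('GET', url, error=e)
                retry.sleep()
    os.replace(temp_file.name, target_path)
```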


+1 -1 modelscope/hub/git.py

@@ -94,7 +94,7 @@ class GitCommandWrapper(metaclass=Singleton):
return False

def git_lfs_install(self, repo_dir):
cmd = ['git', '-C', repo_dir, 'lfs', 'install']
cmd = ['-C', repo_dir, 'lfs', 'install']
try:
self._run_git_command(*cmd)
return True


+1 -0 modelscope/hub/repository.py

@@ -140,6 +140,7 @@ class Repository:
raise InvalidParameter(msg)
if message is None or message == '':
msg = 'We use annotated tag, therefore message cannot None or empty.'
raise InvalidParameter(msg)
self.git_wrapper.tag(
repo_dir=self.model_dir,
tag_name=tag_name,


+41 -0 modelscope/metainfo.py

@@ -36,14 +36,20 @@ class Models(object):
swinL_semantic_segmentation = 'swinL-semantic-segmentation'
vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
text_driven_segmentation = 'text-driven-segmentation'
newcrfs_depth_estimation = 'newcrfs-depth-estimation'
resnet50_bert = 'resnet50-bert'
referring_video_object_segmentation = 'swinT-referring-video-object-segmentation'
fer = 'fer'
fairface = 'fairface'
retinaface = 'retinaface'
shop_segmentation = 'shop-segmentation'
mogface = 'mogface'
mtcnn = 'mtcnn'
ulfd = 'ulfd'
arcface = 'arcface'
facemask = 'facemask'
flc = 'flc'
tinymog = 'tinymog'
video_inpainting = 'video-inpainting'
human_wholebody_keypoint = 'human-wholebody-keypoint'
hand_static = 'hand-static'
@@ -51,6 +57,7 @@ class Models(object):
face_emotion = 'face-emotion'
product_segmentation = 'product-segmentation'
image_body_reshaping = 'image-body-reshaping'
video_human_matting = 'video-human-matting'

# EasyCV models
yolox = 'YOLOX'
@@ -71,6 +78,7 @@ class Models(object):
space_T_en = 'space-T-en'
space_T_cn = 'space-T-cn'
tcrf = 'transformer-crf'
token_classification_for_ner = 'token-classification-for-ner'
tcrf_wseg = 'transformer-crf-for-word-segmentation'
transformer_softmax = 'transformer-softmax'
lcrf = 'lstm-crf'
@@ -78,13 +86,17 @@ class Models(object):
gcnncrf = 'gcnn-crf'
bart = 'bart'
gpt3 = 'gpt3'
gpt_moe = 'gpt-moe'
gpt_neo = 'gpt-neo'
plug = 'plug'
bert_for_ds = 'bert-for-document-segmentation'
ponet_for_ds = 'ponet-for-document-segmentation'
ponet = 'ponet'
T5 = 'T5'
mglm = 'mglm'
codegeex = 'codegeex'
bloom = 'bloom'
unite = 'unite'

# audio models
sambert_hifigan = 'sambert-hifigan'
@@ -92,6 +104,7 @@ class Models(object):
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
kws_kwsbp = 'kws-kwsbp'
generic_asr = 'generic-asr'
wenet_asr = 'wenet-asr'

# multi-modal models
ofa = 'ofa'
@@ -150,6 +163,8 @@ class Pipelines(object):
image_denoise = 'nafnet-image-denoise'
person_image_cartoon = 'unet-person-image-cartoon'
ocr_detection = 'resnet18-ocr-detection'
table_recognition = 'dla34-table-recognition'
license_plate_detection = 'resnet18-license-plate-detection'
action_recognition = 'TAdaConv_action-recognition'
animal_recognition = 'resnet101-animal-recognition'
general_recognition = 'resnet101-general-recognition'
@@ -164,17 +179,23 @@ class Pipelines(object):
easycv_segmentation = 'easycv-segmentation'
face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment'
salient_detection = 'u2net-salient-detection'
salient_boudary_detection = 'res2net-salient-detection'
camouflaged_detection = 'res2net-camouflaged-detection'
image_classification = 'image-classification'
face_detection = 'resnet-face-detection-scrfd10gkps'
card_detection = 'resnet-card-detection-scrfd34gkps'
ulfd_face_detection = 'manual-face-detection-ulfd'
tinymog_face_detection = 'manual-face-detection-tinymog'
facial_expression_recognition = 'vgg19-facial-expression-recognition-fer'
facial_landmark_confidence = 'manual-facial-landmark-confidence-flcm'
face_attribute_recognition = 'resnet34-face-attribute-recognition-fairface'
retina_face_detection = 'resnet50-face-detection-retinaface'
mog_face_detection = 'resnet101-face-detection-cvpr22papermogface'
mtcnn_face_detection = 'manual-face-detection-mtcnn'
live_category = 'live-category'
general_image_classification = 'vit-base_image-classification_ImageNet-labels'
daily_image_classification = 'vit-base_image-classification_Dailylife-labels'
nextvit_small_daily_image_classification = 'nextvit-small_image-classification_Dailylife-labels'
image_color_enhance = 'csrnet-image-color-enhance'
virtual_try_on = 'virtual-try-on'
image_colorization = 'unet-image-colorization'
@@ -185,6 +206,8 @@ class Pipelines(object):
realtime_object_detection = 'cspnet_realtime-object-detection_yolox'
realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo'
face_recognition = 'ir101-face-recognition-cfglint'
arc_face_recognition = 'ir50-face-recognition-arcface'
mask_face_recognition = 'resnet-face-recognition-facemask'
image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
image2image_translation = 'image-to-image-translation'
live_category = 'live-category'
@@ -203,6 +226,7 @@ class Pipelines(object):
video_summarization = 'googlenet_pgl_video_summarization'
language_guided_video_summarization = 'clip-it-video-summarization'
image_semantic_segmentation = 'image-semantic-segmentation'
image_depth_estimation = 'image-depth-estimation'
image_reid_person = 'passvitb-image-reid-person'
image_inpainting = 'fft-inpainting'
text_driven_segmentation = 'text-driven-segmentation'
@@ -217,6 +241,7 @@ class Pipelines(object):
product_segmentation = 'product-segmentation'
image_body_reshaping = 'flow-based-body-reshaping'
referring_video_object_segmentation = 'referring-video-object-segmentation'
video_human_matting = 'video-human-matting'

# nlp tasks
automatic_post_editing = 'automatic-post-editing'
@@ -246,6 +271,7 @@ class Pipelines(object):
text_error_correction = 'text-error-correction'
plug_generation = 'plug-generation'
gpt3_generation = 'gpt3-generation'
gpt_moe_generation = 'gpt-moe-generation'
faq_question_answering = 'faq-question-answering'
conversational_text_to_sql = 'conversational-text-to-sql'
table_question_answering_pipeline = 'table-question-answering-pipeline'
@@ -253,12 +279,16 @@ class Pipelines(object):
text_ranking = 'text-ranking'
relation_extraction = 'relation-extraction'
document_segmentation = 'document-segmentation'
extractive_summarization = 'extractive-summarization'
feature_extraction = 'feature-extraction'
mglm_text_summarization = 'mglm-text-summarization'
codegeex_code_translation = 'codegeex-code-translation'
codegeex_code_generation = 'codegeex-code-generation'
translation_en_to_de = 'translation_en_to_de' # keep it underscore
translation_en_to_ro = 'translation_en_to_ro' # keep it underscore
translation_en_to_fr = 'translation_en_to_fr' # keep it underscore
token_classification = 'token-classification'
translation_evaluation = 'translation-evaluation'

# audio tasks
sambert_hifigan_tts = 'sambert-hifigan-tts'
@@ -267,6 +297,7 @@ class Pipelines(object):
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
kws_kwsbp = 'kws-kwsbp'
asr_inference = 'asr-inference'
asr_wenet_inference = 'asr-wenet-inference'

# multi-modal tasks
image_captioning = 'image-captioning'
@@ -280,6 +311,7 @@ class Pipelines(object):
video_multi_modal_embedding = 'video-multi-modal-embedding'
image_text_retrieval = 'image-text-retrieval'
ofa_ocr_recognition = 'ofa-ocr-recognition'
ofa_asr = 'ofa-asr'

# science tasks
protein_structure = 'unifold-protein-structure'
@@ -313,6 +345,7 @@ class Trainers(object):
image_inpainting = 'image-inpainting'
referring_video_object_segmentation = 'referring-video-object-segmentation'
image_classification_team = 'image-classification-team'
image_classification = 'image-classification'

# nlp trainers
bert_sentiment_analysis = 'bert-sentiment-analysis'
@@ -322,6 +355,8 @@ class Trainers(object):
nlp_veco_trainer = 'nlp-veco-trainer'
nlp_text_ranking_trainer = 'nlp-text-ranking-trainer'
text_generation_trainer = 'text-generation-trainer'
nlp_plug_trainer = 'nlp-plug-trainer'
gpt3_trainer = 'nlp-gpt3-trainer'

# audio trainers
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
@@ -347,6 +382,7 @@ class Preprocessors(object):
image_portrait_enhancement_preprocessor = 'image-portrait-enhancement-preprocessor'
video_summarization_preprocessor = 'video-summarization-preprocessor'
movie_scene_segmentation_preprocessor = 'movie-scene-segmentation-preprocessor'
image_classification_bypass_preprocessor = 'image-classification-bypass-preprocessor'

# nlp preprocessor
sen_sim_tokenizer = 'sen-sim-tokenizer'
@@ -383,6 +419,7 @@ class Preprocessors(object):
feature_extraction = 'feature-extraction'
mglm_summarization = 'mglm-summarization'
sentence_piece = 'sentence-piece'
translation_evaluation = 'translation-evaluation-preprocessor'

# audio preprocessor
linear_aec_fbank = 'linear-aec-fbank'
@@ -484,6 +521,10 @@ class Hooks(object):
# CLIP logit_scale clamp
ClipClampLogitScaleHook = 'ClipClampLogitScaleHook'

# train
EarlyStopHook = 'EarlyStopHook'
DeepspeedHook = 'DeepspeedHook'


class LR_Schedulers(object):
"""learning rate scheduler is defined here


+2 -2 modelscope/metrics/image_denoise_metric.py

@@ -86,7 +86,7 @@ def calculate_psnr(img1, img2, crop_border, input_order='HWC'):
"""

assert img1.shape == img2.shape, (
f'Image shapes are differnet: {img1.shape}, {img2.shape}.')
f'Image shapes are different: {img1.shape}, {img2.shape}.')
if input_order not in ['HWC', 'CHW']:
raise ValueError(
f'Wrong input_order {input_order}. Supported input_orders are '
@@ -141,7 +141,7 @@ def calculate_ssim(img1, img2, crop_border, input_order='HWC', ssim3d=True):
"""

assert img1.shape == img2.shape, (
f'Image shapes are differnet: {img1.shape}, {img2.shape}.')
f'Image shapes are different: {img1.shape}, {img2.shape}.')
if input_order not in ['HWC', 'CHW']:
raise ValueError(
f'Wrong input_order {input_order}. Supported input_orders are '


+14 -5 modelscope/metrics/sequence_classification_metric.py

@@ -19,18 +19,27 @@ from .builder import METRICS, MetricKeys
class SequenceClassificationMetric(Metric):
"""The metric computation class for sequence classification tasks.

This metric class calculates accuracy of the whole input batches.
This metric class calculates accuracy/F1 of all the input batches.

Args:
label_name: The key of label column in the 'inputs' arg.
logit_name: The key of logits column in the 'inputs' arg.
"""

def __init__(self, *args, **kwargs):
def __init__(self,
label_name=OutputKeys.LABELS,
logit_name=OutputKeys.LOGITS,
*args,
**kwargs):
super().__init__(*args, **kwargs)
self.preds = []
self.labels = []
self.label_name = label_name
self.logit_name = logit_name

def add(self, outputs: Dict, inputs: Dict):
label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS
ground_truths = inputs[label_name]
eval_results = outputs[OutputKeys.LOGITS]
ground_truths = inputs[self.label_name]
eval_results = outputs[self.logit_name]
self.preds.append(
torch_nested_numpify(torch_nested_detach(eval_results)))
self.labels.append(
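
A hypothetical usage snippet for the configurable column names introduced above; it assumes the surrounding ModelScope classes are importable and that OutputKeys.LABELS / OutputKeys.LOGITS match the keys used in the batch dicts, and the tensor values are made up.

```python
# Hypothetical illustration of the label_name/logit_name arguments added above.
import torch

from modelscope.metrics.sequence_classification_metric import \
    SequenceClassificationMetric
from modelscope.outputs import OutputKeys

metric = SequenceClassificationMetric(
    label_name=OutputKeys.LABELS, logit_name=OutputKeys.LOGITS)
outputs = {OutputKeys.LOGITS: torch.tensor([[0.1, 2.3], [1.7, 0.2]])}
inputs = {OutputKeys.LABELS: torch.tensor([1, 0])}
metric.add(outputs, inputs)  # stores detached numpy copies of logits and labels
```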


+10 -4 modelscope/metrics/text_generation_metric.py

@@ -18,16 +18,22 @@ class TextGenerationMetric(Metric):
"""The metric computation class for text generation classes.

This metric class calculates F1 of the rouge scores for the whole evaluation dataset.

Args:
target_text: The key of the target text column in the `inputs` arg.
pred_text: The key of the predicted text column in the `outputs` arg.
"""

def __init__(self):
def __init__(self, target_text='tgts', pred_text='preds'):
self.preds: List[str] = []
self.tgts: List[str] = []
self.rouge = Rouge()
self.target_text = target_text
self.pred_text = pred_text

def add(self, outputs: Dict[str, List[str]], inputs: Dict[str, List[str]]):
ground_truths = inputs['tgts']
eval_results = outputs['preds']
ground_truths = inputs[self.target_text]
eval_results = outputs[self.pred_text]
for truth in ground_truths:
self.tgts.append(rebuild_chinese_str(truth))
for result in eval_results:
@@ -38,7 +44,7 @@ class TextGenerationMetric(Metric):
def remove_useless(string: str) -> str:
return string.replace(' ', '').replace('.', '')

return remove_useless(pred) and remove_useless(tgt)
return len(remove_useless(pred)) != 0 and len(remove_useless(tgt)) != 0

def evaluate(self):
assert self.preds, 'preds in TextGenerationMetric must not be empty!'


+16 -10 modelscope/metrics/token_classification_metric.py

@@ -21,20 +21,16 @@ class TokenClassificationMetric(Metric):
This metric class uses seqeval to calculate the scores.

Args:
return_entity_level_metrics (bool, *optional*):
label_name(str, `optional`): The key of label column in the 'inputs' arg.
logit_name(str, `optional`): The key of logits column in the 'inputs' arg.
return_entity_level_metrics (bool, `optional`):
Whether to return every label's detail metrics, default False.
label2id(dict, `optional`): The label2id information to get the token labels.
"""

def add(self, outputs: Dict, inputs: Dict):
label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS
ground_truths = inputs[label_name]
eval_results = outputs[OutputKeys.LOGITS]
self.preds.append(
torch_nested_numpify(torch_nested_detach(eval_results)))
self.labels.append(
torch_nested_numpify(torch_nested_detach(ground_truths)))

def __init__(self,
label_name=OutputKeys.LABELS,
logit_name=OutputKeys.LOGITS,
return_entity_level_metrics=False,
label2id=None,
*args,
@@ -44,6 +40,16 @@ class TokenClassificationMetric(Metric):
self.preds = []
self.labels = []
self.label2id = label2id
self.label_name = label_name
self.logit_name = logit_name

def add(self, outputs: Dict, inputs: Dict):
ground_truths = inputs[self.label_name]
eval_results = outputs[self.logit_name]
self.preds.append(
torch_nested_numpify(torch_nested_detach(eval_results)))
self.labels.append(
torch_nested_numpify(torch_nested_detach(ground_truths)))

def evaluate(self):
label2id = self.label2id


+38 -0 modelscope/models/audio/asr/wenet_automatic_speech_recognition.py

@@ -0,0 +1,38 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict

import json
import wenetruntime as wenet

from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Tasks

__all__ = ['WeNetAutomaticSpeechRecognition']


@MODELS.register_module(
Tasks.auto_speech_recognition, module_name=Models.wenet_asr)
class WeNetAutomaticSpeechRecognition(Model):

def __init__(self, model_dir: str, am_model_name: str,
model_config: Dict[str, Any], *args, **kwargs):
"""initialize the info of model.

Args:
model_dir (str): the model path.
"""
super().__init__(model_dir, am_model_name, model_config, *args,
**kwargs)
self.decoder = wenet.Decoder(model_dir, lang='chs')

def forward(self, inputs: Dict[str, Any]) -> str:
if inputs['audio_format'] == 'wav':
rst = self.decoder.decode_wav(inputs['audio'])
else:
rst = self.decoder.decode(inputs['audio'])
text = json.loads(rst)['nbest'][0]['sentence']
return {'text': text}
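
For context, a minimal sketch of how the runtime wrapped by the new class is driven; it assumes the wenetruntime package is installed and '/path/to/wenet_model_dir' stands in for a downloaded model directory.

```python
# Minimal sketch mirroring WeNetAutomaticSpeechRecognition.forward above;
# '/path/to/wenet_model_dir' is a placeholder for a local WeNet model.
import json

import wenetruntime as wenet

decoder = wenet.Decoder('/path/to/wenet_model_dir', lang='chs')
result = decoder.decode_wav('data/test/audios/asr_example_ofa.wav')
text = json.loads(result)['nbest'][0]['sentence']  # best hypothesis
print(text)
```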

+27 -18 modelscope/models/base/base_model.py

@@ -5,10 +5,11 @@ from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, Union

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models.builder import MODELS, build_model
from modelscope.utils.checkpoint import save_checkpoint, save_pretrained
from modelscope.models.builder import build_model
from modelscope.utils.checkpoint import (save_checkpoint, save_configuration,
save_pretrained)
from modelscope.utils.config import Config
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile, Tasks
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, ModelFile
from modelscope.utils.device import verify_device
from modelscope.utils.logger import get_logger

@@ -94,6 +95,10 @@ class Model(ABC):
if prefetched is not None:
kwargs.pop('model_prefetched')

invoked_by = kwargs.get(Invoke.KEY)
if invoked_by is not None:
kwargs.pop(Invoke.KEY)

if osp.exists(model_name_or_path):
local_model_dir = model_name_or_path
else:
@@ -101,7 +106,13 @@ class Model(ABC):
raise RuntimeError(
'Expecting model is pre-fetched locally, but is not found.'
)
local_model_dir = snapshot_download(model_name_or_path, revision)

if invoked_by is not None:
invoked_by = '%s/%s' % (Invoke.KEY, invoked_by)
else:
invoked_by = '%s/%s' % (Invoke.KEY, Invoke.PRETRAINED)
local_model_dir = snapshot_download(
model_name_or_path, revision, user_agent=invoked_by)
logger.info(f'initialize model from {local_model_dir}')
if cfg_dict is not None:
cfg = cfg_dict
@@ -119,11 +130,9 @@ class Model(ABC):
model_cfg[k] = v
if device is not None:
model_cfg.device = device
model = build_model(
model_cfg, task_name=task_name, default_args=kwargs)
model = build_model(model_cfg, task_name=task_name)
else:
model = build_model(
model_cfg, task_name=task_name, default_args=kwargs)
model = build_model(model_cfg, task_name=task_name)

# dynamically add pipeline info to model for pipeline inference
if hasattr(cfg, 'pipeline'):
@@ -132,7 +141,9 @@ class Model(ABC):
if not hasattr(model, 'cfg'):
model.cfg = cfg

model_cfg.pop('model_dir', None)
model.name = model_name_or_path
model.model_dir = local_model_dir
return model

def save_pretrained(self,
@@ -140,6 +151,7 @@ class Model(ABC):
save_checkpoint_names: Union[str, List[str]] = None,
save_function: Callable = save_checkpoint,
config: Optional[dict] = None,
save_config_function: Callable = save_configuration,
**kwargs):
"""save the pretrained model, its configuration and other related files to a directory,
so that it can be re-loaded
@@ -157,18 +169,15 @@ class Model(ABC):
config (Optional[dict], optional):
The config for the configuration.json, might not be identical with model.config

save_config_function (Callble, optional):
The function to use to save the configuration.

"""
if config is None and hasattr(self, 'cfg'):
config = self.cfg
assert config is not None, 'Cannot save the model because the model config is empty.'
if isinstance(config, Config):
config = config.to_dict()
if 'preprocessor' in config and config['preprocessor'] is not None:
if 'mode' in config['preprocessor']:
config['preprocessor']['mode'] = 'inference'
elif 'val' in config['preprocessor'] and 'mode' in config[
'preprocessor']['val']:
config['preprocessor']['val']['mode'] = 'inference'

if config is not None:
save_config_function(target_folder, config)

save_pretrained(self, target_folder, save_checkpoint_names,
save_function, config, **kwargs)
save_function, **kwargs)

+1 -1 modelscope/models/base/base_torch_head.py

@@ -6,7 +6,7 @@ import torch
from modelscope.models.base.base_head import Head
from modelscope.utils.logger import get_logger

logger = get_logger(__name__)
logger = get_logger()


class TorchHead(Head, torch.nn.Module):


+2 -1 modelscope/models/base/base_torch_model.py

@@ -6,10 +6,11 @@ import torch
from torch import nn

from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.hub import parse_label_mapping
from modelscope.utils.logger import get_logger
from .base_model import Model

logger = get_logger(__name__)
logger = get_logger()


class TorchModel(Model, torch.nn.Module):


+42 -32 modelscope/models/cv/action_detection/action_detection_onnx.py

@@ -5,11 +5,14 @@ import os.path as osp
import shutil
import subprocess
import uuid
from tempfile import TemporaryDirectory
from urllib.parse import urlparse

import cv2
import numpy as np
import onnxruntime as rt

from modelscope.hub.file_download import http_get_file
from modelscope.models import Model
from modelscope.utils.constant import Devices
from modelscope.utils.device import verify_device
@@ -22,8 +25,9 @@ class ActionDetONNX(Model):
model_file = osp.join(config['model_file'])
device_type, device_id = verify_device(self._device_name)
options = rt.SessionOptions()
options.intra_op_num_threads = 1
options.inter_op_num_threads = 1
op_num_threads = config.get('op_num_threads', 1)
options.intra_op_num_threads = op_num_threads
options.inter_op_num_threads = op_num_threads
if device_type == Devices.gpu:
sess = rt.InferenceSession(
model_file,
@@ -84,37 +88,43 @@ class ActionDetONNX(Model):

def forward_video(self, video_name, scale):
min_size, max_size = self._get_sizes(scale)

tmp_dir = osp.join(
self.tmp_dir,
str(uuid.uuid1()) + '_' + osp.basename(video_name)[:-4])
if osp.exists(tmp_dir):
shutil.rmtree(tmp_dir)
os.makedirs(tmp_dir)
url_parsed = urlparse(video_name)
frame_rate = 2
cmd = f'ffmpeg -y -loglevel quiet -ss 0 -t {self.video_length_limit}' + \
f' -i {video_name} -r {frame_rate} -f image2 {tmp_dir}/%06d.jpg'

cmd = cmd.split(' ')
subprocess.call(cmd)

frame_names = [
osp.join(tmp_dir, name) for name in sorted(os.listdir(tmp_dir))
if name.endswith('.jpg')
]
frame_names = [
frame_names[i:i + frame_rate * 2]
for i in range(0,
len(frame_names) - frame_rate * 2 + 1, frame_rate
* self.temporal_stride)
]
timestamp = list(
range(1,
len(frame_names) * self.temporal_stride,
self.temporal_stride))
batch_imgs = [self.parse_frames(names) for names in frame_names]
shutil.rmtree(tmp_dir)

with TemporaryDirectory() as temporary_cache_dir:
if url_parsed.scheme in ('file', '') and osp.exists(
url_parsed.path):
local_video_name = video_name
else:
random_str = str(uuid.uuid1())
http_get_file(
url=video_name,
local_dir=temporary_cache_dir,
file_name=random_str,
headers={},
cookies=None)
local_video_name = osp.join(temporary_cache_dir, random_str)
cmd = f'ffmpeg -y -loglevel quiet -ss 0 -t {self.video_length_limit}' + \
f' -i {local_video_name} -r {frame_rate} -f' + \
f' image2 {temporary_cache_dir}/%06d_out.jpg'
cmd = cmd.split(' ')
subprocess.call(cmd)

frame_names = [
osp.join(temporary_cache_dir, name)
for name in sorted(os.listdir(temporary_cache_dir))
if name.endswith('_out.jpg')
]
frame_names = [
frame_names[i:i + frame_rate * 2]
for i in range(0,
len(frame_names) - frame_rate * 2
+ 1, frame_rate * self.temporal_stride)
]
timestamp = list(
range(1,
len(frame_names) * self.temporal_stride,
self.temporal_stride))
batch_imgs = [self.parse_frames(names) for names in frame_names]
N, _, T, H, W = batch_imgs[0].shape
scale_min = min_size / min(H, W)
h, w = min(int(scale_min * H),


+2 -2 modelscope/models/cv/body_3d_keypoints/body_3d_pose.py

@@ -224,8 +224,8 @@ class BodyKeypointsDetection3D(TorchModel):
lst_pose2d_cannoical.append(pose2d_canonical[:,
i - pad:i + pad + 1])

input_pose2d_rr = torch.concat(lst_pose2d_cannoical, axis=0)
input_pose2d_cannoical = torch.concat(lst_pose2d_cannoical, axis=0)
input_pose2d_rr = torch.cat(lst_pose2d_cannoical, axis=0)
input_pose2d_cannoical = torch.cat(lst_pose2d_cannoical, axis=0)

if self.cfg.model.MODEL.USE_CANONICAL_COORDS:
input_pose2d_abs = input_pose2d_cannoical.clone()


+20 -0 modelscope/models/cv/face_attribute_recognition/__init__.py

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .fair_face import FaceAttributeRecognition

else:
_import_structure = {'fair_face': ['FaceAttributeRecognition']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+2 -0 modelscope/models/cv/face_attribute_recognition/fair_face/__init__.py

@@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .face_attribute_recognition import FaceAttributeRecognition

+79 -0 modelscope/models/cv/face_attribute_recognition/fair_face/face_attribute_recognition.py

@@ -0,0 +1,79 @@
# The implementation is based on FairFace, available at
# https://github.com/dchen236/FairFace
import os

import cv2
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from PIL import Image
from torch.autograd import Variable
from torchvision import datasets, models, transforms

from modelscope.metainfo import Models
from modelscope.models.base import Tensor, TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks


@MODELS.register_module(
Tasks.face_attribute_recognition, module_name=Models.fairface)
class FaceAttributeRecognition(TorchModel):

def __init__(self, model_path, device='cuda'):
super().__init__(model_path)
cudnn.benchmark = True
self.model_path = model_path
self.device = device
self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE,
ModelFile.CONFIGURATION)
fair_face = torchvision.models.resnet34(pretrained=False)
fair_face.fc = nn.Linear(fair_face.fc.in_features, 18)
self.net = fair_face
self.load_model()
self.net = self.net.to(device)
self.trans = transforms.Compose([
transforms.ToPILImage(),
transforms.Resize((224, 224)),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def load_model(self, load_to_cpu=False):
pretrained_dict = torch.load(
self.model_path, map_location=torch.device('cpu'))
self.net.load_state_dict(pretrained_dict, strict=True)
self.net.eval()

def forward(self, img):
""" FariFace model forward process.

Args:
img: [h, w, c]

Return:
list of attribute result: [gender_score, age_score]
"""
img = cv2.cvtColor(img.cpu().numpy(), cv2.COLOR_BGR2RGB)
img = img.astype(np.uint8)

inputs = self.trans(img)

c, h, w = inputs.shape

inputs = inputs.view(-1, c, h, w)
inputs = inputs.to(self.device)
inputs = Variable(inputs, volatile=True)
outputs = self.net(inputs)[0]

gender_outputs = outputs[7:9]
age_outputs = outputs[9:18]

gender_score = F.softmax(gender_outputs).detach().cpu().tolist()
age_score = F.softmax(age_outputs).detach().cpu().tolist()

return [gender_score, age_score]

+2 -1 modelscope/models/cv/face_detection/__init__.py

@@ -9,13 +9,14 @@ if TYPE_CHECKING:
from .retinaface import RetinaFaceDetection
from .ulfd_slim import UlfdFaceDetector
from .scrfd import ScrfdDetect
from .scrfd import TinyMogDetect
else:
_import_structure = {
'ulfd_slim': ['UlfdFaceDetector'],
'retinaface': ['RetinaFaceDetection'],
'mtcnn': ['MtcnnFaceDetector'],
'mogface': ['MogFaceDetector'],
'scrfd': ['ScrfdDetect']
'scrfd': ['TinyMogDetect', 'ScrfdDetect'],
}

import sys


+1 -0  modelscope/models/cv/face_detection/scrfd/__init__.py

@@ -1,2 +1,3 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .scrfd_detect import ScrfdDetect
from .tinymog_detect import TinyMogDetect

+2 -1  modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/__init__.py

@@ -2,6 +2,7 @@
The implementation here is modified based on insightface, originally MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones
"""
from .mobilenet import MobileNetV1
from .resnet import ResNetV1e

__all__ = ['ResNetV1e']
__all__ = ['ResNetV1e', 'MobileNetV1']

+99 -0  modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/mobilenet.py

@@ -0,0 +1,99 @@
"""
The implementation here is modified based on insightface, originally MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones/mobilenet.py
"""

import torch
import torch.nn as nn
from mmcv.cnn import (build_conv_layer, build_norm_layer, build_plugin_layer,
constant_init, kaiming_init)
from mmcv.runner import load_checkpoint
from mmdet.models.builder import BACKBONES
from mmdet.utils import get_root_logger
from torch.nn.modules.batchnorm import _BatchNorm


@BACKBONES.register_module()
class MobileNetV1(nn.Module):

def __init__(self,
in_channels=3,
block_cfg=None,
num_stages=4,
out_indices=(0, 1, 2, 3)):
super(MobileNetV1, self).__init__()
self.out_indices = out_indices

def conv_bn(inp, oup, stride):
return nn.Sequential(
nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
nn.BatchNorm2d(oup), nn.ReLU(inplace=True))

def conv_dw(inp, oup, stride):
return nn.Sequential(
nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
nn.BatchNorm2d(inp),
nn.ReLU(inplace=True),
nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
nn.ReLU(inplace=True),
)

if block_cfg is None:
stage_planes = [8, 16, 32, 64, 128, 256]
stage_blocks = [2, 4, 4, 2]
else:
stage_planes = block_cfg['stage_planes']
stage_blocks = block_cfg['stage_blocks']
assert len(stage_planes) == 6
assert len(stage_blocks) == 4
self.stem = nn.Sequential(
conv_bn(3, stage_planes[0], 2),
conv_dw(stage_planes[0], stage_planes[1], 1),
)
self.stage_layers = []
for i, num_blocks in enumerate(stage_blocks):
_layers = []
for n in range(num_blocks):
if n == 0:
_layer = conv_dw(stage_planes[i + 1], stage_planes[i + 2],
2)
else:
_layer = conv_dw(stage_planes[i + 2], stage_planes[i + 2],
1)
_layers.append(_layer)

_block = nn.Sequential(*_layers)
layer_name = f'layer{i + 1}'
self.add_module(layer_name, _block)
self.stage_layers.append(layer_name)

def forward(self, x):
output = []
x = self.stem(x)
for i, layer_name in enumerate(self.stage_layers):
stage_layer = getattr(self, layer_name)
x = stage_layer(x)
if i in self.out_indices:
output.append(x)

return tuple(output)

def init_weights(self, pretrained=None):
"""Initialize the weights in backbone.

Args:
pretrained (str, optional): Path to pre-trained weights.
Defaults to None.
"""
if isinstance(pretrained, str):
logger = get_root_logger()
load_checkpoint(self, pretrained, strict=False, logger=logger)
elif pretrained is None:
for m in self.modules():
if isinstance(m, nn.Conv2d):
kaiming_init(m)
elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
constant_init(m, 1)
else:
raise TypeError('pretrained must be a str or None')
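As a quick sanity check on the stage wiring above (random weights; mmcv and mmdet must be installed since they are imported at module level), the backbone returns one feature map per stage, at strides 4, 8, 16 and 32:

    import torch

    from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import MobileNetV1

    net = MobileNetV1()  # default block_cfg: planes [8, 16, 32, 64, 128, 256], blocks [2, 4, 4, 2]
    net.init_weights()
    with torch.no_grad():
        feats = net(torch.randn(1, 3, 640, 640))
    print([f.shape for f in feats])
    # [1, 32, 160, 160], [1, 64, 80, 80], [1, 128, 40, 40], [1, 256, 20, 20]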

+2 -1  modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/__init__.py

@@ -3,5 +3,6 @@ The implementation here is modified based on insightface, originally MIT license
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors
"""
from .scrfd import SCRFD
from .tinymog import TinyMog

__all__ = ['SCRFD']
__all__ = ['SCRFD', 'TinyMog']

+148 -0  modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/tinymog.py

@@ -0,0 +1,148 @@
"""
The implementation here is modified based on insightface, originally MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors/scrfd.py
"""
import numpy as np
import torch
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors.single_stage import SingleStageDetector

from ....mmdet_patch.core.bbox import bbox2result


@DETECTORS.register_module()
class TinyMog(SingleStageDetector):

def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(TinyMog, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)

def forward_train(self,
img,
img_metas,
gt_bboxes,
gt_labels,
gt_keypointss=None,
gt_bboxes_ignore=None):
"""
Args:
img (Tensor): Input images of shape (N, C, H, W).
Typically these should be mean centered and std scaled.
img_metas (list[dict]): A List of image info dict where each dict
has: 'img_shape', 'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
:class:`mmdet.datasets.pipelines.Collect`.
gt_bboxes (list[Tensor]): Each item are the truth boxes for each
image in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): Class indices corresponding to each box
gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
boxes can be ignored when computing the loss.

Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
super(SingleStageDetector, self).forward_train(img, img_metas)
x = self.extract_feat(img)
losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes,
gt_labels, gt_keypointss,
gt_bboxes_ignore)
return losses

def simple_test(self,
img,
img_metas,
rescale=False,
repeat_head=1,
output_kps_var=0,
output_results=1):
"""Test function without test time augmentation.

Args:
img (torch.Tensor): Input image tensor.
img_metas (list[dict]): List of image information.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
repeat_head (int): repeat inference times in head
output_kps_var (int): whether output kps var to calculate quality
output_results (int): 0: nothing 1: bbox 2: both bbox and kps

Returns:
list[list[np.ndarray]]: BBox results of each image and classes.
The outer list corresponds to each image. The inner list
corresponds to each class.
"""
x = self.extract_feat(img)
assert repeat_head >= 1
kps_out0 = []
kps_out1 = []
kps_out2 = []
for i in range(repeat_head):
outs = self.bbox_head(x)
kps_out0 += [outs[2][0].detach().cpu().numpy()]
kps_out1 += [outs[2][1].detach().cpu().numpy()]
kps_out2 += [outs[2][2].detach().cpu().numpy()]
if output_kps_var:
var0 = np.var(np.vstack(kps_out0), axis=0).mean()
var1 = np.var(np.vstack(kps_out1), axis=0).mean()
var2 = np.var(np.vstack(kps_out2), axis=0).mean()
var = np.mean([var0, var1, var2])
else:
var = None

if output_results > 0:
if torch.onnx.is_in_onnx_export():
cls_score, bbox_pred, kps_pred = outs
for c in cls_score:
print(c.shape)
for c in bbox_pred:
print(c.shape)
if self.bbox_head.use_kps:
for c in kps_pred:
print(c.shape)
return (cls_score, bbox_pred, kps_pred)
else:
return (cls_score, bbox_pred)
bbox_list = self.bbox_head.get_bboxes(
*outs, img_metas, rescale=rescale)

# return kps if use_kps
if len(bbox_list[0]) == 2:
bbox_results = [
bbox2result(det_bboxes, det_labels,
self.bbox_head.num_classes)
for det_bboxes, det_labels in bbox_list
]
elif len(bbox_list[0]) == 3:
if output_results == 2:
bbox_results = [
bbox2result(
det_bboxes,
det_labels,
self.bbox_head.num_classes,
kps=det_kps,
num_kps=self.bbox_head.NK)
for det_bboxes, det_labels, det_kps in bbox_list
]
elif output_results == 1:
bbox_results = [
bbox2result(det_bboxes, det_labels,
self.bbox_head.num_classes)
for det_bboxes, det_labels, _ in bbox_list
]
else:
bbox_results = None
if var is not None:
return bbox_results, var
else:
return bbox_results

def feature_test(self, img):
x = self.extract_feat(img)
outs = self.bbox_head(x)
return outs

+67 -0  modelscope/models/cv/face_detection/scrfd/tinymog_detect.py

@@ -0,0 +1,67 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp
from copy import deepcopy
from typing import Any, Dict

import torch

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['TinyMogDetect']


@MODELS.register_module(Tasks.face_detection, module_name=Models.tinymog)
class TinyMogDetect(TorchModel):

def __init__(self, model_dir, *args, **kwargs):
"""
initialize the tinymog face detection model from the `model_dir` path.
"""
super().__init__(model_dir)
from mmcv import Config
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint
from mmdet.models import build_detector
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD
cfg = Config.fromfile(osp.join(model_dir, 'mmcv_tinymog.py'))
ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
cfg.model.test_cfg.score_thr = kwargs.get('score_thr', 0.3)
detector = build_detector(cfg.model)
logger.info(f'loading model from {ckpt_path}')
load_checkpoint(detector, ckpt_path, map_location='cpu')
detector = MMDataParallel(detector)
detector.eval()
self.detector = detector
logger.info('load model done')

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
result = self.detector(
return_loss=False,
rescale=True,
img=[input['img'][0].unsqueeze(0)],
img_metas=[[dict(input['img_metas'][0].data)]],
output_results=2)
assert result is not None
result = result[0][0]
bboxes = result[:, :4].tolist()
kpss = result[:, 5:].tolist()
scores = result[:, 4].tolist()
return {
OutputKeys.SCORES: scores,
OutputKeys.BOXES: bboxes,
OutputKeys.KEYPOINTS: kpss
}

def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]:
return input
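In normal use the detector above is reached through the ModelScope pipeline API rather than instantiated directly; a rough sketch, where the model id and image path are placeholders to be taken from the actual model card:

    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    face_detection = pipeline(Tasks.face_detection, model='<tinymog-model-id>')  # placeholder id
    result = face_detection('path/to/test_face.jpg')                             # placeholder image
    print(result[OutputKeys.SCORES], result[OutputKeys.BOXES], result[OutputKeys.KEYPOINTS])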

+200 -0  modelscope/models/cv/face_recognition/torchkit/backbone/arcface_backbone.py

@@ -0,0 +1,200 @@
# The implementation is adopted from TFace, made publicly available under the Apache-2.0 license at
# https://github.com/deepinsight/insightface/blob/master/recognition/arcface_torch/backbones/iresnet.py
import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

using_ckpt = False


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
"""3x3 convolution with padding"""
return nn.Conv2d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=dilation,
groups=groups,
bias=False,
dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(
in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class IBasicBlock(nn.Module):
expansion = 1

def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1):
super(IBasicBlock, self).__init__()
if groups != 1 or base_width != 64:
raise ValueError(
'BasicBlock only supports groups=1 and base_width=64')
if dilation > 1:
raise NotImplementedError(
'Dilation > 1 not supported in BasicBlock')
self.bn1 = nn.BatchNorm2d(
inplanes,
eps=1e-05,
)
self.conv1 = conv3x3(inplanes, planes)
self.bn2 = nn.BatchNorm2d(
planes,
eps=1e-05,
)
self.prelu = nn.PReLU(planes)
self.conv2 = conv3x3(planes, planes, stride)
self.bn3 = nn.BatchNorm2d(
planes,
eps=1e-05,
)
self.downsample = downsample
self.stride = stride

def forward(self, x):
identity = x
out = self.bn1(x)
out = self.conv1(out)
out = self.bn2(out)
out = self.prelu(out)
out = self.conv2(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
return out


class IResNet(nn.Module):
fc_scale = 7 * 7

def __init__(self,
block,
layers,
dropout=0,
num_features=512,
zero_init_residual=False,
groups=1,
width_per_group=64,
replace_stride_with_dilation=None,
fp16=False):
super(IResNet, self).__init__()
self.extra_gflops = 0.0
self.fp16 = fp16
self.inplanes = 64
self.dilation = 1
if replace_stride_with_dilation is None:
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError('replace_stride_with_dilation should be None '
'or a 3-element tuple, got {}'.format(
replace_stride_with_dilation))
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(
3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05)
self.prelu = nn.PReLU(self.inplanes)
self.layer1 = self._make_layer(block, 64, layers[0], stride=2)
self.layer2 = self._make_layer(
block,
128,
layers[1],
stride=2,
dilate=replace_stride_with_dilation[0])
self.layer3 = self._make_layer(
block,
256,
layers[2],
stride=2,
dilate=replace_stride_with_dilation[1])
self.layer4 = self._make_layer(
block,
512,
layers[3],
stride=2,
dilate=replace_stride_with_dilation[2])
self.bn2 = nn.BatchNorm2d(
512 * block.expansion,
eps=1e-05,
)
self.dropout = nn.Dropout(p=dropout, inplace=True)
self.fc = nn.Linear(512 * block.expansion * self.fc_scale,
num_features)
self.features = nn.BatchNorm1d(num_features, eps=1e-05)
nn.init.constant_(self.features.weight, 1.0)
self.features.weight.requires_grad = False

for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.normal_(m.weight, 0, 0.1)
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)

if zero_init_residual:
for m in self.modules():
if isinstance(m, IBasicBlock):
nn.init.constant_(m.bn2.weight, 0)

def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
nn.BatchNorm2d(
planes * block.expansion,
eps=1e-05,
),
)
layers = []
layers.append(
block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(
block(
self.inplanes,
planes,
groups=self.groups,
base_width=self.base_width,
dilation=self.dilation))

return nn.Sequential(*layers)

def forward(self, x):
with torch.cuda.amp.autocast(self.fp16):
x = self.conv1(x)
x = self.bn1(x)
x = self.prelu(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.bn2(x)
x = torch.flatten(x, 1)
x = self.dropout(x)
x = self.fc(x.float() if self.fp16 else x)
x = self.features(x)
return x


def _iresnet(arch, layers):
model = IResNet(IBasicBlock, layers)
return model
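A short shape sketch for the backbone above; the [3, 4, 14, 3] layer counts are the usual iresnet50 configuration in the referenced repository and are taken here as an assumption. An aligned 112x112 face maps to a 512-d embedding:

    import torch

    model = _iresnet('iresnet50', [3, 4, 14, 3])  # assumed r50 layer configuration
    model.eval()
    with torch.no_grad():
        emb = model(torch.randn(2, 3, 112, 112))
    print(emb.shape)  # torch.Size([2, 512])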

+213 -0  modelscope/models/cv/face_recognition/torchkit/backbone/facemask_backbone.py

@@ -0,0 +1,213 @@
# The implementation is adopted from InsightFace, made publicly available under the Apache-2.0 license at
# https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py

from collections import namedtuple

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import (AdaptiveAvgPool2d, AvgPool2d, BatchNorm1d, BatchNorm2d,
Conv2d, Dropout, Dropout2d, Linear, MaxPool2d, Module,
Parameter, PReLU, ReLU, Sequential, Sigmoid)


class Flatten(Module):

def forward(self, input):
return input.view(input.size(0), -1)


class SEModule(Module):

def __init__(self, channels, reduction):
super(SEModule, self).__init__()
self.avg_pool = AdaptiveAvgPool2d(1)
self.fc1 = Conv2d(
channels,
channels // reduction,
kernel_size=1,
padding=0,
bias=False)
self.relu = ReLU(inplace=True)
self.fc2 = Conv2d(
channels // reduction,
channels,
kernel_size=1,
padding=0,
bias=False)
self.sigmoid = Sigmoid()

def forward(self, x):
module_input = x
x = self.avg_pool(x)
x = self.fc1(x)
x = self.relu(x)
x = self.fc2(x)
x = self.sigmoid(x)
return module_input * x


class BottleneckIR(Module):

def __init__(self, in_channel, depth, stride):
super(BottleneckIR, self).__init__()
if in_channel == depth:
self.shortcut_layer = MaxPool2d(1, stride)
else:
self.shortcut_layer = Sequential(
Conv2d(in_channel, depth, (1, 1), stride, bias=False),
BatchNorm2d(depth))
self.res_layer = Sequential(
BatchNorm2d(in_channel),
Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False),
PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False),
BatchNorm2d(depth))

def forward(self, x):
shortcut = self.shortcut_layer(x)
res = self.res_layer(x)
return res + shortcut


class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])):
'''A named tuple describing a ResNet block.'''


def get_block(in_channel, depth, num_units, stride=2):
return [Bottleneck(in_channel, depth, stride)
] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)]


def get_blocks(num_layers):
if num_layers == 50:
blocks = [
get_block(in_channel=64, depth=64, num_units=3),
get_block(in_channel=64, depth=128, num_units=4),
get_block(in_channel=128, depth=256, num_units=14),
get_block(in_channel=256, depth=512, num_units=3)
]
elif num_layers == 100:
blocks = [
get_block(in_channel=64, depth=64, num_units=3),
get_block(in_channel=64, depth=128, num_units=13),
get_block(in_channel=128, depth=256, num_units=30),
get_block(in_channel=256, depth=512, num_units=3)
]
elif num_layers == 152:
blocks = [
get_block(in_channel=64, depth=64, num_units=3),
get_block(in_channel=64, depth=128, num_units=8),
get_block(in_channel=128, depth=256, num_units=36),
get_block(in_channel=256, depth=512, num_units=3)
]
elif num_layers == 252:
blocks = [
get_block(in_channel=64, depth=64, num_units=6),
get_block(in_channel=64, depth=128, num_units=21),
get_block(in_channel=128, depth=256, num_units=66),
get_block(in_channel=256, depth=512, num_units=6)
]
return blocks


class IResNet(Module):

def __init__(self,
dropout=0,
num_features=512,
zero_init_residual=False,
groups=1,
width_per_group=64,
replace_stride_with_dilation=None,
fp16=False,
with_wcd=False,
wrs_M=400,
wrs_q=0.9):
super(IResNet, self).__init__()
num_layers = 252
mode = 'ir'
assert num_layers in [50, 100, 152, 252], \
    'num_layers should be 50, 100, 152 or 252'
assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se'
self.fc_scale = 7 * 7
num_features = 512
self.fp16 = fp16
drop_ratio = 0.0
self.with_wcd = with_wcd
if self.with_wcd:
self.wrs_M = wrs_M
self.wrs_q = wrs_q
blocks = get_blocks(num_layers)
if mode == 'ir':
unit_module = BottleneckIR
self.input_layer = Sequential(
Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64),
PReLU(64))
self.bn2 = nn.BatchNorm2d(
512,
eps=1e-05,
)
self.dropout = nn.Dropout(p=drop_ratio, inplace=True)
self.fc = nn.Linear(512 * self.fc_scale, num_features)
self.features = nn.BatchNorm1d(num_features, eps=1e-05)
nn.init.constant_(self.features.weight, 1.0)
self.features.weight.requires_grad = False

modules = []
for block in blocks:
for bottleneck in block:
modules.append(
unit_module(bottleneck.in_channel, bottleneck.depth,
bottleneck.stride))
self.body = Sequential(*modules)

def forward(self, x):
with torch.cuda.amp.autocast(self.fp16):
x = self.input_layer(x)
x = self.body(x)
x = self.bn2(x)
if self.with_wcd:
B = x.size()[0]
C = x.size()[1]
x_abs = torch.abs(x)
score = torch.nn.functional.adaptive_avg_pool2d(x_abs,
1).reshape(
(B, C))
r = torch.rand((B, C), device=x.device)
key = torch.pow(r, 1. / score)
_, topidx = torch.topk(key, self.wrs_M, dim=1)
mask = torch.zeros_like(key, dtype=torch.float32)
mask.scatter_(1, topidx, 1.)
maskq = torch.rand((B, C), device=x.device)
maskq_ones = torch.ones_like(maskq, dtype=torch.float32)
maskq_zeros = torch.zeros_like(maskq, dtype=torch.float32)
maskq_m = torch.where(maskq < self.wrs_q, maskq_ones,
maskq_zeros)
new_mask = mask * maskq_m
score_sum = torch.sum(score, dim=1, keepdim=True)
selected_score_sum = torch.sum(
new_mask * score, dim=1, keepdim=True)
alpha = score_sum / (selected_score_sum + 1e-6)
alpha = alpha.reshape((B, 1, 1, 1))
new_mask = new_mask.reshape((B, C, 1, 1))
x = x * new_mask * alpha
x = torch.flatten(x, 1)
x = self.dropout(x)
x = self.fc(x.float() if self.fp16 else x)
x = self.features(x)
return x


def iresnet286(pretrained=False, progress=True, **kwargs):
model = IResNet(
dropout=0,
num_features=512,
zero_init_residual=False,
groups=1,
width_per_group=64,
replace_stride_with_dilation=None,
fp16=False,
with_wcd=False,
wrs_M=400,
wrs_q=0.9)
return model
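Likewise, a minimal shape check for iresnet286 (random weights): it also consumes 112x112 aligned faces and emits 512-d features, and with with_wcd=True the weighted-channel-dropout branch re-weights feature channels before pooling.

    import torch

    net = iresnet286()
    net.eval()
    with torch.no_grad():
        feat = net(torch.randn(2, 3, 112, 112))
    print(feat.shape)  # torch.Size([2, 512])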

+20 -0  modelscope/models/cv/facial_landmark_confidence/__init__.py

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .flc import FacialLandmarkConfidence

else:
_import_structure = {'flc': ['FacialLandmarkConfidence']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+2 -0  modelscope/models/cv/facial_landmark_confidence/flc/__init__.py

@@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .facial_landmark_confidence import FacialLandmarkConfidence

+94 -0  modelscope/models/cv/facial_landmark_confidence/flc/facial_landmark_confidence.py

@@ -0,0 +1,94 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os

import cv2
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
from PIL import Image
from torch.autograd import Variable

from modelscope.metainfo import Models
from modelscope.models.base import Tensor, TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from .manual_landmark_net import LandmarkConfidence


@MODELS.register_module(
Tasks.facial_landmark_confidence, module_name=Models.flc)
class FacialLandmarkConfidence(TorchModel):

def __init__(self, model_path, device='cuda'):
super().__init__(model_path)
cudnn.benchmark = True
self.model_path = model_path
self.device = device
self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE,
ModelFile.CONFIGURATION)
self.landmark_count = 5
self.net = LandmarkConfidence(landmark_count=self.landmark_count)
self.load_model()
self.net = self.net.to(device)

def load_model(self, load_to_cpu=False):
pretrained_dict = torch.load(
self.model_path, map_location=torch.device('cpu'))['state_dict']
pretrained_dict['rp_net.binary_cls.weight'] = 32.0 * F.normalize(
pretrained_dict['rp_net.binary_cls.weight'], dim=1).t()
self.net.load_state_dict(pretrained_dict, strict=True)
self.net.eval()

def forward(self, input):
img_org = input['orig_img']
bbox = input['bbox']
img_org = img_org.cpu().numpy()

image_height = img_org.shape[0]
image_width = img_org.shape[1]
x1 = max(0, int(bbox[0]))
y1 = max(0, int(bbox[1]))
x2 = min(image_width, int(bbox[2]))
y2 = min(image_height, int(bbox[3]))
box_w = x2 - x1 + 1
box_h = y2 - y1 + 1
if box_h > box_w:
delta = box_h - box_w
dy = edy = 0
dx = delta // 2
edx = delta - dx
else:
dx = edx = 0
delta = box_w - box_h
dy = delta // 2
edy = delta - dy

cv_img = img_org[y1:y2, x1:x2]
if dx > 0 or dy > 0 or edx > 0 or edy > 0:
cv_img = cv2.copyMakeBorder(cv_img, dy, edy, dx, edx,
cv2.BORDER_CONSTANT, 0)
inter_x = cv_img.shape[1]
inter_y = cv_img.shape[0]

cv_img = cv2.resize(cv_img, (120, 120))

cv_img = cv_img.transpose((2, 0, 1))

input_blob = torch.from_numpy(cv_img[np.newaxis, :, :, :].astype(
np.float32))

tmp_conf_lms, tmp_feat, tmp_conf_resp, tmp_nose = self.net(
input_blob.to(self.device))
conf_lms = tmp_conf_lms.cpu().numpy().squeeze()
feat = tmp_feat.cpu().numpy().squeeze()

pts5pt = []
for i in range(feat.shape[0]):
if i < self.landmark_count:
pts5pt.append(feat[i] * inter_x - dx + x1)
else:
pts5pt.append(feat[i] * inter_y - dy + y1)

lm5pt = np.array(pts5pt).reshape(2, 5).T
return lm5pt, conf_lms

+152 -0  modelscope/models/cv/facial_landmark_confidence/flc/manual_landmark_net.py

@@ -0,0 +1,152 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import math

import torch
import torch.nn.functional as F
from torch.nn import (AdaptiveAvgPool2d, BatchNorm2d, Conv2d, Linear,
MaxPool2d, Module, Parameter, ReLU, Sequential)


class LandmarkConfidence(Module):

def __init__(self, landmark_count=5):
super(LandmarkConfidence, self).__init__()
self.landmark_net = LandmarkNetD(landmark_count)
self.landmark_net.eval()
self.cls_net = ClassNet()
self.cls_net.eval()
self.rp_net = RespiratorNet()

def forward(self, x):
feat, nose_feat, lms = self.landmark_net(x)
cls_respirator, nose = self.rp_net(feat, nose_feat)
confidence = self.cls_net(feat)
return confidence, lms, cls_respirator, nose


class FC(Module):

def __init__(self, feat_dim=256, num_class=2):
super(FC, self).__init__()
self.weight = Parameter(
torch.zeros(num_class, feat_dim, dtype=torch.float32))

def forward(self, x):
cos_theta = F.linear(x, self.weight)
return F.softmax(cos_theta, dim=1)


class Flatten(Module):

def forward(self, x):
return torch.flatten(x, 1)


class RespiratorNet(Module):

def __init__(self):
super(RespiratorNet, self).__init__()
self.conv1 = Sequential(
Conv2d(48, 48, 3, 2, 1), BatchNorm2d(48), ReLU(True))
self.conv2 = AdaptiveAvgPool2d(
(1, 1)
) # Sequential(Conv2d(48, 48, 5, 1, 0), BatchNorm2d(48), ReLU(True))
self.binary_cls = FC(feat_dim=48, num_class=2)
self.nose_layer = Sequential(
Conv2d(48, 64, 3, 1, 0), BatchNorm2d(64), ReLU(True),
Conv2d(64, 64, 3, 1, 0), BatchNorm2d(64), ReLU(True), Flatten(),
Linear(64, 96), ReLU(True), Linear(96, 6))

def train(self, mode=True):
self.conv1.train(mode)
self.conv2.train(mode)
# self.nose_feat.train(mode)
self.nose_layer.train(mode)
self.binary_cls.train(mode)

def forward(self, x, y):
x = self.conv1(x)
x = self.conv2(x)
cls = self.binary_cls(torch.flatten(x, 1))
# loc = self.nose_feat(y)
loc = self.nose_layer(y)
return cls, loc


class ClassNet(Module):

def __init__(self):
super(ClassNet, self).__init__()
self.conv1 = Sequential(
Conv2d(48, 48, 3, 1, 1), BatchNorm2d(48), ReLU(True))
self.conv2 = Sequential(
Conv2d(48, 54, 3, 2, 1), BatchNorm2d(54), ReLU(True))
self.conv3 = Sequential(
Conv2d(54, 54, 5, 1, 0), BatchNorm2d(54), ReLU(True))
self.fc1 = Sequential(Flatten(), Linear(54, 54), ReLU(True))
self.fc2 = Linear(54, 1)

def forward(self, x):
y = self.conv1(x)
y = self.conv2(y)
y = self.conv3(y)
y = self.fc1(y)
y = self.fc2(y)
return y


class LandmarkNetD(Module):

def __init__(self, landmark_count=5):
super(LandmarkNetD, self).__init__()
self.conv_pre = Sequential(
Conv2d(3, 16, 5, 2, 0), BatchNorm2d(16), ReLU(True))
self.pool_pre = MaxPool2d(2, 2) # output is 29

self.conv1 = Sequential(
Conv2d(16, 32, 3, 1, 1), BatchNorm2d(32), ReLU(True),
Conv2d(32, 32, 3, 1, 1), BatchNorm2d(32), ReLU(True))
self.pool1 = MaxPool2d(2, 2) # 14

self.conv2 = Sequential(
Conv2d(32, 48, 3, 1, 0), BatchNorm2d(48), ReLU(True),
Conv2d(48, 48, 3, 1, 0), BatchNorm2d(48), ReLU(True))
self.pool2 = MaxPool2d(2, 2) # 5

self.conv3 = Sequential(
Conv2d(48, 80, 3, 1, 0), BatchNorm2d(80), ReLU(True),
Conv2d(80, 80, 3, 1, 0), BatchNorm2d(80), ReLU(True))

self.fc1 = Sequential(Linear(80, 128), ReLU(True))
self.fc2 = Sequential(Linear(128, 128), ReLU(True))

self.output = Linear(128, landmark_count * 2)

def _initialize_weights(self):
for m in self.modules():
if isinstance(m, Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, Linear):
n = m.weight.size(1)
m.weight.data.normal_(0, 0.01)
m.bias.data.zero_()

def forward(self, x):
y = self.conv_pre(x)
y = self.pool_pre(y)
y = self.conv1(y)
y = self.pool1(y[:, :, :28, :28])
feat = self.conv2(y)
y2 = self.pool2(feat)
y = self.conv3(y2)
y = torch.flatten(y, 1)
y = self.fc1(y)
y = self.fc2(y)
y = self.output(y)
return feat, y2, y
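The caller in facial_landmark_confidence.py always resizes the face crop to 120x120 before invoking the network, so a quick shape sketch of the composite module (random weights, CPU) looks like this:

    import torch

    net = LandmarkConfidence(landmark_count=5)
    net.eval()
    with torch.no_grad():
        confidence, lms, cls_respirator, nose = net(torch.randn(1, 3, 120, 120))
    print(confidence.shape, lms.shape, cls_respirator.shape, nose.shape)
    # torch.Size([1, 1]) torch.Size([1, 10]) torch.Size([1, 2]) torch.Size([1, 6])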

+2 -0  modelscope/models/cv/image_classification/backbones/__init__.py

@@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .nextvit import NextViT

+541 -0  modelscope/models/cv/image_classification/backbones/nextvit.py

@@ -0,0 +1,541 @@
# Part of the implementation is borrowed and modified from Next-ViT,
# publicly available at https://github.com/bytedance/Next-ViT
import collections.abc
import itertools
import math
import os
import warnings
from functools import partial
from typing import Dict, Sequence

import torch
import torch.nn as nn
from einops import rearrange
from mmcls.models.backbones.base_backbone import BaseBackbone
from mmcls.models.builder import BACKBONES
from mmcv.cnn.bricks import DropPath, build_activation_layer, build_norm_layer
from mmcv.runner import BaseModule
from torch.nn.modules.batchnorm import _BatchNorm

NORM_EPS = 1e-5


def _no_grad_trunc_normal_(tensor, mean, std, a, b):
# Cut & paste from PyTorch official master until it's in a few official releases - RW
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
def norm_cdf(x):
# Computes standard normal cumulative distribution function
return (1. + math.erf(x / math.sqrt(2.))) / 2.

if (mean < a - 2 * std) or (mean > b + 2 * std):
warnings.warn(
'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. '
'The distribution of values may be incorrect.',
stacklevel=2)

with torch.no_grad():
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
# Get upper and lower cdf values
ll = norm_cdf((a - mean) / std)
u = norm_cdf((b - mean) / std)

# Uniformly fill tensor with values from [ll, u], then translate to
# [2ll-1, 2u-1].
tensor.uniform_(2 * ll - 1, 2 * u - 1)

# Use inverse cdf transform for normal distribution to get truncated
# standard normal
tensor.erfinv_()

# Transform to proper mean, std
tensor.mul_(std * math.sqrt(2.))
tensor.add_(mean)

# Clamp to ensure it's in the proper range
tensor.clamp_(min=a, max=b)
return tensor


def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
return _no_grad_trunc_normal_(tensor, mean, std, a, b)


class ConvBNReLU(nn.Module):

def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
groups=1):
super(ConvBNReLU, self).__init__()
self.conv = nn.Conv2d(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=1,
groups=groups,
bias=False)
self.norm = nn.BatchNorm2d(out_channels, eps=NORM_EPS)
self.act = nn.ReLU(inplace=True)

def forward(self, x):
x = self.conv(x)
x = self.norm(x)
x = self.act(x)
return x


def _make_divisible(v, divisor, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v


class PatchEmbed(nn.Module):

def __init__(self, in_channels, out_channels, stride=1):
super(PatchEmbed, self).__init__()
norm_layer = partial(nn.BatchNorm2d, eps=NORM_EPS)
if stride == 2:
self.avgpool = nn.AvgPool2d((2, 2),
stride=2,
ceil_mode=True,
count_include_pad=False)
self.conv = nn.Conv2d(
in_channels, out_channels, kernel_size=1, stride=1, bias=False)
self.norm = norm_layer(out_channels)
elif in_channels != out_channels:
self.avgpool = nn.Identity()
self.conv = nn.Conv2d(
in_channels, out_channels, kernel_size=1, stride=1, bias=False)
self.norm = norm_layer(out_channels)
else:
self.avgpool = nn.Identity()
self.conv = nn.Identity()
self.norm = nn.Identity()

def forward(self, x):
return self.norm(self.conv(self.avgpool(x)))


class MHCA(nn.Module):
"""
Multi-Head Convolutional Attention
"""

def __init__(self, out_channels, head_dim):
super(MHCA, self).__init__()
norm_layer = partial(nn.BatchNorm2d, eps=NORM_EPS)
self.group_conv3x3 = nn.Conv2d(
out_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
groups=out_channels // head_dim,
bias=False)
self.norm = norm_layer(out_channels)
self.act = nn.ReLU(inplace=True)
self.projection = nn.Conv2d(
out_channels, out_channels, kernel_size=1, bias=False)

def forward(self, x):
out = self.group_conv3x3(x)
out = self.norm(out)
out = self.act(out)
out = self.projection(out)
return out


class Mlp(nn.Module):

def __init__(self,
in_features,
out_features=None,
mlp_ratio=None,
drop=0.,
bias=True):
super().__init__()
out_features = out_features or in_features
hidden_dim = _make_divisible(in_features * mlp_ratio, 32)
self.conv1 = nn.Conv2d(
in_features, hidden_dim, kernel_size=1, bias=bias)
self.act = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(
hidden_dim, out_features, kernel_size=1, bias=bias)
self.drop = nn.Dropout(drop)

def forward(self, x):
x = self.conv1(x)
x = self.act(x)
x = self.drop(x)
x = self.conv2(x)
x = self.drop(x)
return x


class NCB(nn.Module):
"""
Next Convolution Block
"""

def __init__(self,
in_channels,
out_channels,
stride=1,
path_dropout=0,
drop=0,
head_dim=32,
mlp_ratio=3):
super(NCB, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
norm_layer = partial(nn.BatchNorm2d, eps=NORM_EPS)
assert out_channels % head_dim == 0

self.patch_embed = PatchEmbed(in_channels, out_channels, stride)
self.mhca = MHCA(out_channels, head_dim)
self.attention_path_dropout = DropPath(path_dropout)

self.norm = norm_layer(out_channels)
self.mlp = Mlp(out_channels, mlp_ratio=mlp_ratio, drop=drop, bias=True)
self.mlp_path_dropout = DropPath(path_dropout)
self.is_bn_merged = False

def forward(self, x):
x = self.patch_embed(x)
x = x + self.attention_path_dropout(self.mhca(x))
if not torch.onnx.is_in_onnx_export() and not self.is_bn_merged:
out = self.norm(x)
else:
out = x
x = x + self.mlp_path_dropout(self.mlp(out))
return x


class E_MHSA(nn.Module):
"""
Efficient Multi-Head Self Attention
"""

def __init__(self,
dim,
out_dim=None,
head_dim=32,
qkv_bias=True,
qk_scale=None,
attn_drop=0,
proj_drop=0.,
sr_ratio=1):
super().__init__()
self.dim = dim
self.out_dim = out_dim if out_dim is not None else dim
self.num_heads = self.dim // head_dim
self.scale = qk_scale or head_dim**-0.5
self.q = nn.Linear(dim, self.dim, bias=qkv_bias)
self.k = nn.Linear(dim, self.dim, bias=qkv_bias)
self.v = nn.Linear(dim, self.dim, bias=qkv_bias)
self.proj = nn.Linear(self.dim, self.out_dim)
self.attn_drop = nn.Dropout(attn_drop)
self.proj_drop = nn.Dropout(proj_drop)

self.sr_ratio = sr_ratio
self.N_ratio = sr_ratio**2
if sr_ratio > 1:
self.sr = nn.AvgPool1d(
kernel_size=self.N_ratio, stride=self.N_ratio)
self.norm = nn.BatchNorm1d(dim, eps=NORM_EPS)
self.is_bn_merge = False

def forward(self, x):
B, N, C = x.shape
q = self.q(x)
q = q.reshape(B, N, self.num_heads,
int(C // self.num_heads)).permute(0, 2, 1, 3)

if self.sr_ratio > 1:
x_ = x.transpose(1, 2)
x_ = self.sr(x_)
if not torch.onnx.is_in_onnx_export() and not self.is_bn_merge:
x_ = self.norm(x_)
x_ = x_.transpose(1, 2)
k = self.k(x_)
k = k.reshape(B, -1, self.num_heads,
int(C // self.num_heads)).permute(0, 2, 3, 1)
v = self.v(x_)
v = v.reshape(B, -1, self.num_heads,
int(C // self.num_heads)).permute(0, 2, 1, 3)
else:
k = self.k(x)
k = k.reshape(B, -1, self.num_heads,
int(C // self.num_heads)).permute(0, 2, 3, 1)
v = self.v(x)
v = v.reshape(B, -1, self.num_heads,
int(C // self.num_heads)).permute(0, 2, 1, 3)
attn = (q @ k) * self.scale

attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)

x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x


class NTB(nn.Module):
"""
Next Transformer Block
"""

def __init__(
self,
in_channels,
out_channels,
path_dropout,
stride=1,
sr_ratio=1,
mlp_ratio=2,
head_dim=32,
mix_block_ratio=0.75,
attn_drop=0,
drop=0,
):
super(NTB, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.mix_block_ratio = mix_block_ratio
norm_func = partial(nn.BatchNorm2d, eps=NORM_EPS)

self.mhsa_out_channels = _make_divisible(
int(out_channels * mix_block_ratio), 32)
self.mhca_out_channels = out_channels - self.mhsa_out_channels

self.patch_embed = PatchEmbed(in_channels, self.mhsa_out_channels,
stride)
self.norm1 = norm_func(self.mhsa_out_channels)
self.e_mhsa = E_MHSA(
self.mhsa_out_channels,
head_dim=head_dim,
sr_ratio=sr_ratio,
attn_drop=attn_drop,
proj_drop=drop)
self.mhsa_path_dropout = DropPath(path_dropout * mix_block_ratio)

self.projection = PatchEmbed(
self.mhsa_out_channels, self.mhca_out_channels, stride=1)
self.mhca = MHCA(self.mhca_out_channels, head_dim=head_dim)
self.mhca_path_dropout = DropPath(path_dropout * (1 - mix_block_ratio))

self.norm2 = norm_func(out_channels)
self.mlp = Mlp(out_channels, mlp_ratio=mlp_ratio, drop=drop)
self.mlp_path_dropout = DropPath(path_dropout)

self.is_bn_merged = False

def forward(self, x):
x = self.patch_embed(x)
B, C, H, W = x.shape
if not torch.onnx.is_in_onnx_export() and not self.is_bn_merged:
out = self.norm1(x)
else:
out = x
out = rearrange(out, 'b c h w -> b (h w) c') # b n c
out = self.mhsa_path_dropout(self.e_mhsa(out))
x = x + rearrange(out, 'b (h w) c -> b c h w', h=H)

out = self.projection(x)
out = out + self.mhca_path_dropout(self.mhca(out))
x = torch.cat([x, out], dim=1)

if not torch.onnx.is_in_onnx_export() and not self.is_bn_merged:
out = self.norm2(x)
else:
out = x
x = x + self.mlp_path_dropout(self.mlp(out))
return x


@BACKBONES.register_module()
class NextViT(BaseBackbone):
stem_chs = {
'x_small': [64, 32, 64],
'small': [64, 32, 64],
'base': [64, 32, 64],
'large': [64, 32, 64],
}
depths = {
'x_small': [1, 1, 5, 1],
'small': [3, 4, 10, 3],
'base': [3, 4, 20, 3],
'large': [3, 4, 30, 3],
}

def __init__(self,
arch='small',
path_dropout=0.2,
attn_drop=0,
drop=0,
strides=[1, 2, 2, 2],
sr_ratios=[8, 4, 2, 1],
head_dim=32,
mix_block_ratio=0.75,
resume='',
with_extra_norm=True,
norm_eval=False,
norm_cfg=None,
out_indices=-1,
frozen_stages=-1,
init_cfg=None):
super().__init__(init_cfg=init_cfg)

stem_chs = self.stem_chs[arch]
depths = self.depths[arch]

self.frozen_stages = frozen_stages
self.with_extra_norm = with_extra_norm
self.norm_eval = norm_eval
self.stage1_out_channels = [96] * (depths[0])
self.stage2_out_channels = [192] * (depths[1] - 1) + [256]
self.stage3_out_channels = [384, 384, 384, 384, 512] * (depths[2] // 5)
self.stage4_out_channels = [768] * (depths[3] - 1) + [1024]
self.stage_out_channels = [
self.stage1_out_channels, self.stage2_out_channels,
self.stage3_out_channels, self.stage4_out_channels
]

# Next Hybrid Strategy
self.stage1_block_types = [NCB] * depths[0]
self.stage2_block_types = [NCB] * (depths[1] - 1) + [NTB]
self.stage3_block_types = [NCB, NCB, NCB, NCB, NTB] * (depths[2] // 5)
self.stage4_block_types = [NCB] * (depths[3] - 1) + [NTB]
self.stage_block_types = [
self.stage1_block_types, self.stage2_block_types,
self.stage3_block_types, self.stage4_block_types
]

self.stem = nn.Sequential(
ConvBNReLU(3, stem_chs[0], kernel_size=3, stride=2),
ConvBNReLU(stem_chs[0], stem_chs[1], kernel_size=3, stride=1),
ConvBNReLU(stem_chs[1], stem_chs[2], kernel_size=3, stride=1),
ConvBNReLU(stem_chs[2], stem_chs[2], kernel_size=3, stride=2),
)
input_channel = stem_chs[-1]
features = []
idx = 0
dpr = [x.item() for x in torch.linspace(0, path_dropout, sum(depths))
] # stochastic depth decay rule
for stage_id in range(len(depths)):
numrepeat = depths[stage_id]
output_channels = self.stage_out_channels[stage_id]
block_types = self.stage_block_types[stage_id]
for block_id in range(numrepeat):
if strides[stage_id] == 2 and block_id == 0:
stride = 2
else:
stride = 1
output_channel = output_channels[block_id]
block_type = block_types[block_id]
if block_type is NCB:
layer = NCB(
input_channel,
output_channel,
stride=stride,
path_dropout=dpr[idx + block_id],
drop=drop,
head_dim=head_dim)
features.append(layer)
elif block_type is NTB:
layer = NTB(
input_channel,
output_channel,
path_dropout=dpr[idx + block_id],
stride=stride,
sr_ratio=sr_ratios[stage_id],
head_dim=head_dim,
mix_block_ratio=mix_block_ratio,
attn_drop=attn_drop,
drop=drop)
features.append(layer)
input_channel = output_channel
idx += numrepeat
self.features = nn.Sequential(*features)
self.norm = nn.BatchNorm2d(output_channel, eps=NORM_EPS)

if isinstance(out_indices, int):
out_indices = [out_indices]
assert isinstance(out_indices, Sequence), \
f'"out_indices" must by a sequence or int, ' \
f'get {type(out_indices)} instead.'
for i, index in enumerate(out_indices):
if index < 0:
out_indices[i] = sum(depths) + index
assert out_indices[i] >= 0, f'Invalid out_indices {index}'
self.stage_out_idx = out_indices

if norm_cfg is not None:
self = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self)

def init_weights(self):
super(NextViT, self).init_weights()
if (isinstance(self.init_cfg, dict)
and self.init_cfg['type'] == 'Pretrained'):
# Suppress default init if use pretrained model.
return

self._initialize_weights()

def _initialize_weights(self):
for n, m in self.named_modules():
if isinstance(m, (nn.BatchNorm2d,
nn.BatchNorm1d)): # nn.GroupNorm, nn.LayerNorm,
nn.init.constant_(m.weight, 1.0)
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if hasattr(m, 'bias') and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Conv2d):
trunc_normal_(m.weight, std=.02)
if hasattr(m, 'bias') and m.bias is not None:
nn.init.constant_(m.bias, 0)

def forward(self, x):
outputs = list()
x = self.stem(x)
stage_id = 0
for idx, layer in enumerate(self.features):
x = layer(x)
if idx == self.stage_out_idx[stage_id]:
if self.with_extra_norm:
x = self.norm(x)
outputs.append(x)
stage_id += 1
return tuple(outputs)

def _freeze_stages(self):
if self.frozen_stages > 0:
self.stem.eval()
for param in self.stem.parameters():
param.requires_grad = False
for idx, layer in enumerate(self.features):
if idx <= self.stage_out_idx[self.frozen_stages - 1]:
layer.eval()
for param in layer.parameters():
param.requires_grad = False

def train(self, mode=True):
super(NextViT, self).train(mode)
self._freeze_stages()
if mode and self.norm_eval:
for m in self.modules():
# trick: eval have effect on BatchNorm only
if isinstance(m, _BatchNorm):
m.eval()
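To make the stage layout concrete, a minimal shape sketch for the registered backbone (random init; mmcls, mmcv and einops are assumed to be installed). With arch='small' and the default out_indices=-1, only the final stage is returned, at stride 32 with 1024 channels:

    import torch

    model = NextViT(arch='small')  # depths [3, 4, 10, 3], final width 1024
    model.init_weights()
    model.eval()
    with torch.no_grad():
        outs = model(torch.randn(1, 3, 224, 224))
    print([o.shape for o in outs])  # [torch.Size([1, 1024, 7, 7])]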

+24 -8  modelscope/models/cv/image_classification/mmcls_model.py

@@ -1,9 +1,10 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Tasks
from modelscope.utils.constant import ModelFile, Tasks


@MODELS.register_module(
@@ -13,16 +14,25 @@ class ClassificationModel(TorchModel):
def __init__(self, model_dir: str, **kwargs):
import mmcv
from mmcls.models import build_classifier
import modelscope.models.cv.image_classification.backbones
from modelscope.utils.hub import read_config

super().__init__(model_dir)

config = os.path.join(model_dir, 'config.py')

cfg = mmcv.Config.fromfile(config)
cfg.model.pretrained = None
self.cls_model = build_classifier(cfg.model)

self.config_type = 'ms_config'
mm_config = os.path.join(model_dir, 'config.py')
if os.path.exists(mm_config):
cfg = mmcv.Config.fromfile(mm_config)
cfg.model.pretrained = None
self.cls_model = build_classifier(cfg.model)
self.config_type = 'mmcv_config'
else:
cfg = read_config(model_dir)
cfg.model.mm_model.pretrained = None
self.cls_model = build_classifier(cfg.model.mm_model)
self.config_type = 'ms_config'
self.cfg = cfg

self.ms_model_dir = model_dir

self.load_pretrained_checkpoint()
@@ -33,7 +43,13 @@ class ClassificationModel(TorchModel):

def load_pretrained_checkpoint(self):
import mmcv
checkpoint_path = os.path.join(self.ms_model_dir, 'checkpoints.pth')
if os.path.exists(
os.path.join(self.ms_model_dir, ModelFile.TORCH_MODEL_FILE)):
checkpoint_path = os.path.join(self.ms_model_dir,
ModelFile.TORCH_MODEL_FILE)
else:
checkpoint_path = os.path.join(self.ms_model_dir,
'checkpoints.pth')
if os.path.exists(checkpoint_path):
checkpoint = mmcv.runner.load_checkpoint(
self.cls_model, checkpoint_path, map_location='cpu')


+100 -0  modelscope/models/cv/image_classification/utils.py

@@ -0,0 +1,100 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp

import numpy as np
from mmcls.datasets.base_dataset import BaseDataset


def get_trained_checkpoints_name(work_path):
    import os
    file_list = os.listdir(work_path)
    last = 0
    model_name = None
    # find the best model
    for f_name in file_list:
        if 'best_' in f_name and f_name.endswith('.pth'):
            best_epoch = f_name.replace('.pth', '').split('_')[-1]
            if best_epoch.isdigit():
                last = int(best_epoch)
                model_name = f_name
    if model_name is not None:
        return model_name
    # or find the latest model
    for f_name in file_list:
        if 'epoch_' in f_name and f_name.endswith('.pth'):
            epoch_num = f_name.replace('epoch_', '').replace('.pth', '')
            if not epoch_num.isdigit():
                continue
            ind = int(epoch_num)
            if ind > last:
                last = ind
                model_name = f_name
    return model_name


def preprocess_transform(cfgs):
if cfgs is None:
return None
for i, cfg in enumerate(cfgs):
if cfg.type == 'Resize':
if isinstance(cfg.size, list):
cfgs[i].size = tuple(cfg.size)
return cfgs


def get_ms_dataset_root(ms_dataset):
if ms_dataset is None or len(ms_dataset) < 1:
return None
try:
data_root = ms_dataset[0]['image:FILE'].split('extracted')[0]
path_post = ms_dataset[0]['image:FILE'].split('extracted')[1].split(
'/')
extracted_data_root = osp.join(data_root, 'extracted', path_post[1],
path_post[2])
return extracted_data_root
except Exception as e:
raise ValueError(f'Dataset Error: {e}')
return None


def get_classes(classes=None):
import mmcv
if isinstance(classes, str):
# take it as a file path
class_names = mmcv.list_from_file(classes)
elif isinstance(classes, (tuple, list)):
class_names = classes
else:
raise ValueError(f'Unsupported type {type(classes)} of classes.')

return class_names


class MmDataset(BaseDataset):

def __init__(self, ms_dataset, pipeline, classes=None, test_mode=False):
self.ms_dataset = ms_dataset
if len(self.ms_dataset) < 1:
raise ValueError('Dataset Error: dataset is empty')
super(MmDataset, self).__init__(
data_prefix='',
pipeline=pipeline,
classes=classes,
test_mode=test_mode)

def load_annotations(self):
if self.CLASSES is None:
raise ValueError(
f'Dataset Error: Not found classesname.txt: {self.CLASSES}')

data_infos = []
for data_info in self.ms_dataset:
filename = data_info['image:FILE']
gt_label = data_info['category']
info = {'img_prefix': self.data_prefix}
info['img_info'] = {'filename': filename}
info['gt_label'] = np.array(gt_label, dtype=np.int64)
data_infos.append(info)

return data_infos
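A tiny sketch of how MmDataset adapts a ModelScope-style record list into mmcls annotations (mmcls assumed installed; the records below are made-up placeholders): each record only needs an 'image:FILE' path and an integer 'category'.

    fake_ms_dataset = [
        {'image:FILE': 'cat/0001.jpg', 'category': 0},
        {'image:FILE': 'dog/0001.jpg', 'category': 1},
    ]
    dataset = MmDataset(fake_ms_dataset, pipeline=[], classes=['cat', 'dog'])
    print(len(dataset))                       # 2
    print(dataset.data_infos[0]['gt_label'])  # 0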

+1 -0  modelscope/models/cv/image_depth_estimation/__init__.py

@@ -0,0 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

+1 -0  modelscope/models/cv/image_depth_estimation/networks/__init__.py

@@ -0,0 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

+215 -0  modelscope/models/cv/image_depth_estimation/networks/newcrf_depth.py

@@ -0,0 +1,215 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn
import torch.nn.functional as F

from .newcrf_layers import NewCRF
from .swin_transformer import SwinTransformer
from .uper_crf_head import PSP


class NewCRFDepth(nn.Module):
"""
Depth network based on neural window FC-CRFs architecture.
"""

def __init__(self,
version=None,
inv_depth=False,
pretrained=None,
frozen_stages=-1,
min_depth=0.1,
max_depth=100.0,
**kwargs):
super().__init__()

self.inv_depth = inv_depth
self.with_auxiliary_head = False
self.with_neck = False

norm_cfg = dict(type='BN', requires_grad=True)
# norm_cfg = dict(type='GN', requires_grad=True, num_groups=8)

window_size = int(version[-2:])

if version[:-2] == 'base':
embed_dim = 128
depths = [2, 2, 18, 2]
num_heads = [4, 8, 16, 32]
in_channels = [128, 256, 512, 1024]
elif version[:-2] == 'large':
embed_dim = 192
depths = [2, 2, 18, 2]
num_heads = [6, 12, 24, 48]
in_channels = [192, 384, 768, 1536]
elif version[:-2] == 'tiny':
embed_dim = 96
depths = [2, 2, 6, 2]
num_heads = [3, 6, 12, 24]
in_channels = [96, 192, 384, 768]

backbone_cfg = dict(
embed_dim=embed_dim,
depths=depths,
num_heads=num_heads,
window_size=window_size,
ape=False,
drop_path_rate=0.3,
patch_norm=True,
use_checkpoint=False,
frozen_stages=frozen_stages)

embed_dim = 512
decoder_cfg = dict(
in_channels=in_channels,
in_index=[0, 1, 2, 3],
pool_scales=(1, 2, 3, 6),
channels=embed_dim,
dropout_ratio=0.0,
num_classes=32,
norm_cfg=norm_cfg,
align_corners=False)

self.backbone = SwinTransformer(**backbone_cfg)
# v_dim = decoder_cfg['num_classes'] * 4
win = 7
crf_dims = [128, 256, 512, 1024]
v_dims = [64, 128, 256, embed_dim]
self.crf3 = NewCRF(
input_dim=in_channels[3],
embed_dim=crf_dims[3],
window_size=win,
v_dim=v_dims[3],
num_heads=32)
self.crf2 = NewCRF(
input_dim=in_channels[2],
embed_dim=crf_dims[2],
window_size=win,
v_dim=v_dims[2],
num_heads=16)
self.crf1 = NewCRF(
input_dim=in_channels[1],
embed_dim=crf_dims[1],
window_size=win,
v_dim=v_dims[1],
num_heads=8)
self.crf0 = NewCRF(
input_dim=in_channels[0],
embed_dim=crf_dims[0],
window_size=win,
v_dim=v_dims[0],
num_heads=4)

self.decoder = PSP(**decoder_cfg)
self.disp_head1 = DispHead(input_dim=crf_dims[0])

self.up_mode = 'bilinear'
if self.up_mode == 'mask':
self.mask_head = nn.Sequential(
nn.Conv2d(crf_dims[0], 64, 3, padding=1),
nn.ReLU(inplace=True), nn.Conv2d(64, 16 * 9, 1, padding=0))

self.min_depth = min_depth
self.max_depth = max_depth

self.init_weights(pretrained=pretrained)

def init_weights(self, pretrained=None):
"""Initialize the weights in backbone and heads.

Args:
pretrained (str, optional): Path to pre-trained weights.
Defaults to None.
"""
# print(f'== Load encoder backbone from: {pretrained}')
self.backbone.init_weights(pretrained=pretrained)
self.decoder.init_weights()
if self.with_auxiliary_head:
if isinstance(self.auxiliary_head, nn.ModuleList):
for aux_head in self.auxiliary_head:
aux_head.init_weights()
else:
self.auxiliary_head.init_weights()

def upsample_mask(self, disp, mask):
""" Upsample disp [H/4, W/4, 1] -> [H, W, 1] using convex combination """
N, _, H, W = disp.shape
mask = mask.view(N, 1, 9, 4, 4, H, W)
mask = torch.softmax(mask, dim=2)

up_disp = F.unfold(disp, kernel_size=3, padding=1)
up_disp = up_disp.view(N, 1, 9, 1, 1, H, W)

up_disp = torch.sum(mask * up_disp, dim=2)
up_disp = up_disp.permute(0, 1, 4, 2, 5, 3)
return up_disp.reshape(N, 1, 4 * H, 4 * W)

def forward(self, imgs):

feats = self.backbone(imgs)
if self.with_neck:
feats = self.neck(feats)

ppm_out = self.decoder(feats)

e3 = self.crf3(feats[3], ppm_out)
e3 = nn.PixelShuffle(2)(e3)
e2 = self.crf2(feats[2], e3)
e2 = nn.PixelShuffle(2)(e2)
e1 = self.crf1(feats[1], e2)
e1 = nn.PixelShuffle(2)(e1)
e0 = self.crf0(feats[0], e1)

if self.up_mode == 'mask':
mask = self.mask_head(e0)
d1 = self.disp_head1(e0, 1)
d1 = self.upsample_mask(d1, mask)
else:
d1 = self.disp_head1(e0, 4)

depth = d1 * self.max_depth

return depth


class DispHead(nn.Module):

def __init__(self, input_dim=100):
super(DispHead, self).__init__()
# self.norm1 = nn.BatchNorm2d(input_dim)
self.conv1 = nn.Conv2d(input_dim, 1, 3, padding=1)
# self.relu = nn.ReLU(inplace=True)
self.sigmoid = nn.Sigmoid()

def forward(self, x, scale):
# x = self.relu(self.norm1(x))
x = self.sigmoid(self.conv1(x))
if scale > 1:
x = upsample(x, scale_factor=scale)
return x


class DispUnpack(nn.Module):

def __init__(self, input_dim=100, hidden_dim=128):
super(DispUnpack, self).__init__()
self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1)
self.conv2 = nn.Conv2d(hidden_dim, 16, 3, padding=1)
self.relu = nn.ReLU(inplace=True)
self.sigmoid = nn.Sigmoid()
self.pixel_shuffle = nn.PixelShuffle(4)

def forward(self, x, output_size):
x = self.relu(self.conv1(x))
x = self.sigmoid(self.conv2(x)) # [b, 16, h/4, w/4]
# x = torch.reshape(x, [x.shape[0], 1, x.shape[2]*4, x.shape[3]*4])
x = self.pixel_shuffle(x)

return x


def upsample(x, scale_factor=2, mode='bilinear', align_corners=False):
"""Upsample input tensor by a factor of 2
"""
return F.interpolate(
x, scale_factor=scale_factor, mode=mode, align_corners=align_corners)
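A rough end-to-end sketch of the module above, given only as an assumption-laden orientation: it presumes the sibling newcrf_layers, swin_transformer and uper_crf_head modules are importable, uses random weights, and picks an input whose sides are multiples of 32. The decoder upsamples the stride-4 CRF output by a factor of 4, so the predicted depth comes back at input resolution, scaled into [0, max_depth]:

    import torch

    net = NewCRFDepth(version='tiny07', max_depth=10.0)  # 'tiny' Swin encoder, window size 7
    net.eval()
    with torch.no_grad():
        depth = net(torch.randn(1, 3, 352, 480))
    print(depth.shape)  # expected torch.Size([1, 1, 352, 480]) with the reference Swin/CRF blocks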

+504 -0  modelscope/models/cv/image_depth_estimation/networks/newcrf_layers.py

@@ -0,0 +1,504 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_


class Mlp(nn.Module):
""" Multilayer perceptron."""

def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)

def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x


def window_partition(x, window_size):
"""
Args:
x: (B, H, W, C)
window_size (int): window size

Returns:
windows: (num_windows*B, window_size, window_size, C)
"""
B, H, W, C = x.shape
x = x.view(B, H // window_size, window_size, W // window_size, window_size,
C)
windows = x.permute(0, 1, 3, 2, 4,
5).contiguous().view(-1, window_size, window_size, C)
return windows


def window_reverse(windows, window_size, H, W):
"""
Args:
windows: (num_windows*B, window_size, window_size, C)
window_size (int): Window size
H (int): Height of image
W (int): Width of image

Returns:
x: (B, H, W, C)
"""
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.view(B, H // window_size, W // window_size, window_size,
window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
return x
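window_partition and window_reverse are exact inverses whenever H and W are multiples of the window size; a quick round-trip (illustrative sketch only) makes the layout convention concrete:

    import torch

    x = torch.randn(2, 14, 14, 32)                # (B, H, W, C)
    windows = window_partition(x, window_size=7)  # (2 * 2 * 2, 7, 7, 32)
    print(windows.shape)                          # torch.Size([8, 7, 7, 32])
    restored = window_reverse(windows, 7, 14, 14)
    print(torch.equal(restored, x))               # True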


class WindowAttention(nn.Module):
""" Window based multi-head self attention (W-MSA) module with relative position bias.
It supports both of shifted and non-shifted window.

Args:
dim (int): Number of input channels.
window_size (tuple[int]): The height and width of the window.
num_heads (int): Number of attention heads.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
"""

def __init__(self,
dim,
window_size,
num_heads,
v_dim,
qkv_bias=True,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):

super().__init__()
self.dim = dim
self.window_size = window_size # Wh, Ww
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5

# define a parameter table of relative position bias
self.relative_position_bias_table = nn.Parameter(
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
num_heads)) # 2*Wh-1 * 2*Ww-1, nH

# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(self.window_size[0])
coords_w = torch.arange(self.window_size[1])
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :,
None] - coords_flatten[:,
None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(
1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :,
0] += self.window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += self.window_size[1] - 1
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.register_buffer('relative_position_index',
relative_position_index)

self.qk = nn.Linear(dim, dim * 2, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(v_dim, v_dim)
self.proj_drop = nn.Dropout(proj_drop)

trunc_normal_(self.relative_position_bias_table, std=.02)
self.softmax = nn.Softmax(dim=-1)

def forward(self, x, v, mask=None):
""" Forward function.

Args:
x: input features with shape of (num_windows*B, N, C)
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
"""
B_, N, C = x.shape
qk = self.qk(x).reshape(B_, N, 2, self.num_heads,
C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k = qk[0], qk[
1] # make torchscript happy (cannot use tensor as tuple)

q = q * self.scale
attn = (q @ k.transpose(-2, -1))

relative_position_bias = self.relative_position_bias_table[
self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1],
self.window_size[0] * self.window_size[1],
-1) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.permute(
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)

if mask is not None:
nW = mask.shape[0]
attn = attn.view(B_ // nW, nW, self.num_heads, N,
N) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.view(-1, self.num_heads, N, N)
attn = self.softmax(attn)
else:
attn = self.softmax(attn)

attn = self.attn_drop(attn)

# assert self.dim % v.shape[-1] == 0, "self.dim % v.shape[-1] != 0"
# repeat_num = self.dim // v.shape[-1]
# v = v.view(B_, N, self.num_heads // repeat_num, -1).transpose(1, 2).repeat(1, repeat_num, 1, 1)

assert self.dim == v.shape[-1], 'self.dim != v.shape[-1]'
v = v.view(B_, N, self.num_heads, -1).transpose(1, 2)

x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
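
# Not part of the original diff: a minimal shape check for this WindowAttention
# variant, which computes queries/keys from `x` but takes its values from a
# separate tensor `v` (whose channel count must equal `dim`, as asserted above).
# The sizes below are arbitrary illustrations.
import torch

attn = WindowAttention(dim=96, window_size=(7, 7), num_heads=4, v_dim=96)
x = torch.randn(8, 49, 96)          # (num_windows * B, Wh * Ww, C)
v = torch.randn(8, 49, 96)          # externally supplied values, same layout
out = attn(x, v)                    # -> (8, 49, 96)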


class CRFBlock(nn.Module):
""" CRF Block.

Args:
dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        v_dim (int): Number of channels of the value tensor `v`.
window_size (int): Window size.
shift_size (int): Shift size for SW-MSA.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
"""

def __init__(self,
dim,
num_heads,
v_dim,
window_size=7,
shift_size=0,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm):
super().__init__()
self.dim = dim
self.num_heads = num_heads
self.v_dim = v_dim
self.window_size = window_size
self.shift_size = shift_size
self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, 'shift_size must be in [0, window_size)'

self.norm1 = norm_layer(dim)
self.attn = WindowAttention(
dim,
window_size=to_2tuple(self.window_size),
num_heads=num_heads,
v_dim=v_dim,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)

self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(v_dim)
mlp_hidden_dim = int(v_dim * mlp_ratio)
self.mlp = Mlp(
in_features=v_dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)

self.H = None
self.W = None

def forward(self, x, v, mask_matrix):
""" Forward function.

Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
mask_matrix: Attention mask for cyclic shift.
"""
B, L, C = x.shape
H, W = self.H, self.W
assert L == H * W, 'input feature has wrong size'

shortcut = x
x = self.norm1(x)
x = x.view(B, H, W, C)

# pad feature maps to multiples of window size
pad_l = pad_t = 0
pad_r = (self.window_size - W % self.window_size) % self.window_size
pad_b = (self.window_size - H % self.window_size) % self.window_size
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
v = F.pad(v, (0, 0, pad_l, pad_r, pad_t, pad_b))
_, Hp, Wp, _ = x.shape

# cyclic shift
if self.shift_size > 0:
shifted_x = torch.roll(
x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
shifted_v = torch.roll(
v, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
attn_mask = mask_matrix
else:
shifted_x = x
shifted_v = v
attn_mask = None

# partition windows
x_windows = window_partition(
shifted_x, self.window_size) # nW*B, window_size, window_size, C
x_windows = x_windows.view(-1, self.window_size * self.window_size,
C) # nW*B, window_size*window_size, C
v_windows = window_partition(
shifted_v, self.window_size) # nW*B, window_size, window_size, C
v_windows = v_windows.view(
-1, self.window_size * self.window_size,
v_windows.shape[-1]) # nW*B, window_size*window_size, C

# W-MSA/SW-MSA
attn_windows = self.attn(
x_windows, v_windows,
mask=attn_mask) # nW*B, window_size*window_size, C

# merge windows
attn_windows = attn_windows.view(-1, self.window_size,
self.window_size, self.v_dim)
shifted_x = window_reverse(attn_windows, self.window_size, Hp,
Wp) # B H' W' C

# reverse cyclic shift
if self.shift_size > 0:
x = torch.roll(
shifted_x,
shifts=(self.shift_size, self.shift_size),
dims=(1, 2))
else:
x = shifted_x

if pad_r > 0 or pad_b > 0:
x = x[:, :H, :W, :].contiguous()

x = x.view(B, H * W, self.v_dim)

# FFN
x = shortcut + self.drop_path(x)
x = x + self.drop_path(self.mlp(self.norm2(x)))

return x
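
# Not part of the original diff: a sketch of how BasicCRFLayer (below) drives a
# CRFBlock -- the spatial size is attached to the block first, `x` is flattened
# and channel-last, and `v` stays spatial as (B, H, W, v_dim). With shift_size=0
# no attention mask is needed. Sizes are arbitrary illustrations.
import torch

blk = CRFBlock(dim=96, num_heads=4, v_dim=96, window_size=7, shift_size=0)
blk.H, blk.W = 28, 28               # must be set before calling forward
x = torch.randn(2, 28 * 28, 96)     # (B, H*W, C)
v = torch.randn(2, 28, 28, 96)      # (B, H, W, v_dim)
out = blk(x, v, mask_matrix=None)   # -> (2, 784, 96)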


class BasicCRFLayer(nn.Module):
""" A basic NeWCRFs layer for one stage.

Args:
dim (int): Number of feature channels
        depth (int): Depth of this stage (number of CRF blocks).
        num_heads (int): Number of attention heads.
        v_dim (int): Number of channels of the value tensor `v`.
window_size (int): Local window size. Default: 7.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
"""

def __init__(self,
dim,
depth,
num_heads,
v_dim,
window_size=7,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
norm_layer=nn.LayerNorm,
downsample=None,
use_checkpoint=False):
super().__init__()
self.window_size = window_size
self.shift_size = window_size // 2
self.depth = depth
self.use_checkpoint = use_checkpoint

# build blocks
self.blocks = nn.ModuleList([
CRFBlock(
dim=dim,
num_heads=num_heads,
v_dim=v_dim,
window_size=window_size,
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop,
attn_drop=attn_drop,
drop_path=drop_path[i]
if isinstance(drop_path, list) else drop_path,
norm_layer=norm_layer) for i in range(depth)
])

# patch merging layer
if downsample is not None:
self.downsample = downsample(dim=dim, norm_layer=norm_layer)
else:
self.downsample = None

def forward(self, x, v, H, W):
""" Forward function.

Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
"""

# calculate attention mask for SW-MSA
Hp = int(np.ceil(H / self.window_size)) * self.window_size
Wp = int(np.ceil(W / self.window_size)) * self.window_size
img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
h_slices = (slice(0, -self.window_size),
slice(-self.window_size,
-self.shift_size), slice(-self.shift_size, None))
w_slices = (slice(0, -self.window_size),
slice(-self.window_size,
-self.shift_size), slice(-self.shift_size, None))
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt += 1

mask_windows = window_partition(
img_mask, self.window_size) # nW, window_size, window_size, 1
mask_windows = mask_windows.view(-1,
self.window_size * self.window_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0,
float(-100.0)).masked_fill(
attn_mask == 0, float(0.0))

for blk in self.blocks:
blk.H, blk.W = H, W
if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, v, attn_mask)
else:
x = blk(x, v, attn_mask)
if self.downsample is not None:
x_down = self.downsample(x, H, W)
Wh, Ww = (H + 1) // 2, (W + 1) // 2
return x, H, W, x_down, Wh, Ww
else:
return x, H, W, x, H, W


class NewCRF(nn.Module):

def __init__(self,
input_dim=96,
embed_dim=96,
v_dim=64,
window_size=7,
num_heads=4,
depth=2,
patch_size=4,
in_chans=3,
norm_layer=nn.LayerNorm,
patch_norm=True):
super().__init__()

self.embed_dim = embed_dim
self.patch_norm = patch_norm

if input_dim != embed_dim:
self.proj_x = nn.Conv2d(input_dim, embed_dim, 3, padding=1)
else:
self.proj_x = None

if v_dim != embed_dim:
self.proj_v = nn.Conv2d(v_dim, embed_dim, 3, padding=1)
elif embed_dim % v_dim == 0:
self.proj_v = None

v_dim = embed_dim
assert v_dim == embed_dim

self.crf_layer = BasicCRFLayer(
dim=embed_dim,
depth=depth,
num_heads=num_heads,
v_dim=v_dim,
window_size=window_size,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
norm_layer=norm_layer,
downsample=None,
use_checkpoint=False)

layer = norm_layer(embed_dim)
layer_name = 'norm_crf'
self.add_module(layer_name, layer)

def forward(self, x, v):
if self.proj_x is not None:
x = self.proj_x(x)
if self.proj_v is not None:
v = self.proj_v(v)

Wh, Ww = x.size(2), x.size(3)
x = x.flatten(2).transpose(1, 2)
v = v.transpose(1, 2).transpose(2, 3)

x_out, H, W, x, Wh, Ww = self.crf_layer(x, v, Wh, Ww)
norm_layer = getattr(self, 'norm_crf')
x_out = norm_layer(x_out)
out = x_out.view(-1, H, W, self.embed_dim).permute(0, 3, 1,
2).contiguous()

return out
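
# Not part of the original diff: a minimal usage sketch of the NewCRF module.
# `x` is an NCHW feature map with input_dim channels, `v` an NCHW value map
# with v_dim channels at the same resolution; the output keeps that resolution
# with embed_dim channels. The sizes below are arbitrary illustrations.
import torch

crf = NewCRF(input_dim=192, embed_dim=128, v_dim=64, window_size=7, num_heads=4)
x = torch.randn(1, 192, 28, 28)     # backbone feature
v = torch.randn(1, 64, 28, 28)      # value feature
out = crf(x, v)                     # -> (1, 128, 28, 28)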

+ 272
- 0
modelscope/models/cv/image_depth_estimation/networks/newcrf_utils.py View File

@@ -0,0 +1,272 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import os.path as osp
import pkgutil
import warnings
from collections import OrderedDict
from importlib import import_module

import torch
import torch.nn as nn
import torchvision
from torch import distributed as dist
from torch.nn import functional as F
from torch.nn.parallel import DataParallel, DistributedDataParallel
from torch.utils import model_zoo

TORCH_VERSION = torch.__version__


def resize(input,
size=None,
scale_factor=None,
mode='nearest',
align_corners=None,
warning=True):
if warning:
if size is not None and align_corners:
input_h, input_w = tuple(int(x) for x in input.shape[2:])
output_h, output_w = tuple(int(x) for x in size)
            if output_h > input_h or output_w > input_w:
if ((output_h > 1 and output_w > 1 and input_h > 1
and input_w > 1) and (output_h - 1) % (input_h - 1)
and (output_w - 1) % (input_w - 1)):
warnings.warn(
f'When align_corners={align_corners}, '
'the output would more aligned if '
f'input size {(input_h, input_w)} is `x+1` and '
f'out size {(output_h, output_w)} is `nx+1`')
if isinstance(size, torch.Size):
size = tuple(int(x) for x in size)
return F.interpolate(input, size, scale_factor, mode, align_corners)


def normal_init(module, mean=0, std=1, bias=0):
if hasattr(module, 'weight') and module.weight is not None:
nn.init.normal_(module.weight, mean, std)
if hasattr(module, 'bias') and module.bias is not None:
nn.init.constant_(module.bias, bias)


def is_module_wrapper(module):
module_wrappers = (DataParallel, DistributedDataParallel)
return isinstance(module, module_wrappers)


def get_dist_info():
if TORCH_VERSION < '1.0':
initialized = dist._initialized
else:
if dist.is_available():
initialized = dist.is_initialized()
else:
initialized = False
if initialized:
rank = dist.get_rank()
world_size = dist.get_world_size()
else:
rank = 0
world_size = 1
return rank, world_size


def load_state_dict(module, state_dict, strict=False, logger=None):
"""Load state_dict to a module.

This method is modified from :meth:`torch.nn.Module.load_state_dict`.
Default value for ``strict`` is set to ``False`` and the message for
param mismatch will be shown even if strict is False.

Args:
module (Module): Module that receives the state_dict.
state_dict (OrderedDict): Weights.
strict (bool): whether to strictly enforce that the keys
in :attr:`state_dict` match the keys returned by this module's
:meth:`~torch.nn.Module.state_dict` function. Default: ``False``.
logger (:obj:`logging.Logger`, optional): Logger to log the error
message. If not specified, print function will be used.
"""
unexpected_keys = []
all_missing_keys = []
err_msg = []

metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata

# use _load_from_state_dict to enable checkpoint version control
def load(module, prefix=''):
# recursively check parallel module in case that the model has a
# complicated structure, e.g., nn.Module(nn.Module(DDP))
if is_module_wrapper(module):
module = module.module
local_metadata = {} if metadata is None else metadata.get(
prefix[:-1], {})
module._load_from_state_dict(state_dict, prefix, local_metadata, True,
all_missing_keys, unexpected_keys,
err_msg)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')

load(module)
load = None # break load->load reference cycle

# ignore "num_batches_tracked" of BN layers
missing_keys = [
key for key in all_missing_keys if 'num_batches_tracked' not in key
]

if unexpected_keys:
err_msg.append('unexpected key in source '
f'state_dict: {", ".join(unexpected_keys)}\n')
if missing_keys:
err_msg.append(
f'missing keys in source state_dict: {", ".join(missing_keys)}\n')

rank, _ = get_dist_info()
if len(err_msg) > 0 and rank == 0:
err_msg.insert(
0, 'The model and loaded state dict do not match exactly\n')
err_msg = '\n'.join(err_msg)
if strict:
raise RuntimeError(err_msg)
elif logger is not None:
logger.warning(err_msg)
else:
print(err_msg)


def load_url_dist(url, model_dir=None):
"""In distributed setting, this function only download checkpoint at local
rank 0."""
rank, world_size = get_dist_info()
rank = int(os.environ.get('LOCAL_RANK', rank))
if rank == 0:
checkpoint = model_zoo.load_url(url, model_dir=model_dir)
if world_size > 1:
torch.distributed.barrier()
if rank > 0:
checkpoint = model_zoo.load_url(url, model_dir=model_dir)
return checkpoint


def get_torchvision_models():
model_urls = dict()
for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__):
if ispkg:
continue
_zoo = import_module(f'torchvision.models.{name}')
if hasattr(_zoo, 'model_urls'):
_urls = getattr(_zoo, 'model_urls')
model_urls.update(_urls)
return model_urls


def _load_checkpoint(filename, map_location=None):
"""Load checkpoint from somewhere (modelzoo, file, url).

Args:
        filename (str): Accepts a local filepath or the deprecated
            ``modelzoo://xxx`` scheme, which is resolved against the
            torchvision model urls.
map_location (str | None): Same as :func:`torch.load`. Default: None.

Returns:
dict | OrderedDict: The loaded checkpoint. It can be either an
OrderedDict storing model weights or a dict containing other
information, which depends on the checkpoint.
"""
if filename.startswith('modelzoo://'):
warnings.warn('The URL scheme of "modelzoo://" is deprecated, please '
'use "torchvision://" instead')
model_urls = get_torchvision_models()
model_name = filename[11:]
checkpoint = load_url_dist(model_urls[model_name])
else:
if not osp.isfile(filename):
raise IOError(f'{filename} is not a checkpoint file')
checkpoint = torch.load(filename, map_location=map_location)
return checkpoint


def load_checkpoint(model,
filename,
map_location='cpu',
strict=False,
logger=None):
"""Load checkpoint from a file or URI.

Args:
model (Module): Module to load checkpoint.
        filename (str): Accepts a local filepath or the deprecated
            ``modelzoo://xxx`` scheme, which is resolved against the
            torchvision model urls.
map_location (str): Same as :func:`torch.load`.
strict (bool): Whether to allow different params for the model and
checkpoint.
logger (:mod:`logging.Logger` or None): The logger for error message.

Returns:
dict or OrderedDict: The loaded checkpoint.
"""
checkpoint = _load_checkpoint(filename, map_location)
# OrderedDict is a subclass of dict
if not isinstance(checkpoint, dict):
raise RuntimeError(
f'No state_dict found in checkpoint file {filename}')
# get state_dict from checkpoint
if 'state_dict' in checkpoint:
state_dict = checkpoint['state_dict']
elif 'model' in checkpoint:
state_dict = checkpoint['model']
else:
state_dict = checkpoint
# strip prefix of state_dict
if list(state_dict.keys())[0].startswith('module.'):
state_dict = {k[7:]: v for k, v in state_dict.items()}

# for MoBY, load model of online branch
if sorted(list(state_dict.keys()))[0].startswith('encoder'):
state_dict = {
k.replace('encoder.', ''): v
for k, v in state_dict.items() if k.startswith('encoder.')
}

# reshape absolute position embedding
if state_dict.get('absolute_pos_embed') is not None:
absolute_pos_embed = state_dict['absolute_pos_embed']
N1, L, C1 = absolute_pos_embed.size()
N2, C2, H, W = model.absolute_pos_embed.size()
if N1 != N2 or C1 != C2 or L != H * W:
logger.warning('Error in loading absolute_pos_embed, pass')
else:
state_dict['absolute_pos_embed'] = absolute_pos_embed.view(
N2, H, W, C2).permute(0, 3, 1, 2)

# interpolate position bias table if needed
relative_position_bias_table_keys = [
k for k in state_dict.keys() if 'relative_position_bias_table' in k
]
for table_key in relative_position_bias_table_keys:
table_pretrained = state_dict[table_key]
table_current = model.state_dict()[table_key]
L1, nH1 = table_pretrained.size()
L2, nH2 = table_current.size()
if nH1 != nH2:
logger.warning(f'Error in loading {table_key}, pass')
else:
if L1 != L2:
S1 = int(L1**0.5)
S2 = int(L2**0.5)
table_pretrained_resized = F.interpolate(
table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
size=(S2, S2),
mode='bicubic')
state_dict[table_key] = table_pretrained_resized.view(
nH2, L2).permute(1, 0)

# load state_dict
load_state_dict(model, state_dict, strict, logger)
return checkpoint
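
# Not part of the original diff: a small sketch of the non-strict loading
# behaviour of load_state_dict above -- mismatched keys are collected and
# reported (printed, or sent to the logger) instead of raising, unless
# strict=True is passed.
import torch.nn as nn

src = nn.Linear(4, 4)
dst = nn.Sequential(nn.Linear(4, 4))                    # keys differ: 'weight' vs '0.weight'
load_state_dict(dst, src.state_dict(), strict=False)    # prints a mismatch report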

+ 706
- 0
modelscope/models/cv/image_depth_estimation/networks/swin_transformer.py View File

@@ -0,0 +1,706 @@
# The implementation is adopted from Swin Transformer
# made publicly available under the MIT License at https://github.com/microsoft/Swin-Transformer

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_

from .newcrf_utils import load_checkpoint


class Mlp(nn.Module):
""" Multilayer perceptron."""

def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)

def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x


def window_partition(x, window_size):
"""
Args:
x: (B, H, W, C)
window_size (int): window size

Returns:
windows: (num_windows*B, window_size, window_size, C)
"""
B, H, W, C = x.shape
x = x.view(B, H // window_size, window_size, W // window_size, window_size,
C)
windows = x.permute(0, 1, 3, 2, 4,
5).contiguous().view(-1, window_size, window_size, C)
return windows


def window_reverse(windows, window_size, H, W):
"""
Args:
windows: (num_windows*B, window_size, window_size, C)
window_size (int): Window size
H (int): Height of image
W (int): Width of image

Returns:
x: (B, H, W, C)
"""
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.view(B, H // window_size, W // window_size, window_size,
window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
return x


class WindowAttention(nn.Module):
""" Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both shifted and non-shifted windows.

Args:
dim (int): Number of input channels.
window_size (tuple[int]): The height and width of the window.
num_heads (int): Number of attention heads.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
"""

def __init__(self,
dim,
window_size,
num_heads,
qkv_bias=True,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):

super().__init__()
self.dim = dim
self.window_size = window_size # Wh, Ww
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5

# define a parameter table of relative position bias
self.relative_position_bias_table = nn.Parameter(
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
num_heads)) # 2*Wh-1 * 2*Ww-1, nH

# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(self.window_size[0])
coords_w = torch.arange(self.window_size[1])
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :,
None] - coords_flatten[:,
None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(
1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :,
0] += self.window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += self.window_size[1] - 1
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.register_buffer('relative_position_index',
relative_position_index)

self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)

trunc_normal_(self.relative_position_bias_table, std=.02)
self.softmax = nn.Softmax(dim=-1)

def forward(self, x, mask=None):
""" Forward function.

Args:
x: input features with shape of (num_windows*B, N, C)
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
"""
B_, N, C = x.shape
qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[
2] # make torchscript happy (cannot use tensor as tuple)

q = q * self.scale
attn = (q @ k.transpose(-2, -1))

relative_position_bias = self.relative_position_bias_table[
self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1],
self.window_size[0] * self.window_size[1],
-1) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.permute(
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)

if mask is not None:
nW = mask.shape[0]
attn = attn.view(B_ // nW, nW, self.num_heads, N,
N) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.view(-1, self.num_heads, N, N)
attn = self.softmax(attn)
else:
attn = self.softmax(attn)

attn = self.attn_drop(attn)

x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x


class SwinTransformerBlock(nn.Module):
""" Swin Transformer Block.

Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads.
window_size (int): Window size.
shift_size (int): Shift size for SW-MSA.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
"""

def __init__(self,
dim,
num_heads,
window_size=7,
shift_size=0,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm):
super().__init__()
self.dim = dim
self.num_heads = num_heads
self.window_size = window_size
self.shift_size = shift_size
self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, 'shift_size must be in [0, window_size)'

self.norm1 = norm_layer(dim)
self.attn = WindowAttention(
dim,
window_size=to_2tuple(self.window_size),
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)

self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)

self.H = None
self.W = None

def forward(self, x, mask_matrix):
""" Forward function.

Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
mask_matrix: Attention mask for cyclic shift.
"""
B, L, C = x.shape
H, W = self.H, self.W
assert L == H * W, 'input feature has wrong size'

shortcut = x
x = self.norm1(x)
x = x.view(B, H, W, C)

# pad feature maps to multiples of window size
pad_l = pad_t = 0
pad_r = (self.window_size - W % self.window_size) % self.window_size
pad_b = (self.window_size - H % self.window_size) % self.window_size
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
_, Hp, Wp, _ = x.shape

# cyclic shift
if self.shift_size > 0:
shifted_x = torch.roll(
x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
attn_mask = mask_matrix
else:
shifted_x = x
attn_mask = None

# partition windows
x_windows = window_partition(
shifted_x, self.window_size) # nW*B, window_size, window_size, C
x_windows = x_windows.view(-1, self.window_size * self.window_size,
C) # nW*B, window_size*window_size, C

# W-MSA/SW-MSA
attn_windows = self.attn(
x_windows, mask=attn_mask) # nW*B, window_size*window_size, C

# merge windows
attn_windows = attn_windows.view(-1, self.window_size,
self.window_size, C)
shifted_x = window_reverse(attn_windows, self.window_size, Hp,
Wp) # B H' W' C

# reverse cyclic shift
if self.shift_size > 0:
x = torch.roll(
shifted_x,
shifts=(self.shift_size, self.shift_size),
dims=(1, 2))
else:
x = shifted_x

if pad_r > 0 or pad_b > 0:
x = x[:, :H, :W, :].contiguous()

x = x.view(B, H * W, C)

# FFN
x = shortcut + self.drop_path(x)
x = x + self.drop_path(self.mlp(self.norm2(x)))

return x


class PatchMerging(nn.Module):
""" Patch Merging Layer

Args:
dim (int): Number of input channels.
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
"""

def __init__(self, dim, norm_layer=nn.LayerNorm):
super().__init__()
self.dim = dim
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
self.norm = norm_layer(4 * dim)

def forward(self, x, H, W):
""" Forward function.

Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
"""
B, L, C = x.shape
assert L == H * W, 'input feature has wrong size'

x = x.view(B, H, W, C)

# padding
pad_input = (H % 2 == 1) or (W % 2 == 1)
if pad_input:
x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))

x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C

x = self.norm(x)
x = self.reduction(x)

return x
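
# Not part of the original diff: PatchMerging halves the spatial resolution and
# doubles the channel count, e.g. a 56x56 map with 96 channels becomes a 28x28
# map with 192 channels (in flattened (B, H*W, C) form).
import torch

pm = PatchMerging(dim=96)
out = pm(torch.randn(1, 56 * 56, 96), 56, 56)   # -> (1, 28 * 28, 192)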


class BasicLayer(nn.Module):
""" A basic Swin Transformer layer for one stage.

Args:
dim (int): Number of feature channels
        depth (int): Depth of this stage (number of Swin Transformer blocks).
        num_heads (int): Number of attention heads.
window_size (int): Local window size. Default: 7.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
"""

def __init__(self,
dim,
depth,
num_heads,
window_size=7,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
norm_layer=nn.LayerNorm,
downsample=None,
use_checkpoint=False):
super().__init__()
self.window_size = window_size
self.shift_size = window_size // 2
self.depth = depth
self.use_checkpoint = use_checkpoint

# build blocks
self.blocks = nn.ModuleList([
SwinTransformerBlock(
dim=dim,
num_heads=num_heads,
window_size=window_size,
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop,
attn_drop=attn_drop,
drop_path=drop_path[i]
if isinstance(drop_path, list) else drop_path,
norm_layer=norm_layer) for i in range(depth)
])

# patch merging layer
if downsample is not None:
self.downsample = downsample(dim=dim, norm_layer=norm_layer)
else:
self.downsample = None

def forward(self, x, H, W):
""" Forward function.

Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
"""

# calculate attention mask for SW-MSA
Hp = int(np.ceil(H / self.window_size)) * self.window_size
Wp = int(np.ceil(W / self.window_size)) * self.window_size
img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
h_slices = (slice(0, -self.window_size),
slice(-self.window_size,
-self.shift_size), slice(-self.shift_size, None))
w_slices = (slice(0, -self.window_size),
slice(-self.window_size,
-self.shift_size), slice(-self.shift_size, None))
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt += 1

mask_windows = window_partition(
img_mask, self.window_size) # nW, window_size, window_size, 1
mask_windows = mask_windows.view(-1,
self.window_size * self.window_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0,
float(-100.0)).masked_fill(
attn_mask == 0, float(0.0))

for blk in self.blocks:
blk.H, blk.W = H, W
if self.use_checkpoint:
x = checkpoint.checkpoint(blk, x, attn_mask)
else:
x = blk(x, attn_mask)
if self.downsample is not None:
x_down = self.downsample(x, H, W)
Wh, Ww = (H + 1) // 2, (W + 1) // 2
return x, H, W, x_down, Wh, Ww
else:
return x, H, W, x, H, W


class PatchEmbed(nn.Module):
""" Image to Patch Embedding

Args:
patch_size (int): Patch token size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Module, optional): Normalization layer. Default: None
"""

def __init__(self,
patch_size=4,
in_chans=3,
embed_dim=96,
norm_layer=None):
super().__init__()
patch_size = to_2tuple(patch_size)
self.patch_size = patch_size

self.in_chans = in_chans
self.embed_dim = embed_dim

self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
if norm_layer is not None:
self.norm = norm_layer(embed_dim)
else:
self.norm = None

def forward(self, x):
"""Forward function."""
# padding
_, _, H, W = x.size()
if W % self.patch_size[1] != 0:
x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
if H % self.patch_size[0] != 0:
x = F.pad(x,
(0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))

x = self.proj(x) # B C Wh Ww
if self.norm is not None:
Wh, Ww = x.size(2), x.size(3)
x = x.flatten(2).transpose(1, 2)
x = self.norm(x)
x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)

return x
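
# Not part of the original diff: PatchEmbed is a strided convolution, so a
# 224x224 RGB image becomes a 4x-downsampled feature map with embed_dim channels.
import torch
import torch.nn as nn

pe = PatchEmbed(patch_size=4, in_chans=3, embed_dim=96, norm_layer=nn.LayerNorm)
out = pe(torch.randn(1, 3, 224, 224))           # -> (1, 96, 56, 56)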


class SwinTransformer(nn.Module):
""" Swin Transformer backbone.
    A PyTorch impl of: `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
https://arxiv.org/pdf/2103.14030

Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in the absolute position embedding. Default 224.
patch_size (int | tuple(int)): Patch size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
depths (tuple[int]): Depths of each Swin Transformer stage.
num_heads (tuple[int]): Number of attention head of each stage.
window_size (int): Window size. Default: 7.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
drop_rate (float): Dropout rate.
attn_drop_rate (float): Attention dropout rate. Default: 0.
drop_path_rate (float): Stochastic depth rate. Default: 0.2.
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
patch_norm (bool): If True, add normalization after patch embedding. Default: True.
out_indices (Sequence[int]): Output from which stages.
frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
-1 means not freezing any parameters.
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
"""

def __init__(self,
pretrain_img_size=224,
patch_size=4,
in_chans=3,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.2,
norm_layer=nn.LayerNorm,
ape=False,
patch_norm=True,
out_indices=(0, 1, 2, 3),
frozen_stages=-1,
use_checkpoint=False):
super().__init__()

self.pretrain_img_size = pretrain_img_size
self.num_layers = len(depths)
self.embed_dim = embed_dim
self.ape = ape
self.patch_norm = patch_norm
self.out_indices = out_indices
self.frozen_stages = frozen_stages

# split image into non-overlapping patches
self.patch_embed = PatchEmbed(
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None)

# absolute position embedding
if self.ape:
pretrain_img_size = to_2tuple(pretrain_img_size)
patch_size = to_2tuple(patch_size)
patches_resolution = [
pretrain_img_size[0] // patch_size[0],
pretrain_img_size[1] // patch_size[1]
]

self.absolute_pos_embed = nn.Parameter(
torch.zeros(1, embed_dim, patches_resolution[0],
patches_resolution[1]))
trunc_normal_(self.absolute_pos_embed, std=.02)

self.pos_drop = nn.Dropout(p=drop_rate)

# stochastic depth
dpr = [
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
] # stochastic depth decay rule

# build layers
self.layers = nn.ModuleList()
for i_layer in range(self.num_layers):
layer = BasicLayer(
dim=int(embed_dim * 2**i_layer),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
norm_layer=norm_layer,
downsample=PatchMerging if
(i_layer < self.num_layers - 1) else None,
use_checkpoint=use_checkpoint)
self.layers.append(layer)

num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
self.num_features = num_features

# add a norm layer for each output
for i_layer in out_indices:
layer = norm_layer(num_features[i_layer])
layer_name = f'norm{i_layer}'
self.add_module(layer_name, layer)

self._freeze_stages()

def _freeze_stages(self):
if self.frozen_stages >= 0:
self.patch_embed.eval()
for param in self.patch_embed.parameters():
param.requires_grad = False

if self.frozen_stages >= 1 and self.ape:
self.absolute_pos_embed.requires_grad = False

if self.frozen_stages >= 2:
self.pos_drop.eval()
for i in range(0, self.frozen_stages - 1):
m = self.layers[i]
m.eval()
for param in m.parameters():
param.requires_grad = False

def init_weights(self, pretrained=None):
"""Initialize the weights in backbone.

Args:
pretrained (str, optional): Path to pre-trained weights.
Defaults to None.
"""

def _init_weights(m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)

if isinstance(pretrained, str):
self.apply(_init_weights)
# logger = get_root_logger()
load_checkpoint(self, pretrained, strict=False)
elif pretrained is None:
self.apply(_init_weights)
else:
raise TypeError('pretrained must be a str or None')

def forward(self, x):
"""Forward function."""
x = self.patch_embed(x)

Wh, Ww = x.size(2), x.size(3)
if self.ape:
# interpolate the position embedding to the corresponding size
absolute_pos_embed = F.interpolate(
self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
x = (x + absolute_pos_embed).flatten(2).transpose(1,
2) # B Wh*Ww C
else:
x = x.flatten(2).transpose(1, 2)
x = self.pos_drop(x)

outs = []
for i in range(self.num_layers):
layer = self.layers[i]
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)

if i in self.out_indices:
norm_layer = getattr(self, f'norm{i}')
x_out = norm_layer(x_out)

out = x_out.view(-1, H, W,
self.num_features[i]).permute(0, 3, 1,
2).contiguous()
outs.append(out)

return tuple(outs)

def train(self, mode=True):
"""Convert the model into training mode while keep layers freezed."""
super(SwinTransformer, self).train(mode)
self._freeze_stages()
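
# Not part of the original diff: a minimal sketch of the backbone interface.
# It returns one feature map per stage listed in out_indices, with channel
# counts embed_dim * 2**i at strides 4, 8, 16 and 32. Passing a checkpoint path
# to init_weights() instead of None would load pretrained weights.
import torch

backbone = SwinTransformer(embed_dim=96, depths=[2, 2, 6, 2],
                           num_heads=[3, 6, 12, 24], window_size=7)
backbone.init_weights(pretrained=None)          # random init
backbone.eval()
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 224, 224))
# feats: (1, 96, 56, 56), (1, 192, 28, 28), (1, 384, 14, 14), (1, 768, 7, 7)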

+ 365
- 0
modelscope/models/cv/image_depth_estimation/networks/uper_crf_head.py View File

@@ -0,0 +1,365 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule

from .newcrf_utils import normal_init, resize


class PPM(nn.ModuleList):
"""Pooling Pyramid Module used in PSPNet.

Args:
pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
Module.
in_channels (int): Input channels.
channels (int): Channels after modules, before conv_seg.
conv_cfg (dict|None): Config of conv layers.
norm_cfg (dict|None): Config of norm layers.
act_cfg (dict): Config of activation layers.
align_corners (bool): align_corners argument of F.interpolate.
"""

def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_cfg,
act_cfg, align_corners):
super(PPM, self).__init__()
self.pool_scales = pool_scales
self.align_corners = align_corners
self.in_channels = in_channels
self.channels = channels
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg
for pool_scale in pool_scales:
# == if batch size = 1, BN is not supported, change to GN
if pool_scale == 1:
norm_cfg = dict(type='GN', requires_grad=True, num_groups=256)
self.append(
nn.Sequential(
nn.AdaptiveAvgPool2d(pool_scale),
ConvModule(
self.in_channels,
self.channels,
1,
conv_cfg=self.conv_cfg,
norm_cfg=norm_cfg,
act_cfg=self.act_cfg)))

def forward(self, x):
"""Forward function."""
ppm_outs = []
for ppm in self:
ppm_out = ppm(x)
upsampled_ppm_out = resize(
ppm_out,
size=x.size()[2:],
mode='bilinear',
align_corners=self.align_corners)
ppm_outs.append(upsampled_ppm_out)
return ppm_outs


class BaseDecodeHead(nn.Module):
"""Base class for BaseDecodeHead.

Args:
in_channels (int|Sequence[int]): Input channels.
channels (int): Channels after modules, before conv_seg.
num_classes (int): Number of classes.
dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
conv_cfg (dict|None): Config of conv layers. Default: None.
norm_cfg (dict|None): Config of norm layers. Default: None.
act_cfg (dict): Config of activation layers.
Default: dict(type='ReLU')
in_index (int|Sequence[int]): Input feature index. Default: -1
input_transform (str|None): Transformation type of input features.
Options: 'resize_concat', 'multiple_select', None.
            'resize_concat': Multiple feature maps will be resized to the
                same size as the first one and then concatenated together.
                Usually used in FCN head of HRNet.
            'multiple_select': Multiple feature maps will be bundled into
                a list and passed into decode head.
None: Only one select feature map is allowed.
Default: None.
loss_decode (dict): Config of decode loss.
Default: dict(type='CrossEntropyLoss').
ignore_index (int | None): The label index to be ignored. When using
masked BCE loss, ignore_index should be set to None. Default: 255
sampler (dict|None): The config of segmentation map sampler.
Default: None.
align_corners (bool): align_corners argument of F.interpolate.
Default: False.
"""

def __init__(self,
in_channels,
channels,
*,
num_classes,
dropout_ratio=0.1,
conv_cfg=None,
norm_cfg=None,
act_cfg=dict(type='ReLU'),
in_index=-1,
input_transform=None,
loss_decode=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
ignore_index=255,
sampler=None,
align_corners=False):
super(BaseDecodeHead, self).__init__()
self._init_inputs(in_channels, in_index, input_transform)
self.channels = channels
self.num_classes = num_classes
self.dropout_ratio = dropout_ratio
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg
self.in_index = in_index
# self.loss_decode = build_loss(loss_decode)
self.ignore_index = ignore_index
self.align_corners = align_corners
# if sampler is not None:
# self.sampler = build_pixel_sampler(sampler, context=self)
# else:
# self.sampler = None

# self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
# self.conv1 = nn.Conv2d(channels, num_classes, 3, padding=1)
if dropout_ratio > 0:
self.dropout = nn.Dropout2d(dropout_ratio)
else:
self.dropout = None
self.fp16_enabled = False

def extra_repr(self):
"""Extra repr."""
s = f'input_transform={self.input_transform}, ' \
f'ignore_index={self.ignore_index}, ' \
f'align_corners={self.align_corners}'
return s

def _init_inputs(self, in_channels, in_index, input_transform):
"""Check and initialize input transforms.

The in_channels, in_index and input_transform must match.
        Specifically, when input_transform is None, only a single feature map
        will be selected, so in_channels and in_index must be of type int.
        When input_transform is not None, in_channels and in_index must be a
        list or tuple with the same length.

Args:
in_channels (int|Sequence[int]): Input channels.
in_index (int|Sequence[int]): Input feature index.
input_transform (str|None): Transformation type of input features.
Options: 'resize_concat', 'multiple_select', None.
                'resize_concat': Multiple feature maps will be resized to the
                    same size as the first one and then concatenated together.
                    Usually used in FCN head of HRNet.
                'multiple_select': Multiple feature maps will be bundled into
                    a list and passed into decode head.
None: Only one select feature map is allowed.
"""

if input_transform is not None:
assert input_transform in ['resize_concat', 'multiple_select']
self.input_transform = input_transform
self.in_index = in_index
if input_transform is not None:
assert isinstance(in_channels, (list, tuple))
assert isinstance(in_index, (list, tuple))
assert len(in_channels) == len(in_index)
if input_transform == 'resize_concat':
self.in_channels = sum(in_channels)
else:
self.in_channels = in_channels
else:
assert isinstance(in_channels, int)
assert isinstance(in_index, int)
self.in_channels = in_channels

def init_weights(self):
"""Initialize weights of classification layer."""
# normal_init(self.conv_seg, mean=0, std=0.01)
# normal_init(self.conv1, mean=0, std=0.01)

def _transform_inputs(self, inputs):
"""Transform inputs for decoder.

Args:
inputs (list[Tensor]): List of multi-level img features.

Returns:
Tensor: The transformed inputs
"""

if self.input_transform == 'resize_concat':
inputs = [inputs[i] for i in self.in_index]
upsampled_inputs = [
resize(
input=x,
size=inputs[0].shape[2:],
mode='bilinear',
align_corners=self.align_corners) for x in inputs
]
inputs = torch.cat(upsampled_inputs, dim=1)
elif self.input_transform == 'multiple_select':
inputs = [inputs[i] for i in self.in_index]
else:
inputs = inputs[self.in_index]

return inputs

def forward(self, inputs):
"""Placeholder of forward function."""
pass

def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg):
"""Forward function for training.
Args:
inputs (list[Tensor]): List of multi-level img features.
img_metas (list[dict]): List of image info dict where each dict
has: 'img_shape', 'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
`mmseg/datasets/pipelines/formatting.py:Collect`.
gt_semantic_seg (Tensor): Semantic segmentation masks
used if the architecture supports semantic segmentation task.
train_cfg (dict): The training config.

Returns:
dict[str, Tensor]: a dictionary of loss components
"""
seg_logits = self.forward(inputs)
losses = self.losses(seg_logits, gt_semantic_seg)
return losses

def forward_test(self, inputs, img_metas, test_cfg):
"""Forward function for testing.

Args:
inputs (list[Tensor]): List of multi-level img features.
img_metas (list[dict]): List of image info dict where each dict
has: 'img_shape', 'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
`mmseg/datasets/pipelines/formatting.py:Collect`.
test_cfg (dict): The testing config.

Returns:
Tensor: Output segmentation map.
"""
return self.forward(inputs)


class UPerHead(BaseDecodeHead):

def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs):
super(UPerHead, self).__init__(
input_transform='multiple_select', **kwargs)
# FPN Module
self.lateral_convs = nn.ModuleList()
self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels:  # one lateral conv and one FPN conv per input level
l_conv = ConvModule(
in_channels,
self.channels,
1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg,
inplace=True)
fpn_conv = ConvModule(
self.channels,
self.channels,
3,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg,
inplace=True)
self.lateral_convs.append(l_conv)
self.fpn_convs.append(fpn_conv)

def forward(self, inputs):
"""Forward function."""

inputs = self._transform_inputs(inputs)

# build laterals
laterals = [
lateral_conv(inputs[i])
for i, lateral_conv in enumerate(self.lateral_convs)
]

# laterals.append(self.psp_forward(inputs))

# build top-down path
used_backbone_levels = len(laterals)
for i in range(used_backbone_levels - 1, 0, -1):
prev_shape = laterals[i - 1].shape[2:]
laterals[i - 1] += resize(
laterals[i],
size=prev_shape,
mode='bilinear',
align_corners=self.align_corners)

# build outputs
fpn_outs = [
self.fpn_convs[i](laterals[i])
for i in range(used_backbone_levels - 1)
]
# append psp feature
fpn_outs.append(laterals[-1])

return fpn_outs[0]


class PSP(BaseDecodeHead):
"""Unified Perceptual Parsing for Scene Understanding.

This head is the implementation of `UPerNet
<https://arxiv.org/abs/1807.10221>`_.

Args:
pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
Module applied on the last feature. Default: (1, 2, 3, 6).
"""

def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs):
super(PSP, self).__init__(input_transform='multiple_select', **kwargs)
# PSP Module
self.psp_modules = PPM(
pool_scales,
self.in_channels[-1],
self.channels,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg,
align_corners=self.align_corners)
self.bottleneck = ConvModule(
self.in_channels[-1] + len(pool_scales) * self.channels,
self.channels,
3,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg)

def psp_forward(self, inputs):
"""Forward function of PSP module."""
x = inputs[-1]
psp_outs = [x]
psp_outs.extend(self.psp_modules(x))
psp_outs = torch.cat(psp_outs, dim=1)
output = self.bottleneck(psp_outs)

return output

def forward(self, inputs):
"""Forward function."""
inputs = self._transform_inputs(inputs)

return self.psp_forward(inputs)
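
# Not part of the original diff: a sketch of the PSP head on a 4-scale Swin-like
# feature pyramid. Only the coarsest map feeds the pyramid pooling, and the
# output keeps that map's resolution with `channels` channels. Channel counts
# and sizes below are arbitrary illustrations; mmcv must be installed (ConvModule).
import torch

head = PSP(in_channels=[96, 192, 384, 768], in_index=[0, 1, 2, 3],
           channels=512, num_classes=1,
           norm_cfg=dict(type='BN', requires_grad=True), align_corners=False)
head.eval()
feats = [torch.randn(2, c, s, s) for c, s in
         [(96, 56), (192, 28), (384, 14), (768, 7)]]
with torch.no_grad():
    out = head(feats)                           # -> (2, 512, 7, 7)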

+ 53
- 0
modelscope/models/cv/image_depth_estimation/newcrfs_model.py View File

@@ -0,0 +1,53 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp

import numpy as np
import torch

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.image_depth_estimation.networks.newcrf_depth import \
NewCRFDepth
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModelFile, Tasks


@MODELS.register_module(
Tasks.image_depth_estimation, module_name=Models.newcrfs_depth_estimation)
class DepthEstimation(TorchModel):

def __init__(self, model_dir: str, **kwargs):
"""str -- model file root."""
super().__init__(model_dir, **kwargs)

# build model
self.model = NewCRFDepth(
version='large07', inv_depth=False, max_depth=10)

# load model
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
checkpoint = torch.load(model_path)

state_dict = {}
for k in checkpoint['model'].keys():
if k.startswith('module.'):
state_dict[k[7:]] = checkpoint['model'][k]
else:
state_dict[k] = checkpoint['model'][k]
self.model.load_state_dict(state_dict)
self.model.eval()

def forward(self, Inputs):
return self.model(Inputs['imgs'])

def postprocess(self, Inputs):
depth_result = Inputs

results = {OutputKeys.DEPTHS: depth_result}
return results

def inference(self, data):
results = self.forward(data)

return results
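
# Not part of the original diff: a hedged usage sketch. The directory path is
# hypothetical -- it must contain the checkpoint named by ModelFile.TORCH_MODEL_FILE
# that the constructor loads, and the input resolution is assumed to be one the
# NewCRFDepth network accepts.
import torch

model = DepthEstimation('/path/to/newcrfs_model_dir')        # hypothetical local dir
data = {'imgs': torch.randn(1, 3, 480, 640)}                 # assumed valid input size
with torch.no_grad():
    depth = model.inference(data)
result = model.postprocess(depth)                            # {OutputKeys.DEPTHS: depth}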

+ 8
- 1
modelscope/models/cv/image_semantic_segmentation/vit_adapter/utils/seg_func.py View File

@@ -25,7 +25,14 @@ def seg_resize(input,
'the output would more aligned if '
f'input size {(input_h, input_w)} is `x+1` and '
f'out size {(output_h, output_w)} is `nx+1`')
return F.interpolate(input, size, scale_factor, mode, align_corners)

try:
return F.interpolate(input, size, scale_factor, mode, align_corners)
except ValueError:
if isinstance(size, tuple):
if len(size) == 3:
size = size[:2]
return F.interpolate(input, size, scale_factor, mode, align_corners)


def add_prefix(inputs, prefix):


+ 1
- 0
modelscope/models/cv/salient_detection/models/__init__.py View File

@@ -1,3 +1,4 @@
# The implementation is adopted from U-2-Net, made publicly available under the Apache 2.0 License
# source code available via https://github.com/xuebinqin/U-2-Net
from .senet import SENet
from .u2net import U2NET

+ 187
- 0
modelscope/models/cv/salient_detection/models/backbone/Res2Net_v1b.py View File

@@ -0,0 +1,187 @@
# Implementation in this file is modified based on Res2Net-PretrainedModels
# Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License
# publicly available at https://github.com/Res2Net/Res2Net-PretrainedModels/blob/master/res2net_v1b.py
import math

import torch
import torch.nn as nn

__all__ = ['Res2Net', 'res2net50_v1b_26w_4s']


class Bottle2neck(nn.Module):
expansion = 4

def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
baseWidth=26,
scale=4,
stype='normal'):
""" Constructor
Args:
inplanes: input channel dimensionality
planes: output channel dimensionality
stride: conv stride. Replaces pooling layer.
downsample: None when stride = 1
baseWidth: basic width of conv3x3
            scale: number of scales.
            stype: 'normal' for a regular block; 'stage' for the first block of a new stage.
"""
super(Bottle2neck, self).__init__()
width = int(math.floor(planes * (baseWidth / 64.0)))
self.conv1 = nn.Conv2d(
inplanes, width * scale, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(width * scale)
if scale == 1:
self.nums = 1
else:
self.nums = scale - 1
if stype == 'stage':
self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1)
convs = []
bns = []
for i in range(self.nums):
convs.append(
nn.Conv2d(
width,
width,
kernel_size=3,
stride=stride,
padding=1,
bias=False))
bns.append(nn.BatchNorm2d(width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.conv3 = nn.Conv2d(
width * scale, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stype = stype
self.scale = scale
self.width = width

def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
spx = torch.split(out, self.width, 1)
for i in range(self.nums):
if i == 0 or self.stype == 'stage':
sp = spx[i]
else:
sp = sp + spx[i]
sp = self.convs[i](sp)
sp = self.relu(self.bns[i](sp))
if i == 0:
out = sp
else:
out = torch.cat((out, sp), 1)
if self.scale != 1 and self.stype == 'normal':
out = torch.cat((out, spx[self.nums]), 1)
elif self.scale != 1 and self.stype == 'stage':
out = torch.cat((out, self.pool(spx[self.nums])), 1)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out


class Res2Net(nn.Module):

def __init__(self, block, layers, baseWidth=26, scale=4, num_classes=1000):
self.inplanes = 64
super(Res2Net, self).__init__()
self.baseWidth = baseWidth
self.scale = scale
self.conv1 = nn.Sequential(
nn.Conv2d(3, 32, 3, 2, 1, bias=False), nn.BatchNorm2d(32),
nn.ReLU(inplace=True), nn.Conv2d(32, 32, 3, 1, 1, bias=False),
nn.BatchNorm2d(32), nn.ReLU(inplace=True),
nn.Conv2d(32, 64, 3, 1, 1, bias=False))
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU()
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)

def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.AvgPool2d(
kernel_size=stride,
stride=stride,
ceil_mode=True,
count_include_pad=False),
nn.Conv2d(
self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=1,
bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(
block(
self.inplanes,
planes,
stride,
downsample=downsample,
stype='stage',
baseWidth=self.baseWidth,
scale=self.scale))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(
block(
self.inplanes,
planes,
baseWidth=self.baseWidth,
scale=self.scale))
return nn.Sequential(*layers)

def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x


def res2net50_v1b_26w_4s(backbone_path, pretrained=False, **kwargs):
"""Constructs a Res2Net-50_v1b_26w_4s lib.
Args:
pretrained (bool): If True, returns a lib pre-trained on ImageNet
"""
model = Res2Net(Bottle2neck, [3, 4, 6, 3], baseWidth=26, scale=4, **kwargs)
if pretrained:
model_state = torch.load(backbone_path)
model.load_state_dict(model_state)
return model
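
# Not part of the original diff: a minimal sketch. With pretrained=False the
# backbone_path is unused; the classification head outputs num_classes logits.
import torch

net = res2net50_v1b_26w_4s(backbone_path=None, pretrained=False, num_classes=1000)
net.eval()
with torch.no_grad():
    logits = net(torch.randn(1, 3, 224, 224))   # -> (1, 1000)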

+ 6
- 0
modelscope/models/cv/salient_detection/models/backbone/__init__.py View File

@@ -0,0 +1,6 @@
# Implementation in this file is modified based on Res2Net-PretrainedModels
# Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License
# publicly available at https://github.com/Res2Net/Res2Net-PretrainedModels/blob/master/res2net_v1b.py
from .Res2Net_v1b import res2net50_v1b_26w_4s

__all__ = ['res2net50_v1b_26w_4s']

+ 178
- 0
modelscope/models/cv/salient_detection/models/modules.py View File

@@ -0,0 +1,178 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn
import torch.nn.functional as F

from .utils import ConvBNReLU


class AreaLayer(nn.Module):

def __init__(self, in_channel, out_channel):
super(AreaLayer, self).__init__()
self.lbody = nn.Sequential(
nn.Conv2d(out_channel, out_channel, 1),
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True))
self.hbody = nn.Sequential(
nn.Conv2d(in_channel, out_channel, 1), nn.BatchNorm2d(out_channel),
nn.ReLU(inplace=True))
self.body = nn.Sequential(
nn.Conv2d(2 * out_channel, out_channel, 3, 1, 1),
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True),
nn.Conv2d(out_channel, out_channel, 3, 1, 1),
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True),
nn.Conv2d(out_channel, 1, 1))

def forward(self, xl, xh):
xl1 = self.lbody(xl)
xl1 = F.interpolate(
xl1, size=xh.size()[2:], mode='bilinear', align_corners=True)
xh1 = self.hbody(xh)
x = torch.cat((xl1, xh1), dim=1)
x_out = self.body(x)
return x_out


class EdgeLayer(nn.Module):

def __init__(self, in_channel, out_channel):
super(EdgeLayer, self).__init__()
self.lbody = nn.Sequential(
nn.Conv2d(out_channel, out_channel, 1),
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True))
self.hbody = nn.Sequential(
nn.Conv2d(in_channel, out_channel, 1), nn.BatchNorm2d(out_channel),
nn.ReLU(inplace=True))
self.bodye = nn.Sequential(
nn.Conv2d(2 * out_channel, out_channel, 3, 1, 1),
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True),
nn.Conv2d(out_channel, out_channel, 3, 1, 1),
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True),
nn.Conv2d(out_channel, 1, 1))

def forward(self, xl, xh):
xl1 = self.lbody(xl)
xh1 = self.hbody(xh)
xh1 = F.interpolate(
xh1, size=xl.size()[2:], mode='bilinear', align_corners=True)
x = torch.cat((xl1, xh1), dim=1)
x_out = self.bodye(x)
return x_out


class EBlock(nn.Module):

def __init__(self, inchs, outchs):
super(EBlock, self).__init__()
self.elayer = nn.Sequential(
ConvBNReLU(inchs + 1, outchs, kernel_size=3, padding=1, stride=1),
ConvBNReLU(outchs, outchs, 1))
self.salayer = nn.Sequential(
nn.Conv2d(2, 1, 3, 1, 1, bias=False),
nn.BatchNorm2d(1, momentum=0.01), nn.Sigmoid())

def forward(self, x, edgeAtten):
x = torch.cat((x, edgeAtten), dim=1)
ex = self.elayer(x)
ex_max = torch.max(ex, 1, keepdim=True)[0]
ex_mean = torch.mean(ex, dim=1, keepdim=True)
xei_compress = torch.cat((ex_max, ex_mean), dim=1)

scale = self.salayer(xei_compress)
x_out = ex * scale
return x_out


class StructureE(nn.Module):

def __init__(self, inchs, outchs, EM):
super(StructureE, self).__init__()
self.ne_modules = int(inchs / EM)
NM = int(outchs / self.ne_modules)
elayes = []
for i in range(self.ne_modules):
emblock = EBlock(EM, NM)
elayes.append(emblock)
self.emlayes = nn.ModuleList(elayes)
self.body = nn.Sequential(
ConvBNReLU(outchs, outchs, 3, 1, 1), ConvBNReLU(outchs, outchs, 1))

def forward(self, x, edgeAtten):
if edgeAtten.size() != x.size():
edgeAtten = F.interpolate(
edgeAtten, x.size()[2:], mode='bilinear', align_corners=False)
xx = torch.chunk(x, self.ne_modules, dim=1)
efeas = []
for i in range(self.ne_modules):
xei = self.emlayes[i](xx[i], edgeAtten)
efeas.append(xei)
efeas = torch.cat(efeas, dim=1)
x_out = self.body(efeas)
return x_out


class ABlock(nn.Module):

def __init__(self, inchs, outchs, k):
super(ABlock, self).__init__()
self.alayer = nn.Sequential(
ConvBNReLU(inchs, outchs, k, 1, k // 2),
ConvBNReLU(outchs, outchs, 1))
self.arlayer = nn.Sequential(
ConvBNReLU(inchs, outchs, k, 1, k // 2),
ConvBNReLU(outchs, outchs, 1))
self.fusion = ConvBNReLU(2 * outchs, outchs, 1)

def forward(self, x, areaAtten):
xa = x * areaAtten
xra = x * (1 - areaAtten)
xout = self.fusion(torch.cat((xa, xra), dim=1))
return xout


class AMFusion(nn.Module):

def __init__(self, inchs, outchs, AM):
super(AMFusion, self).__init__()
self.k = [3, 3, 5, 5]
self.conv_up = ConvBNReLU(inchs, outchs, 3, 1, 1)
self.up = nn.Upsample(
scale_factor=2, mode='bilinear', align_corners=True)
self.na_modules = int(outchs / AM)
alayers = []
for i in range(self.na_modules):
layer = ABlock(AM, AM, self.k[i])
alayers.append(layer)
self.alayers = nn.ModuleList(alayers)
self.fusion_0 = ConvBNReLU(outchs, outchs, 3, 1, 1)
self.fusion_e = nn.Sequential(
nn.Conv2d(
outchs, outchs, kernel_size=(3, 1), padding=(1, 0),
bias=False), nn.BatchNorm2d(outchs), nn.ReLU(inplace=True),
nn.Conv2d(
outchs, outchs, kernel_size=(1, 3), padding=(0, 1),
bias=False), nn.BatchNorm2d(outchs), nn.ReLU(inplace=True))
self.fusion_e1 = nn.Sequential(
nn.Conv2d(
outchs, outchs, kernel_size=(5, 1), padding=(2, 0),
bias=False), nn.BatchNorm2d(outchs), nn.ReLU(inplace=True),
nn.Conv2d(
outchs, outchs, kernel_size=(1, 5), padding=(0, 2),
bias=False), nn.BatchNorm2d(outchs), nn.ReLU(inplace=True))
self.fusion = ConvBNReLU(3 * outchs, outchs, 1)

def forward(self, xl, xh, xhm):
xh1 = self.up(self.conv_up(xh))
x = xh1 + xl
xm = self.up(torch.sigmoid(xhm))
xx = torch.chunk(x, self.na_modules, dim=1)
xxmids = []
for i in range(self.na_modules):
xi = self.alayers[i](xx[i], xm)
xxmids.append(xi)
xfea = torch.cat(xxmids, dim=1)
x0 = self.fusion_0(xfea)
x1 = self.fusion_e(xfea)
x2 = self.fusion_e1(xfea)
x_out = self.fusion(torch.cat((x0, x1, x2), dim=1))
return x_out
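
As a rough illustration of how SENet (defined below) wires these decoder blocks: AreaLayer takes a low-level feature (out_channel channels, finer resolution) and a high-level feature (in_channel channels, coarser resolution) and emits a single-channel area attention at the coarser resolution. The channel and spatial sizes in this sketch are hypothetical.

import torch

from modelscope.models.cv.salient_detection.models.modules import AreaLayer

# Sizes chosen to mirror how SENet calls this block
# (xl = layer1 features, xh = fused layer3/4 features).
layer = AreaLayer(in_channel=512, out_channel=256).eval()
xl = torch.randn(1, 256, 88, 88)   # low-level feature, finer resolution
xh = torch.randn(1, 512, 22, 22)   # high-level feature, coarser resolution
with torch.no_grad():
    area_atten = layer(xl, xh)
print(area_atten.shape)  # torch.Size([1, 1, 22, 22]) -- attention at the coarse resolution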

+ 74
- 0
modelscope/models/cv/salient_detection/models/senet.py View File

@@ -0,0 +1,74 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn
import torch.nn.functional as F

from .backbone import res2net50_v1b_26w_4s as res2net
from .modules import AMFusion, AreaLayer, EdgeLayer, StructureE
from .utils import ASPP, CBAM, ConvBNReLU


class SENet(nn.Module):

def __init__(self, backbone_path=None, pretrained=False):
super(SENet, self).__init__()
resnet50 = res2net(backbone_path, pretrained)
self.layer0_1 = nn.Sequential(resnet50.conv1, resnet50.bn1,
resnet50.relu)
self.maxpool = resnet50.maxpool
self.layer1 = resnet50.layer1
self.layer2 = resnet50.layer2
self.layer3 = resnet50.layer3
self.layer4 = resnet50.layer4
self.aspp3 = ASPP(1024, 256)
self.aspp4 = ASPP(2048, 256)
self.cbblock3 = CBAM(inchs=256, kernel_size=5)
self.cbblock4 = CBAM(inchs=256, kernel_size=5)
self.up = nn.Upsample(
mode='bilinear', scale_factor=2, align_corners=False)
self.conv_up = ConvBNReLU(512, 512, 1)
self.aux_edge = EdgeLayer(512, 256)
self.aux_area = AreaLayer(512, 256)
self.layer1_enhance = StructureE(256, 128, 128)
self.layer2_enhance = StructureE(512, 256, 128)
self.layer3_decoder = AMFusion(512, 256, 128)
self.layer2_decoder = AMFusion(256, 128, 128)
self.out_conv_8 = nn.Conv2d(256, 1, 1)
self.out_conv_4 = nn.Conv2d(128, 1, 1)

def forward(self, x):
layer0 = self.layer0_1(x)
layer0s = self.maxpool(layer0)
layer1 = self.layer1(layer0s)
layer2 = self.layer2(layer1)
layer3 = self.layer3(layer2)
layer4 = self.layer4(layer3)
layer3_eh = self.cbblock3(self.aspp3(layer3))
layer4_eh = self.cbblock4(self.aspp4(layer4))
layer34 = self.conv_up(
torch.cat((self.up(layer4_eh), layer3_eh), dim=1))
edge_atten = self.aux_edge(layer1, layer34)
area_atten = self.aux_area(layer1, layer34)
edge_atten_ = torch.sigmoid(edge_atten)
layer1_eh = self.layer1_enhance(layer1, edge_atten_)
layer2_eh = self.layer2_enhance(layer2, edge_atten_)
layer2_fu = self.layer3_decoder(layer2_eh, layer34, area_atten)
out_8 = self.out_conv_8(layer2_fu)
layer1_fu = self.layer2_decoder(layer1_eh, layer2_fu, out_8)
out_4 = self.out_conv_4(layer1_fu)
out_16 = F.interpolate(
area_atten,
size=x.size()[2:],
mode='bilinear',
align_corners=False)
out_8 = F.interpolate(
out_8, size=x.size()[2:], mode='bilinear', align_corners=False)
out_4 = F.interpolate(
out_4, size=x.size()[2:], mode='bilinear', align_corners=False)
edge_out = F.interpolate(
edge_atten_,
size=x.size()[2:],
mode='bilinear',
align_corners=False)

return out_4.sigmoid(), out_8.sigmoid(), out_16.sigmoid(), edge_out
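
A minimal forward pass (random weights, no pretrained backbone) shows the four outputs: three saliency maps taken at different decoder depths plus the edge map, all resized to the input resolution and mapped to [0, 1]. The input size below is illustrative; it only needs to be divisible by 32 for the deepest backbone stage.

import torch

from modelscope.models.cv.salient_detection.models.senet import SENet

net = SENet(backbone_path=None, pretrained=False).eval()
with torch.no_grad():
    out_4, out_8, out_16, edge = net(torch.randn(1, 3, 352, 352))
print(out_4.shape, out_8.shape, out_16.shape, edge.shape)
# torch.Size([1, 1, 352, 352]) for each output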

+ 105
- 0
modelscope/models/cv/salient_detection/models/utils.py View File

@@ -0,0 +1,105 @@
# Implementation in this file is modified based on deeplabv3
# Originally MIT license, publicly available at https://github.com/fregu856/deeplabv3/blob/master/model/aspp.py
# Implementation in this file is modified based on attention-module
# Originally MIT license, publicly available at https://github.com/Jongchan/attention-module/blob/master/MODELS/cbam.py
import torch
import torch.nn as nn


class ConvBNReLU(nn.Module):

def __init__(self,
inplanes,
planes,
kernel_size=3,
stride=1,
padding=0,
dilation=1,
bias=False):
super(ConvBNReLU, self).__init__()
self.block = nn.Sequential(
nn.Conv2d(
inplanes,
planes,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias), nn.BatchNorm2d(planes), nn.ReLU(inplace=True))

def forward(self, x):
return self.block(x)


class ASPP(nn.Module):

def __init__(self, in_dim, out_dim):
super(ASPP, self).__init__()
mid_dim = 128
self.conv1 = ConvBNReLU(in_dim, mid_dim, kernel_size=1)
self.conv2 = ConvBNReLU(
in_dim, mid_dim, kernel_size=3, padding=2, dilation=2)
self.conv3 = ConvBNReLU(
in_dim, mid_dim, kernel_size=3, padding=5, dilation=5)
self.conv4 = ConvBNReLU(
in_dim, mid_dim, kernel_size=3, padding=7, dilation=7)
self.conv5 = ConvBNReLU(in_dim, mid_dim, kernel_size=1, padding=0)
self.fuse = ConvBNReLU(5 * mid_dim, out_dim, 3, 1, 1)
self.global_pooling = nn.AdaptiveAvgPool2d(1)

def forward(self, x):
conv1 = self.conv1(x)
conv2 = self.conv2(x)
conv3 = self.conv3(x)
conv4 = self.conv4(x)
xg = self.conv5(self.global_pooling(x))
conv5 = nn.Upsample((x.shape[2], x.shape[3]), mode='nearest')(xg)
return self.fuse(torch.cat((conv1, conv2, conv3, conv4, conv5), 1))


class ChannelAttention(nn.Module):

def __init__(self, inchs, ratio=16):
super(ChannelAttention, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.max_pool = nn.AdaptiveMaxPool2d(1)
self.fc = nn.Sequential(
nn.Conv2d(inchs, inchs // ratio, 1, bias=False), nn.ReLU(),
nn.Conv2d(inchs // ratio, inchs, 1, bias=False))
self.sigmoid = nn.Sigmoid()

def forward(self, x):
avg_out = self.fc(self.avg_pool(x))
max_out = self.fc(self.max_pool(x))
out = avg_out + max_out
return self.sigmoid(out)


class SpatialAttention(nn.Module):

def __init__(self, kernel_size=7):
super(SpatialAttention, self).__init__()

self.conv1 = nn.Conv2d(
2, 1, kernel_size, padding=kernel_size // 2, bias=False)
self.sigmoid = nn.Sigmoid()

def forward(self, x):
avg_out = torch.mean(x, dim=1, keepdim=True)
max_out, _ = torch.max(x, dim=1, keepdim=True)
x = torch.cat([avg_out, max_out], dim=1)
x = self.conv1(x)
return self.sigmoid(x)


class CBAM(nn.Module):

def __init__(self, inchs, kernel_size=7):
super().__init__()
self.calayer = ChannelAttention(inchs=inchs)
self.saLayer = SpatialAttention(kernel_size=kernel_size)

def forward(self, x):
xca = self.calayer(x) * x
xsa = self.saLayer(xca) * xca
return xsa
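
SENet chains these two utilities on each high-level stage: ASPP enlarges the receptive field, then CBAM reweights channels and positions. A quick shape check with illustrative sizes, assuming the module path shown in the file header above:

import torch

from modelscope.models.cv.salient_detection.models.utils import ASPP, CBAM

aspp = ASPP(in_dim=1024, out_dim=256).eval()   # e.g. applied to a layer3-sized feature
cbam = CBAM(inchs=256, kernel_size=5).eval()
with torch.no_grad():
    feat = torch.randn(1, 1024, 22, 22)
    out = cbam(aspp(feat))
print(out.shape)  # torch.Size([1, 256, 22, 22]); attention reweights but keeps the shape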

+ 18
- 6
modelscope/models/cv/salient_detection/salient_model.py View File

@@ -2,7 +2,6 @@
import os.path as osp

import cv2
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
@@ -10,8 +9,9 @@ from torchvision import transforms
from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from .models import U2NET
from .models import U2NET, SENet


@MODELS.register_module(
@@ -22,13 +22,25 @@ class SalientDetection(TorchModel):
"""str -- model file root."""
super().__init__(model_dir, *args, **kwargs)
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
self.model = U2NET(3, 1)

self.norm_mean = [0.485, 0.456, 0.406]
self.norm_std = [0.229, 0.224, 0.225]
self.norm_size = (320, 320)

config_path = osp.join(model_dir, 'config.py')
if not osp.exists(config_path):
self.model = U2NET(3, 1)
else:
self.model = SENet(backbone_path=None, pretrained=False)
config = Config.from_file(config_path)
self.norm_mean = config.norm_mean
self.norm_std = config.norm_std
self.norm_size = config.norm_size
checkpoint = torch.load(model_path, map_location='cpu')
self.transform_input = transforms.Compose([
transforms.Resize((320, 320)),
transforms.Resize(self.norm_size),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
transforms.Normalize(mean=self.norm_mean, std=self.norm_std)
])
self.model.load_state_dict(checkpoint)
self.model.eval()
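
When a config.py sits next to the checkpoint, the SENet branch is taken and the normalization settings are read from it; otherwise the U2NET defaults above apply. A hypothetical config.py would carry just these three fields (values here are illustrative; the real file ships with the model):

# config.py -- illustrative contents, read via Config.from_file
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]
norm_size = (352, 352)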


+ 1
- 1
modelscope/models/cv/tinynas_detection/__init__.py View File

@@ -1,5 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

from typing import TYPE_CHECKING



+ 7
- 4
modelscope/models/cv/tinynas_detection/backbone/__init__.py View File

@@ -1,10 +1,11 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

import copy

from .darknet import CSPDarknet
from .tinynas import load_tinynas_net
from .tinynas_csp import load_tinynas_net as load_tinynas_net_csp
from .tinynas_res import load_tinynas_net as load_tinynas_net_res


def build_backbone(cfg):
@@ -12,5 +13,7 @@ def build_backbone(cfg):
name = backbone_cfg.pop('name')
if name == 'CSPDarknet':
return CSPDarknet(**backbone_cfg)
elif name == 'TinyNAS':
return load_tinynas_net(backbone_cfg)
elif name == 'TinyNAS_csp':
return load_tinynas_net_csp(backbone_cfg)
elif name == 'TinyNAS_res':
return load_tinynas_net_res(backbone_cfg)

+ 2
- 3
modelscope/models/cv/tinynas_detection/backbone/darknet.py View File

@@ -1,12 +1,11 @@
# Copyright (c) Megvii Inc. All rights reserved.
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import torch
from torch import nn

from ..core.base_ops import (BaseConv, CSPLayer, DWConv, Focus, ResLayer,
SPPBottleneck)
from modelscope.models.cv.tinynas_detection.core.base_ops import (
BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck)


class CSPDarknet(nn.Module):


+ 0
- 359
modelscope/models/cv/tinynas_detection/backbone/tinynas.py View File

@@ -1,359 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import torch
import torch.nn as nn

from modelscope.utils.file_utils import read_file
from ..core.base_ops import Focus, SPPBottleneck, get_activation
from ..core.repvgg_block import RepVggBlock


class ConvKXBN(nn.Module):

def __init__(self, in_c, out_c, kernel_size, stride):
super(ConvKXBN, self).__init__()
self.conv1 = nn.Conv2d(
in_c,
out_c,
kernel_size,
stride, (kernel_size - 1) // 2,
groups=1,
bias=False)
self.bn1 = nn.BatchNorm2d(out_c)

def forward(self, x):
return self.bn1(self.conv1(x))


class ConvKXBNRELU(nn.Module):

def __init__(self, in_c, out_c, kernel_size, stride, act='silu'):
super(ConvKXBNRELU, self).__init__()
self.conv = ConvKXBN(in_c, out_c, kernel_size, stride)
if act is None:
self.activation_function = torch.relu
else:
self.activation_function = get_activation(act)

def forward(self, x):
output = self.conv(x)
return self.activation_function(output)


class ResConvK1KX(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
force_resproj=False,
act='silu',
reparam=False):
super(ResConvK1KX, self).__init__()
self.stride = stride
self.conv1 = ConvKXBN(in_c, btn_c, 1, 1)
if not reparam:
self.conv2 = ConvKXBN(btn_c, out_c, 3, stride)
else:
self.conv2 = RepVggBlock(
btn_c, out_c, kernel_size, stride, act='identity')

if act is None:
self.activation_function = torch.relu
else:
self.activation_function = get_activation(act)

if stride == 2:
self.residual_downsample = nn.AvgPool2d(kernel_size=2, stride=2)
else:
self.residual_downsample = nn.Identity()

if in_c != out_c or force_resproj:
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1)
else:
self.residual_proj = nn.Identity()

def forward(self, x):
if self.stride != 2:
reslink = self.residual_downsample(x)
reslink = self.residual_proj(reslink)

output = x
output = self.conv1(output)
output = self.activation_function(output)
output = self.conv2(output)
if self.stride != 2:
output = output + reslink
output = self.activation_function(output)

return output


class SuperResConvK1KX(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
num_blocks,
with_spp=False,
act='silu',
reparam=False):
super(SuperResConvK1KX, self).__init__()
if act is None:
self.act = torch.relu
else:
self.act = get_activation(act)
self.block_list = nn.ModuleList()
for block_id in range(num_blocks):
if block_id == 0:
in_channels = in_c
out_channels = out_c
this_stride = stride
force_resproj = False # as a part of CSPLayer, DO NOT need this flag
this_kernel_size = kernel_size
else:
in_channels = out_c
out_channels = out_c
this_stride = 1
force_resproj = False
this_kernel_size = kernel_size
the_block = ResConvK1KX(
in_channels,
out_channels,
btn_c,
this_kernel_size,
this_stride,
force_resproj,
act=act,
reparam=reparam)
self.block_list.append(the_block)
if block_id == 0 and with_spp:
self.block_list.append(
SPPBottleneck(out_channels, out_channels))

def forward(self, x):
output = x
for block in self.block_list:
output = block(output)
return output


class ResConvKXKX(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
force_resproj=False,
act='silu'):
super(ResConvKXKX, self).__init__()
self.stride = stride
if self.stride == 2:
self.downsampler = ConvKXBNRELU(in_c, out_c, 3, 2, act=act)
else:
self.conv1 = ConvKXBN(in_c, btn_c, kernel_size, 1)
self.conv2 = RepVggBlock(
btn_c, out_c, kernel_size, stride, act='identity')

if act is None:
self.activation_function = torch.relu
else:
self.activation_function = get_activation(act)

if stride == 2:
self.residual_downsample = nn.AvgPool2d(
kernel_size=2, stride=2)
else:
self.residual_downsample = nn.Identity()

if in_c != out_c or force_resproj:
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1)
else:
self.residual_proj = nn.Identity()

def forward(self, x):
if self.stride == 2:
return self.downsampler(x)
reslink = self.residual_downsample(x)
reslink = self.residual_proj(reslink)

output = x
output = self.conv1(output)
output = self.activation_function(output)
output = self.conv2(output)

output = output + reslink
output = self.activation_function(output)

return output


class SuperResConvKXKX(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
num_blocks,
with_spp=False,
act='silu'):
super(SuperResConvKXKX, self).__init__()
if act is None:
self.act = torch.relu
else:
self.act = get_activation(act)
self.block_list = nn.ModuleList()
for block_id in range(num_blocks):
if block_id == 0:
in_channels = in_c
out_channels = out_c
this_stride = stride
force_resproj = False # as a part of CSPLayer, DO NOT need this flag
this_kernel_size = kernel_size
else:
in_channels = out_c
out_channels = out_c
this_stride = 1
force_resproj = False
this_kernel_size = kernel_size
the_block = ResConvKXKX(
in_channels,
out_channels,
btn_c,
this_kernel_size,
this_stride,
force_resproj,
act=act)
self.block_list.append(the_block)
if block_id == 0 and with_spp:
self.block_list.append(
SPPBottleneck(out_channels, out_channels))

def forward(self, x):
output = x
for block in self.block_list:
output = block(output)
return output


class TinyNAS(nn.Module):

def __init__(self,
structure_info=None,
out_indices=[0, 1, 2, 4, 5],
out_channels=[None, None, 128, 256, 512],
with_spp=False,
use_focus=False,
need_conv1=True,
act='silu',
reparam=False):
super(TinyNAS, self).__init__()
assert len(out_indices) == len(out_channels)
self.out_indices = out_indices
self.need_conv1 = need_conv1

self.block_list = nn.ModuleList()
if need_conv1:
self.conv1_list = nn.ModuleList()
for idx, block_info in enumerate(structure_info):
the_block_class = block_info['class']
if the_block_class == 'ConvKXBNRELU':
if use_focus:
the_block = Focus(
block_info['in'],
block_info['out'],
block_info['k'],
act=act)
else:
the_block = ConvKXBNRELU(
block_info['in'],
block_info['out'],
block_info['k'],
block_info['s'],
act=act)
self.block_list.append(the_block)
elif the_block_class == 'SuperResConvK1KX':
spp = with_spp if idx == len(structure_info) - 1 else False
the_block = SuperResConvK1KX(
block_info['in'],
block_info['out'],
block_info['btn'],
block_info['k'],
block_info['s'],
block_info['L'],
spp,
act=act,
reparam=reparam)
self.block_list.append(the_block)
elif the_block_class == 'SuperResConvKXKX':
spp = with_spp if idx == len(structure_info) - 1 else False
the_block = SuperResConvKXKX(
block_info['in'],
block_info['out'],
block_info['btn'],
block_info['k'],
block_info['s'],
block_info['L'],
spp,
act=act)
self.block_list.append(the_block)
if need_conv1:
if idx in self.out_indices and out_channels[
self.out_indices.index(idx)] is not None:
self.conv1_list.append(
nn.Conv2d(block_info['out'],
out_channels[self.out_indices.index(idx)],
1))
else:
self.conv1_list.append(None)

def init_weights(self, pretrain=None):
pass

def forward(self, x):
output = x
stage_feature_list = []
for idx, block in enumerate(self.block_list):
output = block(output)
if idx in self.out_indices:
if self.need_conv1 and self.conv1_list[idx] is not None:
true_out = self.conv1_list[idx](output)
stage_feature_list.append(true_out)
else:
stage_feature_list.append(output)
return stage_feature_list


def load_tinynas_net(backbone_cfg):
# load masternet model to path
import ast
net_structure_str = read_file(backbone_cfg.structure_file)
struct_str = ''.join([x.strip() for x in net_structure_str])
struct_info = ast.literal_eval(struct_str)
for layer in struct_info:
if 'nbitsA' in layer:
del layer['nbitsA']
if 'nbitsW' in layer:
del layer['nbitsW']

model = TinyNAS(
structure_info=struct_info,
out_indices=backbone_cfg.out_indices,
out_channels=backbone_cfg.out_channels,
with_spp=backbone_cfg.with_spp,
use_focus=backbone_cfg.use_focus,
act=backbone_cfg.act,
need_conv1=backbone_cfg.need_conv1,
reparam=backbone_cfg.reparam)

return model

+ 295
- 0
modelscope/models/cv/tinynas_detection/backbone/tinynas_csp.py View File

@@ -0,0 +1,295 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The DAMO-YOLO implementation is also open-sourced by the authors, and available
# at https://github.com/tinyvision/damo-yolo.

import torch
import torch.nn as nn

from modelscope.models.cv.tinynas_detection.core.ops import (Focus, RepConv,
SPPBottleneck,
get_activation)
from modelscope.utils.file_utils import read_file


class ConvKXBN(nn.Module):

def __init__(self, in_c, out_c, kernel_size, stride):
super(ConvKXBN, self).__init__()
self.conv1 = nn.Conv2d(
in_c,
out_c,
kernel_size,
stride, (kernel_size - 1) // 2,
groups=1,
bias=False)
self.bn1 = nn.BatchNorm2d(out_c)

def forward(self, x):
return self.bn1(self.conv1(x))


class ConvKXBNRELU(nn.Module):

def __init__(self, in_c, out_c, kernel_size, stride, act='silu'):
super(ConvKXBNRELU, self).__init__()
self.conv = ConvKXBN(in_c, out_c, kernel_size, stride)
if act is None:
self.activation_function = torch.relu
else:
self.activation_function = get_activation(act)

def forward(self, x):
output = self.conv(x)
return self.activation_function(output)


class ResConvBlock(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
act='silu',
reparam=False,
block_type='k1kx'):
super(ResConvBlock, self).__init__()
self.stride = stride
if block_type == 'k1kx':
self.conv1 = ConvKXBN(in_c, btn_c, kernel_size=1, stride=1)
else:
self.conv1 = ConvKXBN(
in_c, btn_c, kernel_size=kernel_size, stride=1)
if not reparam:
self.conv2 = ConvKXBN(btn_c, out_c, kernel_size, stride)
else:
self.conv2 = RepConv(
btn_c, out_c, kernel_size, stride, act='identity')

self.activation_function = get_activation(act)

if in_c != out_c and stride != 2:
self.residual_proj = ConvKXBN(in_c, out_c, kernel_size=1, stride=1)
else:
self.residual_proj = None

def forward(self, x):
if self.residual_proj is not None:
reslink = self.residual_proj(x)
else:
reslink = x
x = self.conv1(x)
x = self.activation_function(x)
x = self.conv2(x)
if self.stride != 2:
x = x + reslink
x = self.activation_function(x)
return x


class CSPStem(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
stride,
kernel_size,
num_blocks,
act='silu',
reparam=False,
block_type='k1kx'):
super(CSPStem, self).__init__()
self.in_channels = in_c
self.out_channels = out_c
self.stride = stride
if self.stride == 2:
self.num_blocks = num_blocks - 1
else:
self.num_blocks = num_blocks
self.kernel_size = kernel_size
self.act = act
self.block_type = block_type
out_c = out_c // 2

if act is None:
self.act = torch.relu
else:
self.act = get_activation(act)
self.block_list = nn.ModuleList()
for block_id in range(self.num_blocks):
if self.stride == 1 and block_id == 0:
in_c = in_c // 2
else:
in_c = out_c
the_block = ResConvBlock(
in_c,
out_c,
btn_c,
kernel_size,
stride=1,
act=act,
reparam=reparam,
block_type=block_type)
self.block_list.append(the_block)

def forward(self, x):
output = x
for block in self.block_list:
output = block(output)
return output


class TinyNAS(nn.Module):

def __init__(self,
structure_info=None,
out_indices=[2, 3, 4],
with_spp=False,
use_focus=False,
act='silu',
reparam=False):
super(TinyNAS, self).__init__()
self.out_indices = out_indices
self.block_list = nn.ModuleList()
self.stride_list = []

for idx, block_info in enumerate(structure_info):
the_block_class = block_info['class']
if the_block_class == 'ConvKXBNRELU':
if use_focus and idx == 0:
the_block = Focus(
block_info['in'],
block_info['out'],
block_info['k'],
act=act)
else:
the_block = ConvKXBNRELU(
block_info['in'],
block_info['out'],
block_info['k'],
block_info['s'],
act=act)
elif the_block_class == 'SuperResConvK1KX':
the_block = CSPStem(
block_info['in'],
block_info['out'],
block_info['btn'],
block_info['s'],
block_info['k'],
block_info['L'],
act=act,
reparam=reparam,
block_type='k1kx')
elif the_block_class == 'SuperResConvKXKX':
the_block = CSPStem(
block_info['in'],
block_info['out'],
block_info['btn'],
block_info['s'],
block_info['k'],
block_info['L'],
act=act,
reparam=reparam,
block_type='kxkx')
else:
raise NotImplementedError

self.block_list.append(the_block)

self.csp_stage = nn.ModuleList()
self.csp_stage.append(self.block_list[0])
self.csp_stage.append(CSPWrapper(self.block_list[1]))
self.csp_stage.append(CSPWrapper(self.block_list[2]))
self.csp_stage.append(
CSPWrapper((self.block_list[3], self.block_list[4])))
self.csp_stage.append(
CSPWrapper(self.block_list[5], with_spp=with_spp))
del self.block_list

def init_weights(self, pretrain=None):
pass

def forward(self, x):
output = x
stage_feature_list = []
for idx, block in enumerate(self.csp_stage):
output = block(output)
if idx in self.out_indices:
stage_feature_list.append(output)
return stage_feature_list


class CSPWrapper(nn.Module):

def __init__(self, convstem, act='relu', reparam=False, with_spp=False):

super(CSPWrapper, self).__init__()
self.with_spp = with_spp
if isinstance(convstem, tuple):
in_c = convstem[0].in_channels
out_c = convstem[-1].out_channels
hidden_dim = convstem[0].out_channels // 2
_convstem = nn.ModuleList()
for modulelist in convstem:
for layer in modulelist.block_list:
_convstem.append(layer)
else:
in_c = convstem.in_channels
out_c = convstem.out_channels
hidden_dim = out_c // 2
_convstem = convstem.block_list

self.convstem = nn.ModuleList()
for layer in _convstem:
self.convstem.append(layer)

self.act = get_activation(act)
self.downsampler = ConvKXBNRELU(
in_c, hidden_dim * 2, 3, 2, act=self.act)
if self.with_spp:
self.spp = SPPBottleneck(hidden_dim * 2, hidden_dim * 2)
if len(self.convstem) > 0:
self.conv_start = ConvKXBNRELU(
hidden_dim * 2, hidden_dim, 1, 1, act=self.act)
self.conv_shortcut = ConvKXBNRELU(
hidden_dim * 2, out_c // 2, 1, 1, act=self.act)
self.conv_fuse = ConvKXBNRELU(out_c, out_c, 1, 1, act=self.act)

def forward(self, x):
x = self.downsampler(x)
if self.with_spp:
x = self.spp(x)
if len(self.convstem) > 0:
shortcut = self.conv_shortcut(x)
x = self.conv_start(x)
for block in self.convstem:
x = block(x)
x = torch.cat((x, shortcut), dim=1)
x = self.conv_fuse(x)
return x


def load_tinynas_net(backbone_cfg):
# parse the TinyNAS (masternet) structure description and build the backbone
import ast

net_structure_str = read_file(backbone_cfg.structure_file)
struct_str = ''.join([x.strip() for x in net_structure_str])
struct_info = ast.literal_eval(struct_str)
for layer in struct_info:
if 'nbitsA' in layer:
del layer['nbitsA']
if 'nbitsW' in layer:
del layer['nbitsW']

model = TinyNAS(
structure_info=struct_info,
out_indices=backbone_cfg.out_indices,
with_spp=backbone_cfg.with_spp,
use_focus=backbone_cfg.use_focus,
act=backbone_cfg.act,
reparam=backbone_cfg.reparam)

return model
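
The structure file consumed by load_tinynas_net is a plain Python literal: a list of block dicts keyed by 'class', 'in', 'out', 'btn', 'k', 's' and 'L', optionally carrying quantization hints ('nbitsA'/'nbitsW') that are stripped before the network is built. The entries below are made up purely to show the format, not a real searched architecture.

import ast

# Hypothetical two-block structure text; real files are produced by the TinyNAS search.
structure_text = """[
    {'class': 'ConvKXBNRELU', 'in': 3, 'out': 32, 'k': 3, 's': 2},
    {'class': 'SuperResConvK1KX', 'in': 32, 'out': 64, 'btn': 32, 'k': 3, 's': 2,
     'L': 1, 'nbitsA': 8, 'nbitsW': 8},
]"""
struct_info = ast.literal_eval(''.join(x.strip() for x in structure_text.splitlines()))
for layer in struct_info:
    layer.pop('nbitsA', None)  # quantization hints are dropped, as in load_tinynas_net
    layer.pop('nbitsW', None)
print([layer['class'] for layer in struct_info])
# ['ConvKXBNRELU', 'SuperResConvK1KX']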

+ 238
- 0
modelscope/models/cv/tinynas_detection/backbone/tinynas_res.py View File

@@ -0,0 +1,238 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The DAMO-YOLO implementation is also open-sourced by the authors, and available
# at https://github.com/tinyvision/damo-yolo.

import torch
import torch.nn as nn

from modelscope.models.cv.tinynas_detection.core.ops import (Focus, RepConv,
SPPBottleneck,
get_activation)
from modelscope.utils.file_utils import read_file


class ConvKXBN(nn.Module):

def __init__(self, in_c, out_c, kernel_size, stride):
super(ConvKXBN, self).__init__()
self.conv1 = nn.Conv2d(
in_c,
out_c,
kernel_size,
stride, (kernel_size - 1) // 2,
groups=1,
bias=False)
self.bn1 = nn.BatchNorm2d(out_c)

def forward(self, x):
return self.bn1(self.conv1(x))


class ConvKXBNRELU(nn.Module):

def __init__(self, in_c, out_c, kernel_size, stride, act='silu'):
super(ConvKXBNRELU, self).__init__()
self.conv = ConvKXBN(in_c, out_c, kernel_size, stride)
if act is None:
self.activation_function = torch.relu
else:
self.activation_function = get_activation(act)

def forward(self, x):
output = self.conv(x)
return self.activation_function(output)


class ResConvBlock(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
act='silu',
reparam=False,
block_type='k1kx'):
super(ResConvBlock, self).__init__()
self.stride = stride
if block_type == 'k1kx':
self.conv1 = ConvKXBN(in_c, btn_c, kernel_size=1, stride=1)
else:
self.conv1 = ConvKXBN(
in_c, btn_c, kernel_size=kernel_size, stride=1)

if not reparam:
self.conv2 = ConvKXBN(btn_c, out_c, kernel_size, stride)
else:
self.conv2 = RepConv(
btn_c, out_c, kernel_size, stride, act='identity')

self.activation_function = get_activation(act)

if in_c != out_c and stride != 2:
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1)
else:
self.residual_proj = None

def forward(self, x):
if self.residual_proj is not None:
reslink = self.residual_proj(x)
else:
reslink = x
x = self.conv1(x)
x = self.activation_function(x)
x = self.conv2(x)
if self.stride != 2:
x = x + reslink
x = self.activation_function(x)
return x


class SuperResStem(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
num_blocks,
with_spp=False,
act='silu',
reparam=False,
block_type='k1kx'):
super(SuperResStem, self).__init__()
if act is None:
self.act = torch.relu
else:
self.act = get_activation(act)
self.block_list = nn.ModuleList()
for block_id in range(num_blocks):
if block_id == 0:
in_channels = in_c
out_channels = out_c
this_stride = stride
this_kernel_size = kernel_size
else:
in_channels = out_c
out_channels = out_c
this_stride = 1
this_kernel_size = kernel_size
the_block = ResConvBlock(
in_channels,
out_channels,
btn_c,
this_kernel_size,
this_stride,
act=act,
reparam=reparam,
block_type=block_type)
self.block_list.append(the_block)
if block_id == 0 and with_spp:
self.block_list.append(
SPPBottleneck(out_channels, out_channels))

def forward(self, x):
output = x
for block in self.block_list:
output = block(output)
return output


class TinyNAS(nn.Module):

def __init__(self,
structure_info=None,
out_indices=[2, 4, 5],
with_spp=False,
use_focus=False,
act='silu',
reparam=False):
super(TinyNAS, self).__init__()
self.out_indices = out_indices
self.block_list = nn.ModuleList()

for idx, block_info in enumerate(structure_info):
the_block_class = block_info['class']
if the_block_class == 'ConvKXBNRELU':
if use_focus:
the_block = Focus(
block_info['in'],
block_info['out'],
block_info['k'],
act=act)
else:
the_block = ConvKXBNRELU(
block_info['in'],
block_info['out'],
block_info['k'],
block_info['s'],
act=act)
self.block_list.append(the_block)
elif the_block_class == 'SuperResConvK1KX':
spp = with_spp if idx == len(structure_info) - 1 else False
the_block = SuperResStem(
block_info['in'],
block_info['out'],
block_info['btn'],
block_info['k'],
block_info['s'],
block_info['L'],
spp,
act=act,
reparam=reparam,
block_type='k1kx')
self.block_list.append(the_block)
elif the_block_class == 'SuperResConvKXKX':
spp = with_spp if idx == len(structure_info) - 1 else False
the_block = SuperResStem(
block_info['in'],
block_info['out'],
block_info['btn'],
block_info['k'],
block_info['s'],
block_info['L'],
spp,
act=act,
reparam=reparam,
block_type='kxkx')
self.block_list.append(the_block)
else:
raise NotImplementedError

def init_weights(self, pretrain=None):
pass

def forward(self, x):
output = x
stage_feature_list = []
for idx, block in enumerate(self.block_list):
output = block(output)
if idx in self.out_indices:
stage_feature_list.append(output)
return stage_feature_list


def load_tinynas_net(backbone_cfg):
# parse the TinyNAS (masternet) structure description and build the backbone
import ast

net_structure_str = read_file(backbone_cfg.structure_file)
struct_str = ''.join([x.strip() for x in net_structure_str])
struct_info = ast.literal_eval(struct_str)
for layer in struct_info:
if 'nbitsA' in layer:
del layer['nbitsA']
if 'nbitsW' in layer:
del layer['nbitsW']

model = TinyNAS(
structure_info=struct_info,
out_indices=backbone_cfg.out_indices,
with_spp=backbone_cfg.with_spp,
use_focus=backbone_cfg.use_focus,
act=backbone_cfg.act,
reparam=backbone_cfg.reparam)

return model

+ 1
- 1
modelscope/models/cv/tinynas_detection/core/__init__.py View File

@@ -1,2 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

+ 1
- 1
modelscope/models/cv/tinynas_detection/core/base_ops.py View File

@@ -1,5 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.
import math

import torch


+ 1
- 1
modelscope/models/cv/tinynas_detection/core/neck_ops.py View File

@@ -1,5 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

import numpy as np
import torch


+ 435
- 0
modelscope/models/cv/tinynas_detection/core/ops.py View File

@@ -0,0 +1,435 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class SiLU(nn.Module):
"""export-friendly version of nn.SiLU()"""

@staticmethod
def forward(x):
return x * torch.sigmoid(x)


class Swish(nn.Module):

def __init__(self, inplace=True):
super(Swish, self).__init__()
self.inplace = inplace

def forward(self, x):
if self.inplace:
x.mul_(F.sigmoid(x))
return x
else:
return x * F.sigmoid(x)


def get_activation(name='silu', inplace=True):
if name is None:
return nn.Identity()

if isinstance(name, str):
if name == 'silu':
module = nn.SiLU(inplace=inplace)
elif name == 'relu':
module = nn.ReLU(inplace=inplace)
elif name == 'lrelu':
module = nn.LeakyReLU(0.1, inplace=inplace)
elif name == 'swish':
module = Swish(inplace=inplace)
elif name == 'hardsigmoid':
module = nn.Hardsigmoid(inplace=inplace)
elif name == 'identity':
module = nn.Identity()
else:
raise AttributeError('Unsupported act type: {}'.format(name))
return module

elif isinstance(name, nn.Module):
return name

else:
raise AttributeError('Unsupported act type: {}'.format(name))


def get_norm(name, out_channels, inplace=True):
if name == 'bn':
module = nn.BatchNorm2d(out_channels)
else:
raise NotImplementedError
return module


class ConvBNAct(nn.Module):
"""A Conv2d -> Batchnorm -> silu/leaky relu block"""

def __init__(
self,
in_channels,
out_channels,
ksize,
stride=1,
groups=1,
bias=False,
act='silu',
norm='bn',
reparam=False,
):
super().__init__()
# same padding
pad = (ksize - 1) // 2
self.conv = nn.Conv2d(
in_channels,
out_channels,
kernel_size=ksize,
stride=stride,
padding=pad,
groups=groups,
bias=bias,
)
if norm is not None:
self.bn = get_norm(norm, out_channels, inplace=True)
if act is not None:
self.act = get_activation(act, inplace=True)
self.with_norm = norm is not None
self.with_act = act is not None

def forward(self, x):
x = self.conv(x)
if self.with_norm:
x = self.bn(x)
if self.with_act:
x = self.act(x)
return x

def fuseforward(self, x):
return self.act(self.conv(x))


class SPPBottleneck(nn.Module):
"""Spatial pyramid pooling layer used in YOLOv3-SPP"""

def __init__(self,
in_channels,
out_channels,
kernel_sizes=(5, 9, 13),
activation='silu'):
super().__init__()
hidden_channels = in_channels // 2
self.conv1 = ConvBNAct(
in_channels, hidden_channels, 1, stride=1, act=activation)
self.m = nn.ModuleList([
nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
for ks in kernel_sizes
])
conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
self.conv2 = ConvBNAct(
conv2_channels, out_channels, 1, stride=1, act=activation)

def forward(self, x):
x = self.conv1(x)
x = torch.cat([x] + [m(x) for m in self.m], dim=1)
x = self.conv2(x)
return x


class Focus(nn.Module):
"""Focus width and height information into channel space."""

def __init__(self,
in_channels,
out_channels,
ksize=1,
stride=1,
act='silu'):
super().__init__()
self.conv = ConvBNAct(
in_channels * 4, out_channels, ksize, stride, act=act)

def forward(self, x):
# shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
patch_top_left = x[..., ::2, ::2]
patch_top_right = x[..., ::2, 1::2]
patch_bot_left = x[..., 1::2, ::2]
patch_bot_right = x[..., 1::2, 1::2]
x = torch.cat(
(
patch_top_left,
patch_bot_left,
patch_top_right,
patch_bot_right,
),
dim=1,
)
return self.conv(x)


class BasicBlock_3x3_Reverse(nn.Module):

def __init__(self,
ch_in,
ch_hidden_ratio,
ch_out,
act='relu',
shortcut=True):
super(BasicBlock_3x3_Reverse, self).__init__()
assert ch_in == ch_out
ch_hidden = int(ch_in * ch_hidden_ratio)
self.conv1 = ConvBNAct(ch_hidden, ch_out, 3, stride=1, act=act)
self.conv2 = RepConv(ch_in, ch_hidden, 3, stride=1, act=act)
self.shortcut = shortcut

def forward(self, x):
y = self.conv2(x)
y = self.conv1(y)
if self.shortcut:
return x + y
else:
return y


class SPP(nn.Module):

def __init__(
self,
ch_in,
ch_out,
k,
pool_size,
act='swish',
):
super(SPP, self).__init__()
self.pool = []
for i, size in enumerate(pool_size):
pool = nn.MaxPool2d(
kernel_size=size, stride=1, padding=size // 2, ceil_mode=False)
self.add_module('pool{}'.format(i), pool)
self.pool.append(pool)
self.conv = ConvBNAct(ch_in, ch_out, k, act=act)

def forward(self, x):
outs = [x]

for pool in self.pool:
outs.append(pool(x))
y = torch.cat(outs, axis=1)

y = self.conv(y)
return y


class CSPStage(nn.Module):

def __init__(self,
block_fn,
ch_in,
ch_hidden_ratio,
ch_out,
n,
act='swish',
spp=False):
super(CSPStage, self).__init__()

split_ratio = 2
ch_first = int(ch_out // split_ratio)
ch_mid = int(ch_out - ch_first)
self.conv1 = ConvBNAct(ch_in, ch_first, 1, act=act)
self.conv2 = ConvBNAct(ch_in, ch_mid, 1, act=act)
self.convs = nn.Sequential()

next_ch_in = ch_mid
for i in range(n):
if block_fn == 'BasicBlock_3x3_Reverse':
self.convs.add_module(
str(i),
BasicBlock_3x3_Reverse(
next_ch_in,
ch_hidden_ratio,
ch_mid,
act=act,
shortcut=True))
else:
raise NotImplementedError
if i == (n - 1) // 2 and spp:
self.convs.add_module(
'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act))
next_ch_in = ch_mid
self.conv3 = ConvBNAct(ch_mid * n + ch_first, ch_out, 1, act=act)

def forward(self, x):
y1 = self.conv1(x)
y2 = self.conv2(x)

mid_out = [y1]
for conv in self.convs:
y2 = conv(y2)
mid_out.append(y2)
y = torch.cat(mid_out, axis=1)
y = self.conv3(y)
return y


def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1):
'''Basic cell for rep-style block, including conv and bn'''
result = nn.Sequential()
result.add_module(
'conv',
nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups,
bias=False))
result.add_module('bn', nn.BatchNorm2d(num_features=out_channels))
return result


class RepConv(nn.Module):
'''RepConv is a basic rep-style block, including training and deploy status
Code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
'''

def __init__(self,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
dilation=1,
groups=1,
padding_mode='zeros',
deploy=False,
act='relu',
norm=None):
super(RepConv, self).__init__()
self.deploy = deploy
self.groups = groups
self.in_channels = in_channels
self.out_channels = out_channels

assert kernel_size == 3
assert padding == 1

padding_11 = padding - kernel_size // 2

if isinstance(act, str):
self.nonlinearity = get_activation(act)
else:
self.nonlinearity = act

if deploy:
self.rbr_reparam = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=True,
padding_mode=padding_mode)

else:
self.rbr_identity = None
self.rbr_dense = conv_bn(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups)
self.rbr_1x1 = conv_bn(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=stride,
padding=padding_11,
groups=groups)

def forward(self, inputs):
'''Forward process'''
if hasattr(self, 'rbr_reparam'):
return self.nonlinearity(self.rbr_reparam(inputs))

if self.rbr_identity is None:
id_out = 0
else:
id_out = self.rbr_identity(inputs)

return self.nonlinearity(
self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)

def get_equivalent_kernel_bias(self):
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
return kernel3x3 + self._pad_1x1_to_3x3_tensor(
kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid

def _pad_1x1_to_3x3_tensor(self, kernel1x1):
if kernel1x1 is None:
return 0
else:
return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

def _fuse_bn_tensor(self, branch):
if branch is None:
return 0, 0
if isinstance(branch, nn.Sequential):
kernel = branch.conv.weight
running_mean = branch.bn.running_mean
running_var = branch.bn.running_var
gamma = branch.bn.weight
beta = branch.bn.bias
eps = branch.bn.eps
else:
assert isinstance(branch, nn.BatchNorm2d)
if not hasattr(self, 'id_tensor'):
input_dim = self.in_channels // self.groups
kernel_value = np.zeros((self.in_channels, input_dim, 3, 3),
dtype=np.float32)
for i in range(self.in_channels):
kernel_value[i, i % input_dim, 1, 1] = 1
self.id_tensor = torch.from_numpy(kernel_value).to(
branch.weight.device)
kernel = self.id_tensor
running_mean = branch.running_mean
running_var = branch.running_var
gamma = branch.weight
beta = branch.bias
eps = branch.eps
std = (running_var + eps).sqrt()
t = (gamma / std).reshape(-1, 1, 1, 1)
return kernel * t, beta - running_mean * gamma / std

def switch_to_deploy(self):
if hasattr(self, 'rbr_reparam'):
return
kernel, bias = self.get_equivalent_kernel_bias()
self.rbr_reparam = nn.Conv2d(
in_channels=self.rbr_dense.conv.in_channels,
out_channels=self.rbr_dense.conv.out_channels,
kernel_size=self.rbr_dense.conv.kernel_size,
stride=self.rbr_dense.conv.stride,
padding=self.rbr_dense.conv.padding,
dilation=self.rbr_dense.conv.dilation,
groups=self.rbr_dense.conv.groups,
bias=True)
self.rbr_reparam.weight.data = kernel
self.rbr_reparam.bias.data = bias
for para in self.parameters():
para.detach_()
self.__delattr__('rbr_dense')
self.__delattr__('rbr_1x1')
if hasattr(self, 'rbr_identity'):
self.__delattr__('rbr_identity')
if hasattr(self, 'id_tensor'):
self.__delattr__('id_tensor')
self.deploy = True
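
switch_to_deploy folds the 3x3 branch, the 1x1 branch and their BatchNorm statistics into a single convolution; in eval mode the fused block reproduces the multi-branch output up to floating-point error. A small self-check with random weights:

import torch

from modelscope.models.cv.tinynas_detection.core.ops import RepConv

conv = RepConv(16, 16, kernel_size=3, stride=1, act='relu').eval()
x = torch.randn(1, 16, 32, 32)
with torch.no_grad():
    y_multi = conv(x)          # 3x3 branch + 1x1 branch, then activation
    conv.switch_to_deploy()    # fuse both branches into conv.rbr_reparam
    y_fused = conv(x)
print((y_multi - y_fused).abs().max())  # ~1e-6, numerically equivalent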

+ 1
- 1
modelscope/models/cv/tinynas_detection/core/repvgg_block.py View File

@@ -1,5 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

import numpy as np
import torch


+ 1
- 1
modelscope/models/cv/tinynas_detection/core/utils.py View File

@@ -1,5 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

import numpy as np
import torch


+ 2
- 2
modelscope/models/cv/tinynas_detection/detector.py View File

@@ -1,5 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

import os.path as osp
import pickle
@@ -42,7 +42,7 @@ class SingleStageDetector(TorchModel):
self.conf_thre = config.model.head.nms_conf_thre
self.nms_thre = config.model.head.nms_iou_thre

if self.cfg.model.backbone.name == 'TinyNAS':
if 'TinyNAS' in self.cfg.model.backbone.name:
self.cfg.model.backbone.structure_file = osp.join(
model_dir, self.cfg.model.backbone.structure_file)
self.backbone = build_backbone(self.cfg.model.backbone)


+ 4
- 1
modelscope/models/cv/tinynas_detection/head/__init__.py View File

@@ -1,9 +1,10 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

import copy

from .gfocal_v2_tiny import GFocalHead_Tiny
from .zero_head import ZeroHead


def build_head(cfg):
@@ -12,5 +13,7 @@ def build_head(cfg):
name = head_cfg.pop('name')
if name == 'GFocalV2':
return GFocalHead_Tiny(**head_cfg)
elif name == 'ZeroHead':
return ZeroHead(**head_cfg)
else:
raise NotImplementedError

+ 3
- 2
modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py View File

@@ -1,5 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

import functools
from functools import partial
@@ -9,7 +9,8 @@ import torch
import torch.nn as nn
import torch.nn.functional as F

from ..core.base_ops import BaseConv, DWConv
from modelscope.models.cv.tinynas_detection.core.base_ops import (BaseConv,
DWConv)


class Scale(nn.Module):


+ 288
- 0
modelscope/models/cv/tinynas_detection/head/zero_head.py View File

@@ -0,0 +1,288 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The DAMO-YOLO implementation is also open-sourced by the authors, and available
# at https://github.com/tinyvision/damo-yolo.
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F

from modelscope.models.cv.tinynas_detection.core.ops import ConvBNAct


class Scale(nn.Module):

def __init__(self, scale=1.0):
super(Scale, self).__init__()
self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))

def forward(self, x):
return x * self.scale


def multi_apply(func, *args, **kwargs):

pfunc = partial(func, **kwargs) if kwargs else func
map_results = map(pfunc, *args)
return tuple(map(list, zip(*map_results)))


def distance2bbox(points, distance, max_shape=None):
"""Decode distance prediction to bounding box.
"""
x1 = points[..., 0] - distance[..., 0]
y1 = points[..., 1] - distance[..., 1]
x2 = points[..., 0] + distance[..., 2]
y2 = points[..., 1] + distance[..., 3]
if max_shape is not None:
x1 = x1.clamp(min=0, max=max_shape[1])
y1 = y1.clamp(min=0, max=max_shape[0])
x2 = x2.clamp(min=0, max=max_shape[1])
y2 = y2.clamp(min=0, max=max_shape[0])
return torch.stack([x1, y1, x2, y2], -1)


def bbox2distance(points, bbox, max_dis=None, eps=0.1):
"""Decode bounding box based on distances.
"""
left = points[:, 0] - bbox[:, 0]
top = points[:, 1] - bbox[:, 1]
right = bbox[:, 2] - points[:, 0]
bottom = bbox[:, 3] - points[:, 1]
if max_dis is not None:
left = left.clamp(min=0, max=max_dis - eps)
top = top.clamp(min=0, max=max_dis - eps)
right = right.clamp(min=0, max=max_dis - eps)
bottom = bottom.clamp(min=0, max=max_dis - eps)
return torch.stack([left, top, right, bottom], -1)


class Integral(nn.Module):
"""A fixed layer for calculating integral result from distribution.
"""

def __init__(self, reg_max=16):
super(Integral, self).__init__()
self.reg_max = reg_max
self.register_buffer('project',
torch.linspace(0, self.reg_max, self.reg_max + 1))

def forward(self, x):
"""Forward feature from the regression head to get integral result of
bounding box location.
"""
b, hw, _, _ = x.size()
x = x.reshape(b * hw * 4, self.reg_max + 1)
y = self.project.type_as(x).unsqueeze(1)
x = torch.matmul(x, y).reshape(b, hw, 4)
return x


class ZeroHead(nn.Module):
"""Ref to Generalized Focal Loss V2: Learning Reliable Localization Quality
Estimation for Dense Object Detection.
"""

def __init__(
self,
num_classes,
in_channels,
stacked_convs=4, # 4
feat_channels=256,
reg_max=12,
strides=[8, 16, 32],
norm='gn',
act='relu',
nms_conf_thre=0.05,
nms_iou_thre=0.7,
nms=True,
**kwargs):
self.in_channels = in_channels
self.num_classes = num_classes
self.stacked_convs = stacked_convs
self.act = act
self.strides = strides
if stacked_convs == 0:
feat_channels = in_channels
if isinstance(feat_channels, list):
self.feat_channels = feat_channels
else:
self.feat_channels = [feat_channels] * len(self.strides)
# add 1 to keep consistency with former models
self.cls_out_channels = num_classes + 1
self.reg_max = reg_max

self.nms = nms
self.nms_conf_thre = nms_conf_thre
self.nms_iou_thre = nms_iou_thre

self.feat_size = [torch.zeros(4) for _ in strides]

super(ZeroHead, self).__init__()
self.integral = Integral(self.reg_max)

self._init_layers()

def _build_not_shared_convs(self, in_channel, feat_channels):
cls_convs = nn.ModuleList()
reg_convs = nn.ModuleList()

for i in range(self.stacked_convs):
chn = feat_channels if i > 0 else in_channel
kernel_size = 3 if i > 0 else 1
cls_convs.append(
ConvBNAct(
chn,
feat_channels,
kernel_size,
stride=1,
groups=1,
norm='bn',
act=self.act))
reg_convs.append(
ConvBNAct(
chn,
feat_channels,
kernel_size,
stride=1,
groups=1,
norm='bn',
act=self.act))

return cls_convs, reg_convs

def _init_layers(self):
"""Initialize layers of the head."""
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()

for i in range(len(self.strides)):
cls_convs, reg_convs = self._build_not_shared_convs(
self.in_channels[i], self.feat_channels[i])
self.cls_convs.append(cls_convs)
self.reg_convs.append(reg_convs)

self.gfl_cls = nn.ModuleList([
nn.Conv2d(
self.feat_channels[i], self.cls_out_channels, 3, padding=1)
for i in range(len(self.strides))
])

self.gfl_reg = nn.ModuleList([
nn.Conv2d(
self.feat_channels[i], 4 * (self.reg_max + 1), 3, padding=1)
for i in range(len(self.strides))
])

self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])

def forward(self, xin, labels=None, imgs=None, aux_targets=None):
if self.training:
raise NotImplementedError
else:
return self.forward_eval(xin=xin, labels=labels, imgs=imgs)

def forward_eval(self, xin, labels=None, imgs=None):

# prepare priors for label assignment and bbox decode
if self.feat_size[0] != xin[0].shape:
mlvl_priors_list = [
self.get_single_level_center_priors(
xin[i].shape[0],
xin[i].shape[-2:],
stride,
dtype=torch.float32,
device=xin[0].device)
for i, stride in enumerate(self.strides)
]
self.mlvl_priors = torch.cat(mlvl_priors_list, dim=1)
self.feat_size[0] = xin[0].shape

# forward for bboxes and classification prediction
cls_scores, bbox_preds = multi_apply(
self.forward_single,
xin,
self.cls_convs,
self.reg_convs,
self.gfl_cls,
self.gfl_reg,
self.scales,
)
cls_scores = torch.cat(cls_scores, dim=1)[:, :, :self.num_classes]
bbox_preds = torch.cat(bbox_preds, dim=1)
# batch bbox decode
bbox_preds = self.integral(bbox_preds) * self.mlvl_priors[..., 2, None]
bbox_preds = distance2bbox(self.mlvl_priors[..., :2], bbox_preds)

res = torch.cat([bbox_preds, cls_scores[..., 0:self.num_classes]],
dim=-1)
return res

def forward_single(self, x, cls_convs, reg_convs, gfl_cls, gfl_reg, scale):
"""Forward feature of a single scale level.

"""
cls_feat = x
reg_feat = x

for cls_conv, reg_conv in zip(cls_convs, reg_convs):
cls_feat = cls_conv(cls_feat)
reg_feat = reg_conv(reg_feat)

bbox_pred = scale(gfl_reg(reg_feat)).float()
N, C, H, W = bbox_pred.size()
if self.training:
bbox_before_softmax = bbox_pred.reshape(N, 4, self.reg_max + 1, H,
W)
bbox_before_softmax = bbox_before_softmax.flatten(
start_dim=3).permute(0, 3, 1, 2)
bbox_pred = F.softmax(
bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2)

cls_score = gfl_cls(cls_feat).sigmoid()

cls_score = cls_score.flatten(start_dim=2).permute(
0, 2, 1) # N, h*w, self.num_classes+1
bbox_pred = bbox_pred.flatten(start_dim=3).permute(
0, 3, 1, 2) # N, h*w, 4, self.reg_max+1
if self.training:
return cls_score, bbox_pred, bbox_before_softmax
else:
return cls_score, bbox_pred

def get_single_level_center_priors(self, batch_size, featmap_size, stride,
dtype, device):

h, w = featmap_size
x_range = (torch.arange(0, int(w), dtype=dtype,
device=device)) * stride
y_range = (torch.arange(0, int(h), dtype=dtype,
device=device)) * stride

x = x_range.repeat(h, 1)
y = y_range.unsqueeze(-1).repeat(1, w)

y = y.flatten()
x = x.flatten()
strides = x.new_full((x.shape[0], ), stride)
priors = torch.stack([x, y, strides, strides], dim=-1)

return priors.unsqueeze(0).repeat(batch_size, 1, 1)

def sample(self, assign_result, gt_bboxes):
pos_inds = torch.nonzero(
assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
neg_inds = torch.nonzero(
assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1

if gt_bboxes.numel() == 0:
# hack for index error case
assert pos_assigned_gt_inds.numel() == 0
pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4)
else:
if len(gt_bboxes.shape) < 2:
gt_bboxes = gt_bboxes.view(-1, 4)
pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]

return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds
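
distance2bbox and bbox2distance are inverse transforms around the prior points (as long as no clamping kicks in), which is what lets the head regress per-side distances and decode them back to boxes. A tiny round-trip check with arbitrary numbers:

import torch

from modelscope.models.cv.tinynas_detection.head.zero_head import (bbox2distance,
                                                                    distance2bbox)

points = torch.tensor([[50., 60.], [120., 80.]])             # anchor centers
boxes = torch.tensor([[40., 50., 70., 90.],
                      [100., 60., 150., 110.]])               # x1, y1, x2, y2
dist = bbox2distance(points, boxes)                           # left, top, right, bottom
print(torch.allclose(distance2bbox(points, dist), boxes))     # True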

+ 2
- 2
modelscope/models/cv/tinynas_detection/neck/__init__.py View File

@@ -1,10 +1,10 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

import copy

from .giraffe_fpn import GiraffeNeck
from .giraffe_fpn_v2 import GiraffeNeckV2
from .giraffe_fpn_btn import GiraffeNeckV2


def build_neck(cfg):


+ 1
- 1
modelscope/models/cv/tinynas_detection/neck/giraffe_config.py View File

@@ -1,5 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

import collections
import itertools


+ 3
- 2
modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py View File

@@ -1,5 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

import logging
import math
@@ -15,7 +15,8 @@ from timm import create_model
from timm.models.layers import (Swish, create_conv2d, create_pool2d,
get_act_layer)

from ..core.base_ops import CSPLayer, ShuffleBlock, ShuffleCSPLayer
from modelscope.models.cv.tinynas_detection.core.base_ops import (
CSPLayer, ShuffleBlock, ShuffleCSPLayer)
from .giraffe_config import get_graph_config

_ACT_LAYER = Swish


+ 132
- 0
modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_btn.py View File

@@ -0,0 +1,132 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

import torch
import torch.nn as nn

from modelscope.models.cv.tinynas_detection.core.ops import ConvBNAct, CSPStage


class GiraffeNeckV2(nn.Module):

def __init__(
self,
depth=1.0,
hidden_ratio=1.0,
in_features=[2, 3, 4],
in_channels=[256, 512, 1024],
out_channels=[256, 512, 1024],
act='silu',
spp=False,
block_name='BasicBlock',
):
super().__init__()
self.in_features = in_features
self.in_channels = in_channels
self.out_channels = out_channels
Conv = ConvBNAct

self.upsample = nn.Upsample(scale_factor=2, mode='nearest')

# node x3: input x0, x1
self.bu_conv13 = Conv(in_channels[1], in_channels[1], 3, 2, act=act)
self.merge_3 = CSPStage(
block_name,
in_channels[1] + in_channels[2],
hidden_ratio,
in_channels[2],
round(3 * depth),
act=act,
spp=spp)

# node x4: input x1, x2, x3
self.bu_conv24 = Conv(in_channels[0], in_channels[0], 3, 2, act=act)
self.merge_4 = CSPStage(
block_name,
in_channels[0] + in_channels[1] + in_channels[2],
hidden_ratio,
in_channels[1],
round(3 * depth),
act=act,
spp=spp)

# node x5: input x2, x4
self.merge_5 = CSPStage(
block_name,
in_channels[1] + in_channels[0],
hidden_ratio,
out_channels[0],
round(3 * depth),
act=act,
spp=spp)

# node x7: input x4, x5
self.bu_conv57 = Conv(out_channels[0], out_channels[0], 3, 2, act=act)
self.merge_7 = CSPStage(
block_name,
out_channels[0] + in_channels[1],
hidden_ratio,
out_channels[1],
round(3 * depth),
act=act,
spp=spp)

# node x6: input x3, x4, x7
self.bu_conv46 = Conv(in_channels[1], in_channels[1], 3, 2, act=act)
self.bu_conv76 = Conv(out_channels[1], out_channels[1], 3, 2, act=act)
self.merge_6 = CSPStage(
block_name,
in_channels[1] + out_channels[1] + in_channels[2],
hidden_ratio,
out_channels[2],
round(3 * depth),
act=act,
spp=spp)

def init_weights(self):
pass

def forward(self, out_features):
"""
Args:
inputs: input images.

Returns:
Tuple[Tensor]: FPN feature.
"""

# backbone
[x2, x1, x0] = out_features

# node x3
x13 = self.bu_conv13(x1)
x3 = torch.cat([x0, x13], 1)
x3 = self.merge_3(x3)

# node x4
x34 = self.upsample(x3)
x24 = self.bu_conv24(x2)
x4 = torch.cat([x1, x24, x34], 1)
x4 = self.merge_4(x4)

# node x5
x45 = self.upsample(x4)
x5 = torch.cat([x2, x45], 1)
x5 = self.merge_5(x5)

# node x8
# x8 = x5

# node x7
x57 = self.bu_conv57(x5)
x7 = torch.cat([x4, x57], 1)
x7 = self.merge_7(x7)

# node x6
x46 = self.bu_conv46(x4)
x76 = self.bu_conv76(x7)
x6 = torch.cat([x3, x46, x76], 1)
x6 = self.merge_6(x6)

outputs = (x5, x7, x6)
return outputs
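
The new neck wires six CSPStage merge nodes into a GFPN-style graph: top-down paths reuse nearest-neighbour upsampling, bottom-up paths use stride-2 ConvBNAct blocks, and the heads receive (x5, x7, x6) at strides 8, 16 and 32. A quick shape check, assuming the core ops in this commit build with their defaults; the channel widths are illustrative, not the values of any shipped DAMO-YOLO config:

import torch

from modelscope.models.cv.tinynas_detection.neck.giraffe_fpn_btn import \
    GiraffeNeckV2

neck = GiraffeNeckV2(in_channels=[64, 128, 256], out_channels=[64, 128, 256])

# Dummy backbone features at strides 8 / 16 / 32 for a 320x320 input.
x2 = torch.randn(1, 64, 40, 40)
x1 = torch.randn(1, 128, 20, 20)
x0 = torch.randn(1, 256, 10, 10)

x5, x7, x6 = neck([x2, x1, x0])
print(x5.shape, x7.shape, x6.shape)
# Expected: (1, 64, 40, 40), (1, 128, 20, 20), (1, 256, 10, 10)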

+ 0
- 200
modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py

@@ -1,200 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import torch
import torch.nn as nn

from ..core.base_ops import BaseConv, CSPLayer, DWConv
from ..core.neck_ops import CSPStage


class GiraffeNeckV2(nn.Module):

def __init__(
self,
depth=1.0,
width=1.0,
in_channels=[256, 512, 1024],
out_channels=[256, 512, 1024],
depthwise=False,
act='silu',
spp=True,
reparam_mode=True,
block_name='BasicBlock',
):
super().__init__()
self.in_channels = in_channels
Conv = DWConv if depthwise else BaseConv

reparam_mode = reparam_mode

self.upsample = nn.Upsample(scale_factor=2, mode='nearest')

# node x3: input x0, x1
self.bu_conv13 = Conv(
int(in_channels[1] * width),
int(in_channels[1] * width),
3,
2,
act=act)
if reparam_mode:
self.merge_3 = CSPStage(
block_name,
int((in_channels[1] + in_channels[2]) * width),
int(in_channels[2] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_3 = CSPLayer(
int((in_channels[1] + in_channels[2]) * width),
int(in_channels[2] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

# node x4: input x1, x2, x3
self.bu_conv24 = Conv(
int(in_channels[0] * width),
int(in_channels[0] * width),
3,
2,
act=act)
if reparam_mode:
self.merge_4 = CSPStage(
block_name,
int((in_channels[0] + in_channels[1] + in_channels[2])
* width),
int(in_channels[1] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_4 = CSPLayer(
int((in_channels[0] + in_channels[1] + in_channels[2])
* width),
int(in_channels[1] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

# node x5: input x2, x4
if reparam_mode:
self.merge_5 = CSPStage(
block_name,
int((in_channels[1] + in_channels[0]) * width),
int(out_channels[0] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_5 = CSPLayer(
int((in_channels[1] + in_channels[0]) * width),
int(out_channels[0] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

# node x7: input x4, x5
self.bu_conv57 = Conv(
int(out_channels[0] * width),
int(out_channels[0] * width),
3,
2,
act=act)
if reparam_mode:
self.merge_7 = CSPStage(
block_name,
int((out_channels[0] + in_channels[1]) * width),
int(out_channels[1] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_7 = CSPLayer(
int((out_channels[0] + in_channels[1]) * width),
int(out_channels[1] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

# node x6: input x3, x4, x7
self.bu_conv46 = Conv(
int(in_channels[1] * width),
int(in_channels[1] * width),
3,
2,
act=act)
self.bu_conv76 = Conv(
int(out_channels[1] * width),
int(out_channels[1] * width),
3,
2,
act=act)
if reparam_mode:
self.merge_6 = CSPStage(
block_name,
int((in_channels[1] + out_channels[1] + in_channels[2])
* width),
int(out_channels[2] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_6 = CSPLayer(
int((in_channels[1] + out_channels[1] + in_channels[2])
* width),
int(out_channels[2] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

def init_weights(self):
pass

def forward(self, out_features):
"""
Args:
inputs: input images.

Returns:
Tuple[Tensor]: FPN feature.
"""

# backbone
[x2, x1, x0] = out_features

# node x3
x13 = self.bu_conv13(x1)
x3 = torch.cat([x0, x13], 1)
x3 = self.merge_3(x3)

# node x4
x34 = self.upsample(x3)
x24 = self.bu_conv24(x2)
x4 = torch.cat([x1, x24, x34], 1)
x4 = self.merge_4(x4)

# node x5
x45 = self.upsample(x4)
x5 = torch.cat([x2, x45], 1)
x5 = self.merge_5(x5)

# node x7
x57 = self.bu_conv57(x5)
x7 = torch.cat([x4, x57], 1)
x7 = self.merge_7(x7)

# node x6
x46 = self.bu_conv46(x4)
x76 = self.bu_conv76(x7)
x6 = torch.cat([x3, x46, x76], 1)
x6 = self.merge_6(x6)

outputs = (x5, x7, x6)
return outputs

+ 1
- 1
modelscope/models/cv/tinynas_detection/tinynas_damoyolo.py

@@ -11,5 +11,5 @@ from .detector import SingleStageDetector
class DamoYolo(SingleStageDetector):

def __init__(self, model_dir, *args, **kwargs):
self.config_name = 'damoyolo_s.py'
self.config_name = 'damoyolo.py'
super(DamoYolo, self).__init__(model_dir, *args, **kwargs)
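
In practice the detector is normally reached through the ModelScope pipeline API rather than by constructing DamoYolo directly; a hedged sketch, where the model id is illustrative and should be replaced by the actual DAMO-YOLO model card:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# The model id below is a placeholder for the published DAMO-YOLO card.
detector = pipeline(
    Tasks.image_object_detection,
    model='damo/cv_tinynas_object-detection_damoyolo')
result = detector('path/to/your_image.jpg')
print(result)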

+ 1
- 1
modelscope/models/cv/tinynas_detection/tinynas_detector.py

@@ -1,5 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo.

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS


+ 23
- 20
modelscope/models/cv/tinynas_detection/utils.py

@@ -1,30 +1,33 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
# The DAMO-YOLO implementation is also open-sourced by the authors, and available
# at https://github.com/tinyvision/damo-yolo.

import importlib
import os
import shutil
import sys
import tempfile
from os.path import dirname, join

from easydict import EasyDict

def get_config_by_file(config_file):
try:
sys.path.append(os.path.dirname(config_file))
current_config = importlib.import_module(
os.path.basename(config_file).split('.')[0])
exp = current_config.Config()
except Exception:
raise ImportError(
"{} doesn't contains class named 'Config'".format(config_file))
return exp

def parse_config(config_file):
    """
    get config object by file.
    Args:
        config_file (str): file path of config.
    """
    assert (config_file is not None), 'plz provide config file'
    if config_file is not None:
        return get_config_by_file(config_file)


def parse_config(filename):
    filename = str(filename)
    if filename.endswith('.py'):
        with tempfile.TemporaryDirectory() as temp_config_dir:
            shutil.copyfile(filename, join(temp_config_dir, '_tempconfig.py'))
            sys.path.insert(0, temp_config_dir)
            mod = importlib.import_module('_tempconfig')
            sys.path.pop(0)
            cfg_dict = EasyDict({
                name: value
                for name, value in mod.__dict__.items()
                if not name.startswith('__')
            })
            # delete imported module
            del sys.modules['_tempconfig']
    else:
        raise IOError('Only .py type are supported now!')
    return cfg_dict
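
The rewritten parse_config copies the .py config into a temporary directory, imports it under the fixed module name _tempconfig, and returns every non-dunder module attribute wrapped in an EasyDict. A small usage sketch (the config contents are illustrative only):

import tempfile

from modelscope.models.cv.tinynas_detection.utils import parse_config

# Write a throwaway config; any module-level name becomes an EasyDict key.
with tempfile.NamedTemporaryFile('w', suffix='.py', delete=False) as f:
    f.write("depth = 1.0\nneck = dict(name='GiraffeNeckV2', act='silu')\n")
    cfg_path = f.name

cfg = parse_config(cfg_path)
print(cfg.depth)      # 1.0
print(cfg.neck.name)  # GiraffeNeckV2 (EasyDict exposes nested dicts as attributes)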

Some files were not shown because too many files changed in this diff
