@@ -7,7 +7,7 @@ gpus='0,1 2,3 4,5 6,7' | |||
cpu_sets='45-58 31-44 16-30 0-15' | |||
cpu_sets_arr=($cpu_sets) | |||
is_get_file_lock=false | |||
CI_COMMAND='bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml' | |||
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh python tests/run.py --parallel 2 --run_config tests/run_config.yaml} | |||
echo "ci command: $CI_COMMAND" | |||
idx=0 | |||
for gpu in $gpus | |||
@@ -1,3 +1,18 @@ | |||
<div align="center"> | |||
[](https://pypi.org/project/modelscope/) | |||
<!-- [](https://easy-cv.readthedocs.io/en/latest/) --> | |||
[](https://github.com/modelscope/modelscope/blob/master/LICENSE) | |||
[](https://github.com/modelscope/modelscope/issues) | |||
[](https://GitHub.com/modelscope/modelscope/pull/) | |||
[](https://GitHub.com/modelscope/modelscope/commit/) | |||
<!-- [](https://GitHub.com/modelscope/modelscope/graphs/contributors/) --> | |||
<!-- [](http://makeapullrequest.com) --> | |||
</div> | |||
# Introduction | |||
[ModelScope](https://www.modelscope.cn) is a “Model-as-a-Service” (MaaS) platform that seeks to bring together the most advanced machine learning models from the AI community, and to streamline the process of leveraging AI models in real-world applications. The core ModelScope library enables developers to perform inference, training and evaluation through rich layers of API design that provide a unified experience across state-of-the-art models from different AI domains. | |||
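To make the unified experience concrete, here is a minimal inference sketch with the `pipeline` API (the word-segmentation task is only an illustrative choice; any supported task name works the same way):

```python
# Minimal sketch of the unified pipeline API; the task shown here is
# just one illustrative choice among the supported tasks.
from modelscope.pipelines import pipeline

# Build an inference pipeline for a task; the matching default model is
# resolved and downloaded from the ModelScope hub on first use.
word_segmentation = pipeline('word-segmentation')
print(word_segmentation('今天天气不错，适合出去游玩'))
```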
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:8ce83bf2a8e6056aba3b3cdc92d2e04d23bdf15a2c1fde814cb091444d59a10b | |||
size 3180872 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:46dbc998c9d1d48111267c40741dd3200f2e5bcf4075f8c4c97f4451160dce50 | |||
size 134570 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:a8cf9fc5abc119f5b5e246143206c22f488c63e86e47f762585b9edd84e081ad | |||
size 618160 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:2bc50ef70bbbc46132710b69efa683cf0bf64aeb0990bb3ff411930831bbc17d | |||
size 619034 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:0b2882d3bcd9e8f8f9531ac34ac09c0208d86500b910d3e1ca34c022caa9be62 | |||
size 155874 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:4c713215f7fb4da5382c9137347ee52956a7a44d5979c4cffd3c9b6d1d7e878f | |||
size 19445 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:3b230497f6ca10be42aed92b86db435d74fd7306746a059b4ad1e0d6b0652806 | |||
size 35694 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:209f6ba7f15c9c34a02801b4c6ef33a979f3086702b5229d2e7975eb403c3e15 | |||
size 45819 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:e37106cf024efd1886b870fa45f69905fcea202db8a848debc4ccd359ea3b21c | |||
size 116248 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:700f7cb3c958fb710d6b863b3c9aa0549f6ab837dfbe3382f8f750f73cec46e3 | |||
size 116868 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:f4b7e23f02a35136707ac7862e0a8468797f239e89497351847cfacb2a9c24f6 | |||
size 202112 |
@@ -1,3 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:7d98ac11a4e9e2744a7402a5cc912da991a41938bbc5dd60f15ee5c6b3196030 | |||
size 63349 | |||
oid sha256:dc16ad72e753f751360dab82878ec0a31190fb5125632d8f4698f6537fae79cb | |||
size 40819 |
@@ -0,0 +1,3 @@ | |||
version https://git-lfs.github.com/spec/v1 | |||
oid sha256:8e4ade7a6b119e20e82a641246199b4b530759166acc1f813d7cefee65b3e1e0 | |||
size 63944943 |
@@ -104,9 +104,9 @@ git lfs install | |||
``` | |||
For CentOS, please download the rpm package from the git-lfs GitHub [releases page](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0) | |||
and then execute: | |||
```bash | |||
wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm | |||
sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm | |||
sudo rpm -ivh your_rpm_file_name.rpm | |||
git lfs install | |||
``` | |||
@@ -7,7 +7,8 @@ from torch.utils.data.dataloader import default_collate | |||
from modelscope.exporters.builder import EXPORTERS | |||
from modelscope.exporters.torch_model_exporter import TorchModelExporter | |||
from modelscope.metainfo import Models | |||
from modelscope.preprocessors import Preprocessor, build_preprocessor | |||
from modelscope.preprocessors import ( | |||
TextClassificationTransformersPreprocessor, build_preprocessor) | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import ModeKeys, Tasks | |||
@@ -59,12 +60,13 @@ class SbertForSequenceClassificationExporter(TorchModelExporter): | |||
'mode': ModeKeys.TRAIN, | |||
**sequence_length | |||
}) | |||
preprocessor: Preprocessor = build_preprocessor(cfg, field_name) | |||
preprocessor: TextClassificationTransformersPreprocessor = build_preprocessor( | |||
cfg, field_name) | |||
if pair: | |||
first_sequence = preprocessor.tokenizer.unk_token | |||
second_sequence = preprocessor.tokenizer.unk_token | |||
first_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token | |||
second_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token | |||
else: | |||
first_sequence = preprocessor.tokenizer.unk_token | |||
first_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token | |||
second_sequence = None | |||
batched = [] | |||
@@ -17,7 +17,7 @@ from modelscope.utils.regress_test_utils import (compare_arguments_nested, | |||
numpify_tensor_nested) | |||
from .base import Exporter | |||
logger = get_logger(__name__) | |||
logger = get_logger() | |||
class TorchModelExporter(Exporter): | |||
@@ -2,6 +2,7 @@ | |||
# yapf: disable | |||
import datetime | |||
import functools | |||
import os | |||
import pickle | |||
import platform | |||
@@ -14,10 +15,12 @@ from http.cookiejar import CookieJar | |||
from os.path import expanduser | |||
from typing import Dict, List, Optional, Tuple, Union | |||
import requests | |||
from requests import Session | |||
from requests.adapters import HTTPAdapter, Retry | |||
from modelscope import __version__ | |||
from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, | |||
from modelscope.hub.constants import (API_HTTP_CLIENT_TIMEOUT, | |||
API_RESPONSE_FIELD_DATA, | |||
API_RESPONSE_FIELD_EMAIL, | |||
API_RESPONSE_FIELD_GIT_ACCESS_TOKEN, | |||
API_RESPONSE_FIELD_MESSAGE, | |||
@@ -25,7 +28,8 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, | |||
DEFAULT_CREDENTIALS_PATH, | |||
MODELSCOPE_CLOUD_ENVIRONMENT, | |||
MODELSCOPE_CLOUD_USERNAME, | |||
ONE_YEAR_SECONDS, Licenses, | |||
ONE_YEAR_SECONDS, | |||
REQUESTS_API_HTTP_METHOD, Licenses, | |||
ModelVisibility) | |||
from modelscope.hub.errors import (InvalidParameter, NotExistError, | |||
NotLoginException, NoValidRevisionError, | |||
@@ -54,6 +58,17 @@ class HubApi: | |||
def __init__(self, endpoint=None): | |||
self.endpoint = endpoint if endpoint is not None else get_endpoint() | |||
self.headers = {'user-agent': ModelScopeConfig.get_user_agent()} | |||
self.session = Session() | |||
retry = Retry(total=2, read=2, connect=2, backoff_factor=1, | |||
status_forcelist=(500, 502, 503, 504),) | |||
adapter = HTTPAdapter(max_retries=retry) | |||
self.session.mount('http://', adapter) | |||
self.session.mount('https://', adapter) | |||
# set http timeout | |||
for method in REQUESTS_API_HTTP_METHOD: | |||
setattr(self.session, | |||
method, | |||
functools.partial(getattr(self.session, method), timeout=API_HTTP_CLIENT_TIMEOUT)) | |||
def login( | |||
self, | |||
@@ -73,7 +88,7 @@ class HubApi: | |||
</Tip> | |||
""" | |||
path = f'{self.endpoint}/api/v1/login' | |||
r = requests.post( | |||
r = self.session.post( | |||
path, json={'AccessToken': access_token}, headers=self.headers) | |||
raise_for_http_status(r) | |||
d = r.json() | |||
@@ -129,7 +144,7 @@ class HubApi: | |||
'Visibility': visibility, # server check | |||
'License': license | |||
} | |||
r = requests.post( | |||
r = self.session.post( | |||
path, json=body, cookies=cookies, headers=self.headers) | |||
handle_http_post_error(r, path, body) | |||
raise_on_error(r.json()) | |||
@@ -150,7 +165,7 @@ class HubApi: | |||
raise ValueError('Token does not exist, please login first.') | |||
path = f'{self.endpoint}/api/v1/models/{model_id}' | |||
r = requests.delete(path, cookies=cookies, headers=self.headers) | |||
r = self.session.delete(path, cookies=cookies, headers=self.headers) | |||
raise_for_http_status(r) | |||
raise_on_error(r.json()) | |||
@@ -183,7 +198,7 @@ class HubApi: | |||
else: | |||
path = f'{self.endpoint}/api/v1/models/{owner_or_group}/{name}' | |||
r = requests.get(path, cookies=cookies, headers=self.headers) | |||
r = self.session.get(path, cookies=cookies, headers=self.headers) | |||
handle_http_response(r, logger, cookies, model_id) | |||
if r.status_code == HTTPStatus.OK: | |||
if is_ok(r.json()): | |||
@@ -311,7 +326,7 @@ class HubApi: | |||
""" | |||
cookies = ModelScopeConfig.get_cookies() | |||
path = f'{self.endpoint}/api/v1/models/' | |||
r = requests.put( | |||
r = self.session.put( | |||
path, | |||
data='{"Path":"%s", "PageNumber":%s, "PageSize": %s}' % | |||
(owner_or_group, page_number, page_size), | |||
@@ -360,7 +375,7 @@ class HubApi: | |||
if cutoff_timestamp is None: | |||
cutoff_timestamp = get_release_datetime() | |||
path = f'{self.endpoint}/api/v1/models/{model_id}/revisions?EndTime=%s' % cutoff_timestamp | |||
r = requests.get(path, cookies=cookies, headers=self.headers) | |||
r = self.session.get(path, cookies=cookies, headers=self.headers) | |||
handle_http_response(r, logger, cookies, model_id) | |||
d = r.json() | |||
raise_on_error(d) | |||
@@ -422,7 +437,7 @@ class HubApi: | |||
cookies = self._check_cookie(use_cookies) | |||
path = f'{self.endpoint}/api/v1/models/{model_id}/revisions' | |||
r = requests.get(path, cookies=cookies, headers=self.headers) | |||
r = self.session.get(path, cookies=cookies, headers=self.headers) | |||
handle_http_response(r, logger, cookies, model_id) | |||
d = r.json() | |||
raise_on_error(d) | |||
@@ -467,7 +482,7 @@ class HubApi: | |||
if root is not None: | |||
path = path + f'&Root={root}' | |||
r = requests.get( | |||
r = self.session.get( | |||
path, cookies=cookies, headers={ | |||
**headers, | |||
**self.headers | |||
@@ -488,7 +503,7 @@ class HubApi: | |||
def list_datasets(self): | |||
path = f'{self.endpoint}/api/v1/datasets' | |||
params = {} | |||
r = requests.get(path, params=params, headers=self.headers) | |||
r = self.session.get(path, params=params, headers=self.headers) | |||
raise_for_http_status(r) | |||
dataset_list = r.json()[API_RESPONSE_FIELD_DATA] | |||
return [x['Name'] for x in dataset_list] | |||
@@ -514,13 +529,13 @@ class HubApi: | |||
os.makedirs(cache_dir, exist_ok=True) | |||
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}' | |||
cookies = ModelScopeConfig.get_cookies() | |||
r = requests.get(datahub_url, cookies=cookies) | |||
r = self.session.get(datahub_url, cookies=cookies) | |||
resp = r.json() | |||
datahub_raise_on_error(datahub_url, resp) | |||
dataset_id = resp['Data']['Id'] | |||
dataset_type = resp['Data']['Type'] | |||
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' | |||
r = requests.get(datahub_url, cookies=cookies, headers=self.headers) | |||
r = self.session.get(datahub_url, cookies=cookies, headers=self.headers) | |||
resp = r.json() | |||
datahub_raise_on_error(datahub_url, resp) | |||
file_list = resp['Data'] | |||
@@ -539,7 +554,7 @@ class HubApi: | |||
if extension in dataset_meta_format: | |||
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ | |||
f'Revision={revision}&FilePath={file_path}' | |||
r = requests.get(datahub_url, cookies=cookies) | |||
r = self.session.get(datahub_url, cookies=cookies) | |||
raise_for_http_status(r) | |||
local_path = os.path.join(cache_dir, file_path) | |||
if os.path.exists(local_path): | |||
@@ -584,7 +599,7 @@ class HubApi: | |||
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ | |||
f'ststoken?Revision={revision}' | |||
r = requests.get(url=datahub_url, cookies=cookies, headers=self.headers) | |||
r = self.session.get(url=datahub_url, cookies=cookies, headers=self.headers) | |||
resp = r.json() | |||
raise_on_error(resp) | |||
return resp['Data'] | |||
@@ -595,7 +610,7 @@ class HubApi: | |||
f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}' | |||
cookies = ModelScopeConfig.get_cookies() | |||
resp = requests.get(url=url, cookies=cookies) | |||
resp = self.session.get(url=url, cookies=cookies) | |||
resp = resp.json() | |||
raise_on_error(resp) | |||
resp = resp['Data'] | |||
@@ -604,7 +619,7 @@ class HubApi: | |||
def on_dataset_download(self, dataset_name: str, namespace: str) -> None: | |||
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase' | |||
cookies = ModelScopeConfig.get_cookies() | |||
r = requests.post(url, cookies=cookies, headers=self.headers) | |||
r = self.session.post(url, cookies=cookies, headers=self.headers) | |||
raise_for_http_status(r) | |||
def delete_oss_dataset_object(self, object_name: str, dataset_name: str, | |||
@@ -615,7 +630,7 @@ class HubApi: | |||
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss?Path={object_name}&Revision={revision}' | |||
cookies = self.check_local_cookies(use_cookies=True) | |||
resp = requests.delete(url=url, cookies=cookies) | |||
resp = self.session.delete(url=url, cookies=cookies) | |||
resp = resp.json() | |||
raise_on_error(resp) | |||
resp = resp['Message'] | |||
@@ -630,16 +645,15 @@ class HubApi: | |||
f'&Revision={revision}' | |||
cookies = self.check_local_cookies(use_cookies=True) | |||
resp = requests.delete(url=url, cookies=cookies) | |||
resp = self.session.delete(url=url, cookies=cookies) | |||
resp = resp.json() | |||
raise_on_error(resp) | |||
resp = resp['Message'] | |||
return resp | |||
@staticmethod | |||
def datahub_remote_call(url): | |||
def datahub_remote_call(self, url): | |||
cookies = ModelScopeConfig.get_cookies() | |||
r = requests.get(url, cookies=cookies, headers={'user-agent': ModelScopeConfig.get_user_agent()}) | |||
r = self.session.get(url, cookies=cookies, headers={'user-agent': ModelScopeConfig.get_user_agent()}) | |||
resp = r.json() | |||
datahub_raise_on_error(url, resp) | |||
return resp['Data'] | |||
@@ -661,7 +675,7 @@ class HubApi: | |||
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}?user={user_name}' | |||
cookies = ModelScopeConfig.get_cookies() | |||
r = requests.post(url, cookies=cookies, headers=self.headers) | |||
r = self.session.post(url, cookies=cookies, headers=self.headers) | |||
resp = r.json() | |||
raise_on_error(resp) | |||
return resp['Message'] | |||
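Taken together, the api.py changes route every request through one `Session` configured once. A standalone sketch of the pattern, with the constant values as stand-ins for those in constants.py:

```python
# Standalone sketch of the shared-session pattern: mounted retries with
# exponential backoff, plus a default timeout bound onto every HTTP verb.
# The constant values below are stand-ins, not authoritative.
import functools

from requests import Session
from requests.adapters import HTTPAdapter, Retry

HTTP_TIMEOUT = 60  # seconds, mirrors API_HTTP_CLIENT_TIMEOUT
HTTP_METHODS = ['get', 'head', 'post', 'put', 'patch', 'delete']

session = Session()
retry = Retry(total=2, read=2, connect=2, backoff_factor=1,
              status_forcelist=(500, 502, 503, 504))
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

# Bind a default timeout so no call site can forget one; an explicit
# timeout=... passed at a call site still overrides the partial's default.
for method in HTTP_METHODS:
    setattr(session, method,
            functools.partial(getattr(session, method), timeout=HTTP_TIMEOUT))

r = session.get('https://www.modelscope.cn')  # uses the 60s default timeout
```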
@@ -11,7 +11,12 @@ MODEL_ID_SEPARATOR = '/' | |||
FILE_HASH = 'Sha256' | |||
LOGGER_NAME = 'ModelScopeHub' | |||
DEFAULT_CREDENTIALS_PATH = Path.home().joinpath('.modelscope', 'credentials') | |||
REQUESTS_API_HTTP_METHOD = ['get', 'head', 'post', 'put', 'patch', 'delete'] | |||
API_HTTP_CLIENT_TIMEOUT = 60 | |||
API_RESPONSE_FIELD_DATA = 'Data' | |||
API_FILE_DOWNLOAD_RETRY_TIMES = 5 | |||
API_FILE_DOWNLOAD_TIMEOUT = 60 * 5 | |||
API_FILE_DOWNLOAD_CHUNK_SIZE = 4096 | |||
API_RESPONSE_FIELD_GIT_ACCESS_TOKEN = 'AccessToken' | |||
API_RESPONSE_FIELD_USERNAME = 'Username' | |||
API_RESPONSE_FIELD_EMAIL = 'Email' | |||
@@ -9,13 +9,16 @@ from pathlib import Path | |||
from typing import Dict, Optional, Union | |||
import requests | |||
from requests.adapters import Retry | |||
from tqdm import tqdm | |||
from modelscope import __version__ | |||
from modelscope.hub.api import HubApi, ModelScopeConfig | |||
from modelscope.hub.constants import (API_FILE_DOWNLOAD_CHUNK_SIZE, | |||
API_FILE_DOWNLOAD_RETRY_TIMES, | |||
API_FILE_DOWNLOAD_TIMEOUT, FILE_HASH) | |||
from modelscope.utils.constant import DEFAULT_MODEL_REVISION | |||
from modelscope.utils.logger import get_logger | |||
from .constants import FILE_HASH | |||
from .errors import FileDownloadError, NotExistError | |||
from .utils.caching import ModelFileSystemCache | |||
from .utils.utils import (file_integrity_validation, get_cache_dir, | |||
@@ -184,10 +187,7 @@ def http_get_file( | |||
headers: Optional[Dict[str, str]] = None, | |||
): | |||
""" | |||
Download remote file. Do not gobble up errors. | |||
This method is only used by snapshot_download, since the behavior is quite different with single file download | |||
TODO: consolidate with http_get_file() to avoild duplicate code | |||
Download a remote file. Retries up to 5 times before giving up on errors. | |||
Args: | |||
url(`str`): | |||
actual download url of the file | |||
@@ -204,30 +204,46 @@ def http_get_file( | |||
total = -1 | |||
temp_file_manager = partial( | |||
tempfile.NamedTemporaryFile, mode='wb', dir=local_dir, delete=False) | |||
get_headers = {} if headers is None else copy.deepcopy(headers) | |||
with temp_file_manager() as temp_file: | |||
logger.info('downloading %s to %s', url, temp_file.name) | |||
headers = copy.deepcopy(headers) | |||
r = requests.get(url, stream=True, headers=headers, cookies=cookies) | |||
r.raise_for_status() | |||
content_length = r.headers.get('Content-Length') | |||
total = int(content_length) if content_length is not None else None | |||
progress = tqdm( | |||
unit='B', | |||
unit_scale=True, | |||
unit_divisor=1024, | |||
total=total, | |||
initial=0, | |||
desc='Downloading', | |||
) | |||
for chunk in r.iter_content(chunk_size=1024): | |||
if chunk: # filter out keep-alive new chunks | |||
progress.update(len(chunk)) | |||
temp_file.write(chunk) | |||
progress.close() | |||
# retry with exponential backoff between attempts | |||
retry = Retry( | |||
total=API_FILE_DOWNLOAD_RETRY_TIMES, | |||
backoff_factor=1, | |||
allowed_methods=['GET']) | |||
while True: | |||
try: | |||
downloaded_size = temp_file.tell() | |||
get_headers['Range'] = 'bytes=%d-' % downloaded_size | |||
r = requests.get( | |||
url, | |||
stream=True, | |||
headers=get_headers, | |||
cookies=cookies, | |||
timeout=API_FILE_DOWNLOAD_TIMEOUT) | |||
r.raise_for_status() | |||
content_length = r.headers.get('Content-Length') | |||
total = int( | |||
content_length) if content_length is not None else None | |||
progress = tqdm( | |||
unit='B', | |||
unit_scale=True, | |||
unit_divisor=1024, | |||
total=total, | |||
initial=downloaded_size, | |||
desc='Downloading', | |||
) | |||
for chunk in r.iter_content( | |||
chunk_size=API_FILE_DOWNLOAD_CHUNK_SIZE): | |||
if chunk: # filter out keep-alive new chunks | |||
progress.update(len(chunk)) | |||
temp_file.write(chunk) | |||
progress.close() | |||
break | |||
except Exception as e:  # no matter what happens, we will retry. | |||
retry = retry.increment('GET', url, error=e) | |||
retry.sleep() | |||
logger.info('storing %s in cache at %s', url, local_dir) | |||
downloaded_length = os.path.getsize(temp_file.name) | |||
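The rewritten loop resumes from `temp_file.tell()` with a `Range` header and uses urllib3's `Retry` object purely as a backoff counter: `increment()` hands back a `Retry` with one less attempt in the budget and raises `MaxRetryError` once it is spent, which is what ultimately stops retrying on persistent failures. A minimal sketch of that contract:

```python
# Minimal sketch of the Retry-as-counter contract used by the loop above.
from urllib3.exceptions import MaxRetryError
from urllib3.util.retry import Retry

retry = Retry(total=2, backoff_factor=1, allowed_methods=['GET'])
attempt = 0
while True:
    try:
        attempt += 1
        raise IOError('simulated network failure %d' % attempt)
    except Exception as e:
        try:
            # increment() returns a new Retry with a reduced budget...
            retry = retry.increment('GET', 'https://example.com', error=e)
        except MaxRetryError:
            # ...and raises once the budget is exhausted, ending the loop.
            print('gave up after %d attempts' % attempt)
            break
        retry.sleep()  # exponential backoff between attempts
```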
@@ -94,7 +94,7 @@ class GitCommandWrapper(metaclass=Singleton): | |||
return False | |||
def git_lfs_install(self, repo_dir): | |||
cmd = ['git', '-C', repo_dir, 'lfs', 'install'] | |||
cmd = ['-C', repo_dir, 'lfs', 'install'] | |||
try: | |||
self._run_git_command(*cmd) | |||
return True | |||
@@ -36,14 +36,20 @@ class Models(object): | |||
swinL_semantic_segmentation = 'swinL-semantic-segmentation' | |||
vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' | |||
text_driven_segmentation = 'text-driven-segmentation' | |||
newcrfs_depth_estimation = 'newcrfs-depth-estimation' | |||
resnet50_bert = 'resnet50-bert' | |||
referring_video_object_segmentation = 'swinT-referring-video-object-segmentation' | |||
fer = 'fer' | |||
fairface = 'fairface' | |||
retinaface = 'retinaface' | |||
shop_segmentation = 'shop-segmentation' | |||
mogface = 'mogface' | |||
mtcnn = 'mtcnn' | |||
ulfd = 'ulfd' | |||
arcface = 'arcface' | |||
facemask = 'facemask' | |||
flc = 'flc' | |||
tinymog = 'tinymog' | |||
video_inpainting = 'video-inpainting' | |||
human_wholebody_keypoint = 'human-wholebody-keypoint' | |||
hand_static = 'hand-static' | |||
@@ -51,6 +57,7 @@ class Models(object): | |||
face_emotion = 'face-emotion' | |||
product_segmentation = 'product-segmentation' | |||
image_body_reshaping = 'image-body-reshaping' | |||
video_human_matting = 'video-human-matting' | |||
# EasyCV models | |||
yolox = 'YOLOX' | |||
@@ -71,6 +78,7 @@ class Models(object): | |||
space_T_en = 'space-T-en' | |||
space_T_cn = 'space-T-cn' | |||
tcrf = 'transformer-crf' | |||
token_classification_for_ner = 'token-classification-for-ner' | |||
tcrf_wseg = 'transformer-crf-for-word-segmentation' | |||
transformer_softmax = 'transformer-softmax' | |||
lcrf = 'lstm-crf' | |||
@@ -78,14 +86,17 @@ class Models(object): | |||
gcnncrf = 'gcnn-crf' | |||
bart = 'bart' | |||
gpt3 = 'gpt3' | |||
gpt_moe = 'gpt-moe' | |||
gpt_neo = 'gpt-neo' | |||
plug = 'plug' | |||
bert_for_ds = 'bert-for-document-segmentation' | |||
ponet_for_ds = 'ponet-for-document-segmentation' | |||
ponet = 'ponet' | |||
T5 = 'T5' | |||
mglm = 'mglm' | |||
codegeex = 'codegeex' | |||
bloom = 'bloom' | |||
unite = 'unite' | |||
# audio models | |||
sambert_hifigan = 'sambert-hifigan' | |||
@@ -152,6 +163,8 @@ class Pipelines(object): | |||
image_denoise = 'nafnet-image-denoise' | |||
person_image_cartoon = 'unet-person-image-cartoon' | |||
ocr_detection = 'resnet18-ocr-detection' | |||
table_recognition = 'dla34-table-recognition' | |||
license_plate_detection = 'resnet18-license-plate-detection' | |||
action_recognition = 'TAdaConv_action-recognition' | |||
animal_recognition = 'resnet101-animal-recognition' | |||
general_recognition = 'resnet101-general-recognition' | |||
@@ -166,17 +179,23 @@ class Pipelines(object): | |||
easycv_segmentation = 'easycv-segmentation' | |||
face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment' | |||
salient_detection = 'u2net-salient-detection' | |||
salient_boudary_detection = 'res2net-salient-detection' | |||
camouflaged_detection = 'res2net-camouflaged-detection' | |||
image_classification = 'image-classification' | |||
face_detection = 'resnet-face-detection-scrfd10gkps' | |||
card_detection = 'resnet-card-detection-scrfd34gkps' | |||
ulfd_face_detection = 'manual-face-detection-ulfd' | |||
tinymog_face_detection = 'manual-face-detection-tinymog' | |||
facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' | |||
facial_landmark_confidence = 'manual-facial-landmark-confidence-flcm' | |||
face_attribute_recognition = 'resnet34-face-attribute-recognition-fairface' | |||
retina_face_detection = 'resnet50-face-detection-retinaface' | |||
mog_face_detection = 'resnet101-face-detection-cvpr22papermogface' | |||
mtcnn_face_detection = 'manual-face-detection-mtcnn' | |||
live_category = 'live-category' | |||
general_image_classification = 'vit-base_image-classification_ImageNet-labels' | |||
daily_image_classification = 'vit-base_image-classification_Dailylife-labels' | |||
nextvit_small_daily_image_classification = 'nextvit-small_image-classification_Dailylife-labels' | |||
image_color_enhance = 'csrnet-image-color-enhance' | |||
virtual_try_on = 'virtual-try-on' | |||
image_colorization = 'unet-image-colorization' | |||
@@ -187,6 +206,8 @@ class Pipelines(object): | |||
realtime_object_detection = 'cspnet_realtime-object-detection_yolox' | |||
realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo' | |||
face_recognition = 'ir101-face-recognition-cfglint' | |||
arc_face_recognition = 'ir50-face-recognition-arcface' | |||
mask_face_recognition = 'resnet-face-recognition-facemask' | |||
image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation' | |||
image2image_translation = 'image-to-image-translation' | |||
live_category = 'live-category' | |||
@@ -205,6 +226,7 @@ class Pipelines(object): | |||
video_summarization = 'googlenet_pgl_video_summarization' | |||
language_guided_video_summarization = 'clip-it-video-summarization' | |||
image_semantic_segmentation = 'image-semantic-segmentation' | |||
image_depth_estimation = 'image-depth-estimation' | |||
image_reid_person = 'passvitb-image-reid-person' | |||
image_inpainting = 'fft-inpainting' | |||
text_driven_segmentation = 'text-driven-segmentation' | |||
@@ -219,6 +241,7 @@ class Pipelines(object): | |||
product_segmentation = 'product-segmentation' | |||
image_body_reshaping = 'flow-based-body-reshaping' | |||
referring_video_object_segmentation = 'referring-video-object-segmentation' | |||
video_human_matting = 'video-human-matting' | |||
# nlp tasks | |||
automatic_post_editing = 'automatic-post-editing' | |||
@@ -248,6 +271,7 @@ class Pipelines(object): | |||
text_error_correction = 'text-error-correction' | |||
plug_generation = 'plug-generation' | |||
gpt3_generation = 'gpt3-generation' | |||
gpt_moe_generation = 'gpt-moe-generation' | |||
faq_question_answering = 'faq-question-answering' | |||
conversational_text_to_sql = 'conversational-text-to-sql' | |||
table_question_answering_pipeline = 'table-question-answering-pipeline' | |||
@@ -255,6 +279,7 @@ class Pipelines(object): | |||
text_ranking = 'text-ranking' | |||
relation_extraction = 'relation-extraction' | |||
document_segmentation = 'document-segmentation' | |||
extractive_summarization = 'extractive-summarization' | |||
feature_extraction = 'feature-extraction' | |||
mglm_text_summarization = 'mglm-text-summarization' | |||
codegeex_code_translation = 'codegeex-code-translation' | |||
@@ -263,6 +288,7 @@ class Pipelines(object): | |||
translation_en_to_ro = 'translation_en_to_ro' # keep it underscore | |||
translation_en_to_fr = 'translation_en_to_fr' # keep it underscore | |||
token_classification = 'token-classification' | |||
translation_evaluation = 'translation-evaluation' | |||
# audio tasks | |||
sambert_hifigan_tts = 'sambert-hifigan-tts' | |||
@@ -285,6 +311,7 @@ class Pipelines(object): | |||
video_multi_modal_embedding = 'video-multi-modal-embedding' | |||
image_text_retrieval = 'image-text-retrieval' | |||
ofa_ocr_recognition = 'ofa-ocr-recognition' | |||
ofa_asr = 'ofa-asr' | |||
# science tasks | |||
protein_structure = 'unifold-protein-structure' | |||
@@ -318,6 +345,7 @@ class Trainers(object): | |||
image_inpainting = 'image-inpainting' | |||
referring_video_object_segmentation = 'referring-video-object-segmentation' | |||
image_classification_team = 'image-classification-team' | |||
image_classification = 'image-classification' | |||
# nlp trainers | |||
bert_sentiment_analysis = 'bert-sentiment-analysis' | |||
@@ -327,6 +355,8 @@ class Trainers(object): | |||
nlp_veco_trainer = 'nlp-veco-trainer' | |||
nlp_text_ranking_trainer = 'nlp-text-ranking-trainer' | |||
text_generation_trainer = 'text-generation-trainer' | |||
nlp_plug_trainer = 'nlp-plug-trainer' | |||
gpt3_trainer = 'nlp-gpt3-trainer' | |||
# audio trainers | |||
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' | |||
@@ -352,6 +382,7 @@ class Preprocessors(object): | |||
image_portrait_enhancement_preprocessor = 'image-portrait-enhancement-preprocessor' | |||
video_summarization_preprocessor = 'video-summarization-preprocessor' | |||
movie_scene_segmentation_preprocessor = 'movie-scene-segmentation-preprocessor' | |||
image_classification_bypass_preprocessor = 'image-classification-bypass-preprocessor' | |||
# nlp preprocessor | |||
sen_sim_tokenizer = 'sen-sim-tokenizer' | |||
@@ -388,6 +419,7 @@ class Preprocessors(object): | |||
feature_extraction = 'feature-extraction' | |||
mglm_summarization = 'mglm-summarization' | |||
sentence_piece = 'sentence-piece' | |||
translation_evaluation = 'translation-evaluation-preprocessor' | |||
# audio preprocessor | |||
linear_aec_fbank = 'linear-aec-fbank' | |||
@@ -489,6 +521,10 @@ class Hooks(object): | |||
# CLIP logit_scale clamp | |||
ClipClampLogitScaleHook = 'ClipClampLogitScaleHook' | |||
# train | |||
EarlyStopHook = 'EarlyStopHook' | |||
DeepspeedHook = 'DeepspeedHook' | |||
class LR_Schedulers(object): | |||
"""learning rate scheduler is defined here | |||
@@ -19,18 +19,27 @@ from .builder import METRICS, MetricKeys | |||
class SequenceClassificationMetric(Metric): | |||
"""The metric computation class for sequence classification tasks. | |||
This metric class calculates accuracy of the whole input batches. | |||
This metric class calculates the accuracy/F1 over all input batches. | |||
Args: | |||
label_name: The key of the label column in the 'inputs' arg. | |||
logit_name: The key of the logits column in the 'outputs' arg. | |||
""" | |||
def __init__(self, *args, **kwargs): | |||
def __init__(self, | |||
label_name=OutputKeys.LABELS, | |||
logit_name=OutputKeys.LOGITS, | |||
*args, | |||
**kwargs): | |||
super().__init__(*args, **kwargs) | |||
self.preds = [] | |||
self.labels = [] | |||
self.label_name = label_name | |||
self.logit_name = logit_name | |||
def add(self, outputs: Dict, inputs: Dict): | |||
label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS | |||
ground_truths = inputs[label_name] | |||
eval_results = outputs[OutputKeys.LOGITS] | |||
ground_truths = inputs[self.label_name] | |||
eval_results = outputs[self.logit_name] | |||
self.preds.append( | |||
torch_nested_numpify(torch_nested_detach(eval_results))) | |||
self.labels.append( | |||
@@ -18,16 +18,22 @@ class TextGenerationMetric(Metric): | |||
"""The metric computation class for text generation classes. | |||
This metric class calculates the ROUGE F1 score over the whole evaluation dataset. | |||
Args: | |||
target_text: The key of the target text column in the `inputs` arg. | |||
pred_text: The key of the predicted text column in the `outputs` arg. | |||
""" | |||
def __init__(self): | |||
def __init__(self, target_text='tgts', pred_text='preds'): | |||
self.preds: List[str] = [] | |||
self.tgts: List[str] = [] | |||
self.rouge = Rouge() | |||
self.target_text = target_text | |||
self.pred_text = pred_text | |||
def add(self, outputs: Dict[str, List[str]], inputs: Dict[str, List[str]]): | |||
ground_truths = inputs['tgts'] | |||
eval_results = outputs['preds'] | |||
ground_truths = inputs[self.target_text] | |||
eval_results = outputs[self.pred_text] | |||
for truth in ground_truths: | |||
self.tgts.append(rebuild_chinese_str(truth)) | |||
for result in eval_results: | |||
@@ -38,7 +44,7 @@ class TextGenerationMetric(Metric): | |||
def remove_useless(string: str) -> str: | |||
return string.replace(' ', '').replace('.', '') | |||
return remove_useless(pred) and remove_useless(tgt) | |||
return len(remove_useless(pred)) != 0 and len(remove_useless(tgt)) != 0 | |||
def evaluate(self): | |||
assert self.preds, 'preds in TextGenerationMetric must not be empty!' | |||
@@ -21,20 +21,16 @@ class TokenClassificationMetric(Metric): | |||
This metric class uses seqeval to calculate the scores. | |||
Args: | |||
return_entity_level_metrics (bool, *optional*): | |||
label_name(str, `optional`): The key of the label column in the 'inputs' arg. | |||
logit_name(str, `optional`): The key of the logits column in the 'outputs' arg. | |||
return_entity_level_metrics (bool, `optional`): | |||
Whether to return every label's detail metrics, default False. | |||
label2id(dict, `optional`): The label2id information to get the token labels. | |||
""" | |||
def add(self, outputs: Dict, inputs: Dict): | |||
label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS | |||
ground_truths = inputs[label_name] | |||
eval_results = outputs[OutputKeys.LOGITS] | |||
self.preds.append( | |||
torch_nested_numpify(torch_nested_detach(eval_results))) | |||
self.labels.append( | |||
torch_nested_numpify(torch_nested_detach(ground_truths))) | |||
def __init__(self, | |||
label_name=OutputKeys.LABELS, | |||
logit_name=OutputKeys.LOGITS, | |||
return_entity_level_metrics=False, | |||
label2id=None, | |||
*args, | |||
@@ -44,6 +40,16 @@ class TokenClassificationMetric(Metric): | |||
self.preds = [] | |||
self.labels = [] | |||
self.label2id = label2id | |||
self.label_name = label_name | |||
self.logit_name = logit_name | |||
def add(self, outputs: Dict, inputs: Dict): | |||
ground_truths = inputs[self.label_name] | |||
eval_results = outputs[self.logit_name] | |||
self.preds.append( | |||
torch_nested_numpify(torch_nested_detach(eval_results))) | |||
self.labels.append( | |||
torch_nested_numpify(torch_nested_detach(ground_truths))) | |||
def evaluate(self): | |||
label2id = self.label2id | |||
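All three metrics above follow the same accumulation contract: `add()` stashes numpified predictions and labels batch by batch, and `evaluate()` reduces them at the end. A hedged single-batch usage sketch with the new configurable keys (construction details beyond this diff are assumptions):

```python
import torch

# Hedged example of the add()/evaluate() contract; the default key names
# ('labels'/'logits') and the no-arg construction are assumptions based
# on the defaults introduced in this diff.
metric = SequenceClassificationMetric()
inputs = {'labels': torch.tensor([1, 0])}
outputs = {'logits': torch.tensor([[0.1, 0.9], [0.8, 0.2]])}
metric.add(outputs, inputs)
print(metric.evaluate())  # e.g. {'accuracy': 1.0, ...}
```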
@@ -5,10 +5,11 @@ from abc import ABC, abstractmethod | |||
from typing import Any, Callable, Dict, List, Optional, Union | |||
from modelscope.hub.snapshot_download import snapshot_download | |||
from modelscope.models.builder import MODELS, build_model | |||
from modelscope.utils.checkpoint import save_checkpoint, save_pretrained | |||
from modelscope.models.builder import build_model | |||
from modelscope.utils.checkpoint import (save_checkpoint, save_configuration, | |||
save_pretrained) | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile, Tasks | |||
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, Invoke, ModelFile | |||
from modelscope.utils.device import verify_device | |||
from modelscope.utils.logger import get_logger | |||
@@ -94,6 +95,10 @@ class Model(ABC): | |||
if prefetched is not None: | |||
kwargs.pop('model_prefetched') | |||
invoked_by = kwargs.get(Invoke.KEY) | |||
if invoked_by is not None: | |||
kwargs.pop(Invoke.KEY) | |||
if osp.exists(model_name_or_path): | |||
local_model_dir = model_name_or_path | |||
else: | |||
@@ -101,7 +106,13 @@ class Model(ABC): | |||
raise RuntimeError( | |||
'Expecting model is pre-fetched locally, but is not found.' | |||
) | |||
local_model_dir = snapshot_download(model_name_or_path, revision) | |||
if invoked_by is not None: | |||
invoked_by = '%s/%s' % (Invoke.KEY, invoked_by) | |||
else: | |||
invoked_by = '%s/%s' % (Invoke.KEY, Invoke.PRETRAINED) | |||
local_model_dir = snapshot_download( | |||
model_name_or_path, revision, user_agent=invoked_by) | |||
logger.info(f'initialize model from {local_model_dir}') | |||
if cfg_dict is not None: | |||
cfg = cfg_dict | |||
@@ -119,11 +130,9 @@ class Model(ABC): | |||
model_cfg[k] = v | |||
if device is not None: | |||
model_cfg.device = device | |||
model = build_model( | |||
model_cfg, task_name=task_name, default_args=kwargs) | |||
model = build_model(model_cfg, task_name=task_name) | |||
else: | |||
model = build_model( | |||
model_cfg, task_name=task_name, default_args=kwargs) | |||
model = build_model(model_cfg, task_name=task_name) | |||
# dynamically add pipeline info to model for pipeline inference | |||
if hasattr(cfg, 'pipeline'): | |||
@@ -132,7 +141,9 @@ class Model(ABC): | |||
if not hasattr(model, 'cfg'): | |||
model.cfg = cfg | |||
model_cfg.pop('model_dir', None) | |||
model.name = model_name_or_path | |||
model.model_dir = local_model_dir | |||
return model | |||
def save_pretrained(self, | |||
@@ -140,6 +151,7 @@ class Model(ABC): | |||
save_checkpoint_names: Union[str, List[str]] = None, | |||
save_function: Callable = save_checkpoint, | |||
config: Optional[dict] = None, | |||
save_config_function: Callable = save_configuration, | |||
**kwargs): | |||
"""save the pretrained model, its configuration and other related files to a directory, | |||
so that it can be re-loaded | |||
@@ -157,18 +169,15 @@ class Model(ABC): | |||
config (Optional[dict], optional): | |||
The config for the configuration.json, might not be identical with model.config | |||
save_config_function (Callable, optional): | |||
The function to use to save the configuration. | |||
""" | |||
if config is None and hasattr(self, 'cfg'): | |||
config = self.cfg | |||
assert config is not None, 'Cannot save the model because the model config is empty.' | |||
if isinstance(config, Config): | |||
config = config.to_dict() | |||
if 'preprocessor' in config and config['preprocessor'] is not None: | |||
if 'mode' in config['preprocessor']: | |||
config['preprocessor']['mode'] = 'inference' | |||
elif 'val' in config['preprocessor'] and 'mode' in config[ | |||
'preprocessor']['val']: | |||
config['preprocessor']['val']['mode'] = 'inference' | |||
if config is not None: | |||
save_config_function(target_folder, config) | |||
save_pretrained(self, target_folder, save_checkpoint_names, | |||
save_function, config, **kwargs) | |||
save_function, **kwargs) |
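A hedged usage sketch of the updated flow (the model id and target path are placeholders):

```python
from modelscope.models import Model

# Placeholders: any valid ModelScope model id and writable directory work.
model = Model.from_pretrained(
    'damo/nlp_structbert_sentence-similarity_chinese-base')
# Configuration saving is now delegated to save_config_function, which
# defaults to save_configuration, so a plain call keeps the old behaviour.
model.save_pretrained('./my_finetuned_model')
```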
@@ -6,7 +6,7 @@ import torch | |||
from modelscope.models.base.base_head import Head | |||
from modelscope.utils.logger import get_logger | |||
logger = get_logger(__name__) | |||
logger = get_logger() | |||
class TorchHead(Head, torch.nn.Module): | |||
@@ -6,10 +6,11 @@ import torch | |||
from torch import nn | |||
from modelscope.utils.file_utils import func_receive_dict_inputs | |||
from modelscope.utils.hub import parse_label_mapping | |||
from modelscope.utils.logger import get_logger | |||
from .base_model import Model | |||
logger = get_logger(__name__) | |||
logger = get_logger() | |||
class TorchModel(Model, torch.nn.Module): | |||
@@ -5,11 +5,14 @@ import os.path as osp | |||
import shutil | |||
import subprocess | |||
import uuid | |||
from tempfile import TemporaryDirectory | |||
from urllib.parse import urlparse | |||
import cv2 | |||
import numpy as np | |||
import onnxruntime as rt | |||
from modelscope.hub.file_download import http_get_file | |||
from modelscope.models import Model | |||
from modelscope.utils.constant import Devices | |||
from modelscope.utils.device import verify_device | |||
@@ -22,8 +25,9 @@ class ActionDetONNX(Model): | |||
model_file = osp.join(config['model_file']) | |||
device_type, device_id = verify_device(self._device_name) | |||
options = rt.SessionOptions() | |||
options.intra_op_num_threads = 1 | |||
options.inter_op_num_threads = 1 | |||
op_num_threads = config.get('op_num_threads', 1) | |||
options.intra_op_num_threads = op_num_threads | |||
options.inter_op_num_threads = op_num_threads | |||
if device_type == Devices.gpu: | |||
sess = rt.InferenceSession( | |||
model_file, | |||
@@ -84,37 +88,43 @@ class ActionDetONNX(Model): | |||
def forward_video(self, video_name, scale): | |||
min_size, max_size = self._get_sizes(scale) | |||
tmp_dir = osp.join( | |||
self.tmp_dir, | |||
str(uuid.uuid1()) + '_' + osp.basename(video_name)[:-4]) | |||
if osp.exists(tmp_dir): | |||
shutil.rmtree(tmp_dir) | |||
os.makedirs(tmp_dir) | |||
url_parsed = urlparse(video_name) | |||
frame_rate = 2 | |||
cmd = f'ffmpeg -y -loglevel quiet -ss 0 -t {self.video_length_limit}' + \ | |||
f' -i {video_name} -r {frame_rate} -f image2 {tmp_dir}/%06d.jpg' | |||
cmd = cmd.split(' ') | |||
subprocess.call(cmd) | |||
frame_names = [ | |||
osp.join(tmp_dir, name) for name in sorted(os.listdir(tmp_dir)) | |||
if name.endswith('.jpg') | |||
] | |||
frame_names = [ | |||
frame_names[i:i + frame_rate * 2] | |||
for i in range(0, | |||
len(frame_names) - frame_rate * 2 + 1, frame_rate | |||
* self.temporal_stride) | |||
] | |||
timestamp = list( | |||
range(1, | |||
len(frame_names) * self.temporal_stride, | |||
self.temporal_stride)) | |||
batch_imgs = [self.parse_frames(names) for names in frame_names] | |||
shutil.rmtree(tmp_dir) | |||
with TemporaryDirectory() as temporary_cache_dir: | |||
if url_parsed.scheme in ('file', '') and osp.exists( | |||
url_parsed.path): | |||
local_video_name = video_name | |||
else: | |||
random_str = str(uuid.uuid1()) | |||
http_get_file( | |||
url=video_name, | |||
local_dir=temporary_cache_dir, | |||
file_name=random_str, | |||
headers={}, | |||
cookies=None) | |||
local_video_name = osp.join(temporary_cache_dir, random_str) | |||
cmd = f'ffmpeg -y -loglevel quiet -ss 0 -t {self.video_length_limit}' + \ | |||
f' -i {local_video_name} -r {frame_rate} -f' + \ | |||
f' image2 {temporary_cache_dir}/%06d_out.jpg' | |||
cmd = cmd.split(' ') | |||
subprocess.call(cmd) | |||
frame_names = [ | |||
osp.join(temporary_cache_dir, name) | |||
for name in sorted(os.listdir(temporary_cache_dir)) | |||
if name.endswith('_out.jpg') | |||
] | |||
frame_names = [ | |||
frame_names[i:i + frame_rate * 2] | |||
for i in range(0, | |||
len(frame_names) - frame_rate * 2 | |||
+ 1, frame_rate * self.temporal_stride) | |||
] | |||
timestamp = list( | |||
range(1, | |||
len(frame_names) * self.temporal_stride, | |||
self.temporal_stride)) | |||
batch_imgs = [self.parse_frames(names) for names in frame_names] | |||
N, _, T, H, W = batch_imgs[0].shape | |||
scale_min = min_size / min(H, W) | |||
h, w = min(int(scale_min * H), | |||
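The thread-count change swaps hard-coded session threading for a config knob. A standalone sketch of the same onnxruntime pattern (the model path and config dict are placeholders):

```python
import onnxruntime as rt

# Standalone sketch of the configurable threading pattern from the diff;
# 'model.onnx' and the config dict are placeholders.
config = {'op_num_threads': 4}
options = rt.SessionOptions()
op_num_threads = config.get('op_num_threads', 1)  # default keeps the old behaviour
options.intra_op_num_threads = op_num_threads  # parallelism inside one operator
options.inter_op_num_threads = op_num_threads  # parallelism across operators
sess = rt.InferenceSession(
    'model.onnx', sess_options=options, providers=['CPUExecutionProvider'])
```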
@@ -224,8 +224,8 @@ class BodyKeypointsDetection3D(TorchModel): | |||
lst_pose2d_cannoical.append(pose2d_canonical[:, | |||
i - pad:i + pad + 1]) | |||
input_pose2d_rr = torch.concat(lst_pose2d_cannoical, axis=0) | |||
input_pose2d_cannoical = torch.concat(lst_pose2d_cannoical, axis=0) | |||
input_pose2d_rr = torch.cat(lst_pose2d_cannoical, axis=0) | |||
input_pose2d_cannoical = torch.cat(lst_pose2d_cannoical, axis=0) | |||
if self.cfg.model.MODEL.USE_CANONICAL_COORDS: | |||
input_pose2d_abs = input_pose2d_cannoical.clone() | |||
@@ -0,0 +1,20 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .fair_face import FaceAttributeRecognition | |||
else: | |||
_import_structure = {'fair_face': ['FaceAttributeRecognition']} | |||
import sys | |||
sys.modules[__name__] = LazyImportModule( | |||
__name__, | |||
globals()['__file__'], | |||
_import_structure, | |||
module_spec=__spec__, | |||
extra_objects={}, | |||
) |
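This is the repository's standard lazy-import boilerplate: the `TYPE_CHECKING` branch keeps static analysis working, while at runtime `fair_face` is imported only when the attribute is first touched. Usage looks identical to an eager import (the package path below is inferred from context, since the diff omits file names):

```python
# The submodule is loaded only at this access, not at package import time.
# Package path inferred from context; the diff does not show file names.
from modelscope.models.cv.face_attribute_recognition import FaceAttributeRecognition
```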
@@ -0,0 +1,2 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .face_attribute_recognition import FaceAttributeRecognition |
@@ -0,0 +1,79 @@ | |||
# The implementation is based on FairFace, available at | |||
# https://github.com/dchen236/FairFace | |||
import os | |||
import cv2 | |||
import numpy as np | |||
import torch | |||
import torch.backends.cudnn as cudnn | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import torchvision | |||
from PIL import Image | |||
from torch.autograd import Variable | |||
from torchvision import datasets, models, transforms | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base import Tensor, TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
@MODELS.register_module( | |||
Tasks.face_attribute_recognition, module_name=Models.fairface) | |||
class FaceAttributeRecognition(TorchModel): | |||
def __init__(self, model_path, device='cuda'): | |||
super().__init__(model_path) | |||
cudnn.benchmark = True | |||
self.model_path = model_path | |||
self.device = device | |||
self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE, | |||
ModelFile.CONFIGURATION) | |||
fair_face = torchvision.models.resnet34(pretrained=False) | |||
fair_face.fc = nn.Linear(fair_face.fc.in_features, 18) | |||
self.net = fair_face | |||
self.load_model() | |||
self.net = self.net.to(device) | |||
self.trans = transforms.Compose([ | |||
transforms.ToPILImage(), | |||
transforms.Resize((224, 224)), | |||
transforms.ToTensor(), | |||
transforms.Normalize( | |||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |||
]) | |||
def load_model(self, load_to_cpu=False): | |||
pretrained_dict = torch.load( | |||
self.model_path, map_location=torch.device('cpu')) | |||
self.net.load_state_dict(pretrained_dict, strict=True) | |||
self.net.eval() | |||
def forward(self, img): | |||
""" FariFace model forward process. | |||
Args: | |||
img: [h, w, c] | |||
Return: | |||
list of attribute result: [gender_score, age_score] | |||
""" | |||
img = cv2.cvtColor(img.cpu().numpy(), cv2.COLOR_BGR2RGB) | |||
img = img.astype(np.uint8) | |||
inputs = self.trans(img) | |||
c, h, w = inputs.shape | |||
inputs = inputs.view(-1, c, h, w) | |||
inputs = inputs.to(self.device) | |||
with torch.no_grad():  # Variable(volatile=True) is deprecated; no_grad is the modern equivalent | |||
outputs = self.net(inputs)[0] | |||
gender_outputs = outputs[7:9] | |||
age_outputs = outputs[9:18] | |||
gender_score = F.softmax(gender_outputs, dim=0).cpu().tolist() | |||
age_score = F.softmax(age_outputs, dim=0).cpu().tolist() | |||
return [gender_score, age_score] |
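Callers still need to turn the two softmax vectors into labels. A hedged decoding sketch using the buckets published by the upstream FairFace repo (the label order is an assumption, since the diff never names the classes):

```python
import numpy as np

# Hedged decoding sketch; label order follows the upstream FairFace repo
# and is an assumption, not something this diff defines.
GENDERS = ['Male', 'Female']
AGE_BUCKETS = ['0-2', '3-9', '10-19', '20-29', '30-39',
               '40-49', '50-59', '60-69', '70+']

def decode(gender_score, age_score):
    gender = GENDERS[int(np.argmax(gender_score))]
    age = AGE_BUCKETS[int(np.argmax(age_score))]
    return gender, age
```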
@@ -9,13 +9,14 @@ if TYPE_CHECKING: | |||
from .retinaface import RetinaFaceDetection | |||
from .ulfd_slim import UlfdFaceDetector | |||
from .scrfd import ScrfdDetect | |||
from .scrfd import TinyMogDetect | |||
else: | |||
_import_structure = { | |||
'ulfd_slim': ['UlfdFaceDetector'], | |||
'retinaface': ['RetinaFaceDetection'], | |||
'mtcnn': ['MtcnnFaceDetector'], | |||
'mogface': ['MogFaceDetector'], | |||
'scrfd': ['ScrfdDetect'] | |||
'scrfd': ['TinyMogDetect', 'ScrfdDetect'], | |||
} | |||
import sys | |||
@@ -1,2 +1,3 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .scrfd_detect import ScrfdDetect | |||
from .tinymog_detect import TinyMogDetect |
@@ -2,6 +2,7 @@ | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones | |||
""" | |||
from .mobilenet import MobileNetV1 | |||
from .resnet import ResNetV1e | |||
__all__ = ['ResNetV1e'] | |||
__all__ = ['ResNetV1e', 'MobileNetV1'] |
@@ -0,0 +1,99 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/backbones/mobilenet.py | |||
""" | |||
import torch | |||
import torch.nn as nn | |||
from mmcv.cnn import (build_conv_layer, build_norm_layer, build_plugin_layer, | |||
constant_init, kaiming_init) | |||
from mmcv.runner import load_checkpoint | |||
from mmdet.models.builder import BACKBONES | |||
from mmdet.utils import get_root_logger | |||
from torch.nn.modules.batchnorm import _BatchNorm | |||
@BACKBONES.register_module() | |||
class MobileNetV1(nn.Module): | |||
def __init__(self, | |||
in_channels=3, | |||
block_cfg=None, | |||
num_stages=4, | |||
out_indices=(0, 1, 2, 3)): | |||
super(MobileNetV1, self).__init__() | |||
self.out_indices = out_indices | |||
def conv_bn(inp, oup, stride): | |||
return nn.Sequential( | |||
nn.Conv2d(inp, oup, 3, stride, 1, bias=False), | |||
nn.BatchNorm2d(oup), nn.ReLU(inplace=True)) | |||
def conv_dw(inp, oup, stride): | |||
return nn.Sequential( | |||
nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), | |||
nn.BatchNorm2d(inp), | |||
nn.ReLU(inplace=True), | |||
nn.Conv2d(inp, oup, 1, 1, 0, bias=False), | |||
nn.BatchNorm2d(oup), | |||
nn.ReLU(inplace=True), | |||
) | |||
if block_cfg is None: | |||
stage_planes = [8, 16, 32, 64, 128, 256] | |||
stage_blocks = [2, 4, 4, 2] | |||
else: | |||
stage_planes = block_cfg['stage_planes'] | |||
stage_blocks = block_cfg['stage_blocks'] | |||
assert len(stage_planes) == 6 | |||
assert len(stage_blocks) == 4 | |||
self.stem = nn.Sequential( | |||
conv_bn(3, stage_planes[0], 2), | |||
conv_dw(stage_planes[0], stage_planes[1], 1), | |||
) | |||
self.stage_layers = [] | |||
for i, num_blocks in enumerate(stage_blocks): | |||
_layers = [] | |||
for n in range(num_blocks): | |||
if n == 0: | |||
_layer = conv_dw(stage_planes[i + 1], stage_planes[i + 2], | |||
2) | |||
else: | |||
_layer = conv_dw(stage_planes[i + 2], stage_planes[i + 2], | |||
1) | |||
_layers.append(_layer) | |||
_block = nn.Sequential(*_layers) | |||
layer_name = f'layer{i + 1}' | |||
self.add_module(layer_name, _block) | |||
self.stage_layers.append(layer_name) | |||
def forward(self, x): | |||
output = [] | |||
x = self.stem(x) | |||
for i, layer_name in enumerate(self.stage_layers): | |||
stage_layer = getattr(self, layer_name) | |||
x = stage_layer(x) | |||
if i in self.out_indices: | |||
output.append(x) | |||
return tuple(output) | |||
def init_weights(self, pretrained=None): | |||
"""Initialize the weights in backbone. | |||
Args: | |||
pretrained (str, optional): Path to pre-trained weights. | |||
Defaults to None. | |||
""" | |||
if isinstance(pretrained, str): | |||
logger = get_root_logger() | |||
load_checkpoint(self, pretrained, strict=False, logger=logger) | |||
elif pretrained is None: | |||
for m in self.modules(): | |||
if isinstance(m, nn.Conv2d): | |||
kaiming_init(m) | |||
elif isinstance(m, (_BatchNorm, nn.GroupNorm)): | |||
constant_init(m, 1) | |||
else: | |||
raise TypeError('pretrained must be a str or None') |
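`conv_dw` is the classic MobileNetV1 depthwise-separable factorization: a 3x3 per-channel convolution followed by a 1x1 pointwise convolution, which shrinks per-pixel multiply-adds to roughly 1/oup + 1/9 of a dense 3x3 convolution. A quick check of that ratio:

```python
# Quick arithmetic check of the depthwise-separable saving for one layer.
inp, oup, k = 64, 128, 3
dense = k * k * inp * oup            # dense 3x3 conv, per output pixel
separable = k * k * inp + inp * oup  # depthwise 3x3 + pointwise 1x1
print(separable / dense)             # ~= 1/oup + 1/(k*k) ~= 0.119
```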
@@ -3,5 +3,6 @@ The implementation here is modified based on insightface, originally MIT license | |||
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors | |||
""" | |||
from .scrfd import SCRFD | |||
from .tinymog import TinyMog | |||
__all__ = ['SCRFD'] | |||
__all__ = ['SCRFD', 'TinyMog'] |
@@ -0,0 +1,148 @@ | |||
""" | |||
The implementation here is modified from insightface, originally under the MIT license and publicly available at | |||
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/models/detectors/scrfd.py | |||
""" | |||
import numpy as np  # used below for the keypoint-variance computation | |||
import torch | |||
from mmdet.models.builder import DETECTORS | |||
from mmdet.models.detectors.single_stage import SingleStageDetector | |||
from ....mmdet_patch.core.bbox import bbox2result | |||
@DETECTORS.register_module() | |||
class TinyMog(SingleStageDetector): | |||
def __init__(self, | |||
backbone, | |||
neck, | |||
bbox_head, | |||
train_cfg=None, | |||
test_cfg=None, | |||
pretrained=None): | |||
super(TinyMog, self).__init__(backbone, neck, bbox_head, train_cfg, | |||
test_cfg, pretrained) | |||
def forward_train(self, | |||
img, | |||
img_metas, | |||
gt_bboxes, | |||
gt_labels, | |||
gt_keypointss=None, | |||
gt_bboxes_ignore=None): | |||
""" | |||
Args: | |||
img (Tensor): Input images of shape (N, C, H, W). | |||
Typically these should be mean centered and std scaled. | |||
img_metas (list[dict]): A List of image info dict where each dict | |||
has: 'img_shape', 'scale_factor', 'flip', and may also contain | |||
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
For details on the values of these keys see | |||
:class:`mmdet.datasets.pipelines.Collect`. | |||
gt_bboxes (list[Tensor]): Each item is the ground-truth boxes for one | |||
image in [tl_x, tl_y, br_x, br_y] format. | |||
gt_labels (list[Tensor]): Class indices corresponding to each box. | |||
gt_keypointss (None | list[Tensor]): Ground-truth keypoints for each | |||
image, if available. | |||
gt_bboxes_ignore (None | list[Tensor]): Specify which bounding | |||
boxes can be ignored when computing the loss. | |||
Returns: | |||
dict[str, Tensor]: A dictionary of loss components. | |||
""" | |||
super(SingleStageDetector, self).forward_train(img, img_metas) | |||
x = self.extract_feat(img) | |||
losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, | |||
gt_labels, gt_keypointss, | |||
gt_bboxes_ignore) | |||
return losses | |||
def simple_test(self, | |||
img, | |||
img_metas, | |||
rescale=False, | |||
repeat_head=1, | |||
output_kps_var=0, | |||
output_results=1): | |||
"""Test function without test time augmentation. | |||
Args: | |||
img (torch.Tensor): Input images of shape (N, C, H, W). | |||
img_metas (list[dict]): List of image information. | |||
rescale (bool, optional): Whether to rescale the results. | |||
Defaults to False. | |||
repeat_head (int): number of times to repeat inference in the head | |||
output_kps_var (int): whether to output keypoint variance as a quality estimate | |||
output_results (int): 0: nothing, 1: bbox only, 2: both bbox and kps | |||
Returns: | |||
list[list[np.ndarray]]: BBox results of each image and classes. | |||
The outer list corresponds to each image. The inner list | |||
corresponds to each class. | |||
""" | |||
x = self.extract_feat(img) | |||
assert repeat_head >= 1 | |||
kps_out0 = [] | |||
kps_out1 = [] | |||
kps_out2 = [] | |||
for i in range(repeat_head): | |||
outs = self.bbox_head(x) | |||
kps_out0 += [outs[2][0].detach().cpu().numpy()] | |||
kps_out1 += [outs[2][1].detach().cpu().numpy()] | |||
kps_out2 += [outs[2][2].detach().cpu().numpy()] | |||
if output_kps_var: | |||
var0 = np.var(np.vstack(kps_out0), axis=0).mean() | |||
var1 = np.var(np.vstack(kps_out1), axis=0).mean() | |||
var2 = np.var(np.vstack(kps_out2), axis=0).mean() | |||
var = np.mean([var0, var1, var2]) | |||
else: | |||
var = None | |||
if output_results > 0: | |||
if torch.onnx.is_in_onnx_export(): | |||
cls_score, bbox_pred, kps_pred = outs | |||
for c in cls_score: | |||
print(c.shape) | |||
for c in bbox_pred: | |||
print(c.shape) | |||
if self.bbox_head.use_kps: | |||
for c in kps_pred: | |||
print(c.shape) | |||
return (cls_score, bbox_pred, kps_pred) | |||
else: | |||
return (cls_score, bbox_pred) | |||
bbox_list = self.bbox_head.get_bboxes( | |||
*outs, img_metas, rescale=rescale) | |||
# return kps if use_kps | |||
if len(bbox_list[0]) == 2: | |||
bbox_results = [ | |||
bbox2result(det_bboxes, det_labels, | |||
self.bbox_head.num_classes) | |||
for det_bboxes, det_labels in bbox_list | |||
] | |||
elif len(bbox_list[0]) == 3: | |||
if output_results == 2: | |||
bbox_results = [ | |||
bbox2result( | |||
det_bboxes, | |||
det_labels, | |||
self.bbox_head.num_classes, | |||
kps=det_kps, | |||
num_kps=self.bbox_head.NK) | |||
for det_bboxes, det_labels, det_kps in bbox_list | |||
] | |||
elif output_results == 1: | |||
bbox_results = [ | |||
bbox2result(det_bboxes, det_labels, | |||
self.bbox_head.num_classes) | |||
for det_bboxes, det_labels, _ in bbox_list | |||
] | |||
else: | |||
bbox_results = None | |||
if var is not None: | |||
return bbox_results, var | |||
else: | |||
return bbox_results | |||
def feature_test(self, img): | |||
x = self.extract_feat(img) | |||
outs = self.bbox_head(x) | |||
return outs |
@@ -0,0 +1,67 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os.path as osp | |||
from copy import deepcopy | |||
from typing import Any, Dict | |||
import torch | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.outputs import OutputKeys | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from modelscope.utils.logger import get_logger | |||
logger = get_logger() | |||
__all__ = ['TinyMogDetect'] | |||
@MODELS.register_module(Tasks.face_detection, module_name=Models.tinymog) | |||
class TinyMogDetect(TorchModel): | |||
def __init__(self, model_dir, *args, **kwargs): | |||
""" | |||
Initialize the TinyMog face detection model from the `model_dir` path. | |||
""" | |||
super().__init__(model_dir) | |||
from mmcv import Config | |||
from mmcv.parallel import MMDataParallel | |||
from mmcv.runner import load_checkpoint | |||
from mmdet.models import build_detector | |||
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset | |||
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop | |||
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e | |||
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead | |||
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD | |||
cfg = Config.fromfile(osp.join(model_dir, 'mmcv_tinymog.py')) | |||
ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
cfg.model.test_cfg.score_thr = kwargs.get('score_thr', 0.3) | |||
detector = build_detector(cfg.model) | |||
logger.info(f'loading model from {ckpt_path}') | |||
load_checkpoint(detector, ckpt_path, map_location='cpu') | |||
detector = MMDataParallel(detector) | |||
detector.eval() | |||
self.detector = detector | |||
logger.info('load model done') | |||
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
result = self.detector( | |||
return_loss=False, | |||
rescale=True, | |||
img=[input['img'][0].unsqueeze(0)], | |||
img_metas=[[dict(input['img_metas'][0].data)]], | |||
output_results=2) | |||
assert result is not None | |||
result = result[0][0] | |||
bboxes = result[:, :4].tolist() | |||
kpss = result[:, 5:].tolist() | |||
scores = result[:, 4].tolist() | |||
return { | |||
OutputKeys.SCORES: scores, | |||
OutputKeys.BOXES: bboxes, | |||
OutputKeys.KEYPOINTS: kpss | |||
} | |||
def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: | |||
return input |
@@ -0,0 +1,200 @@ | |||
# The implementation is adapted from TFace, made publicly available under the Apache-2.0 license at | |||
# https://github.com/deepinsight/insightface/blob/master/recognition/arcface_torch/backbones/iresnet.py | |||
import torch | |||
from torch import nn | |||
from torch.utils.checkpoint import checkpoint | |||
using_ckpt = False | |||
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): | |||
"""3x3 convolution with padding""" | |||
return nn.Conv2d( | |||
in_planes, | |||
out_planes, | |||
kernel_size=3, | |||
stride=stride, | |||
padding=dilation, | |||
groups=groups, | |||
bias=False, | |||
dilation=dilation) | |||
def conv1x1(in_planes, out_planes, stride=1): | |||
"""1x1 convolution""" | |||
return nn.Conv2d( | |||
in_planes, out_planes, kernel_size=1, stride=stride, bias=False) | |||
class IBasicBlock(nn.Module): | |||
expansion = 1 | |||
def __init__(self, | |||
inplanes, | |||
planes, | |||
stride=1, | |||
downsample=None, | |||
groups=1, | |||
base_width=64, | |||
dilation=1): | |||
super(IBasicBlock, self).__init__() | |||
if groups != 1 or base_width != 64: | |||
raise ValueError( | |||
'BasicBlock only supports groups=1 and base_width=64') | |||
if dilation > 1: | |||
raise NotImplementedError( | |||
'Dilation > 1 not supported in BasicBlock') | |||
self.bn1 = nn.BatchNorm2d( | |||
inplanes, | |||
eps=1e-05, | |||
) | |||
self.conv1 = conv3x3(inplanes, planes) | |||
self.bn2 = nn.BatchNorm2d( | |||
planes, | |||
eps=1e-05, | |||
) | |||
self.prelu = nn.PReLU(planes) | |||
self.conv2 = conv3x3(planes, planes, stride) | |||
self.bn3 = nn.BatchNorm2d( | |||
planes, | |||
eps=1e-05, | |||
) | |||
self.downsample = downsample | |||
self.stride = stride | |||
def forward(self, x): | |||
identity = x | |||
out = self.bn1(x) | |||
out = self.conv1(out) | |||
out = self.bn2(out) | |||
out = self.prelu(out) | |||
out = self.conv2(out) | |||
out = self.bn3(out) | |||
if self.downsample is not None: | |||
identity = self.downsample(x) | |||
out += identity | |||
return out | |||
class IResNet(nn.Module): | |||
fc_scale = 7 * 7 | |||
def __init__(self, | |||
block, | |||
layers, | |||
dropout=0, | |||
num_features=512, | |||
zero_init_residual=False, | |||
groups=1, | |||
width_per_group=64, | |||
replace_stride_with_dilation=None, | |||
fp16=False): | |||
super(IResNet, self).__init__() | |||
self.extra_gflops = 0.0 | |||
self.fp16 = fp16 | |||
self.inplanes = 64 | |||
self.dilation = 1 | |||
if replace_stride_with_dilation is None: | |||
replace_stride_with_dilation = [False, False, False] | |||
if len(replace_stride_with_dilation) != 3: | |||
raise ValueError('replace_stride_with_dilation should be None ' | |||
'or a 3-element tuple, got {}'.format( | |||
replace_stride_with_dilation)) | |||
self.groups = groups | |||
self.base_width = width_per_group | |||
self.conv1 = nn.Conv2d( | |||
3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) | |||
self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05) | |||
self.prelu = nn.PReLU(self.inplanes) | |||
self.layer1 = self._make_layer(block, 64, layers[0], stride=2) | |||
self.layer2 = self._make_layer( | |||
block, | |||
128, | |||
layers[1], | |||
stride=2, | |||
dilate=replace_stride_with_dilation[0]) | |||
self.layer3 = self._make_layer( | |||
block, | |||
256, | |||
layers[2], | |||
stride=2, | |||
dilate=replace_stride_with_dilation[1]) | |||
self.layer4 = self._make_layer( | |||
block, | |||
512, | |||
layers[3], | |||
stride=2, | |||
dilate=replace_stride_with_dilation[2]) | |||
self.bn2 = nn.BatchNorm2d( | |||
512 * block.expansion, | |||
eps=1e-05, | |||
) | |||
self.dropout = nn.Dropout(p=dropout, inplace=True) | |||
self.fc = nn.Linear(512 * block.expansion * self.fc_scale, | |||
num_features) | |||
self.features = nn.BatchNorm1d(num_features, eps=1e-05) | |||
nn.init.constant_(self.features.weight, 1.0) | |||
self.features.weight.requires_grad = False | |||
for m in self.modules(): | |||
if isinstance(m, nn.Conv2d): | |||
nn.init.normal_(m.weight, 0, 0.1) | |||
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): | |||
nn.init.constant_(m.weight, 1) | |||
nn.init.constant_(m.bias, 0) | |||
if zero_init_residual: | |||
for m in self.modules(): | |||
if isinstance(m, IBasicBlock): | |||
nn.init.constant_(m.bn2.weight, 0) | |||
def _make_layer(self, block, planes, blocks, stride=1, dilate=False): | |||
downsample = None | |||
previous_dilation = self.dilation | |||
if dilate: | |||
self.dilation *= stride | |||
stride = 1 | |||
if stride != 1 or self.inplanes != planes * block.expansion: | |||
downsample = nn.Sequential( | |||
conv1x1(self.inplanes, planes * block.expansion, stride), | |||
nn.BatchNorm2d( | |||
planes * block.expansion, | |||
eps=1e-05, | |||
), | |||
) | |||
layers = [] | |||
layers.append( | |||
block(self.inplanes, planes, stride, downsample, self.groups, | |||
self.base_width, previous_dilation)) | |||
self.inplanes = planes * block.expansion | |||
for _ in range(1, blocks): | |||
layers.append( | |||
block( | |||
self.inplanes, | |||
planes, | |||
groups=self.groups, | |||
base_width=self.base_width, | |||
dilation=self.dilation)) | |||
return nn.Sequential(*layers) | |||
def forward(self, x): | |||
with torch.cuda.amp.autocast(self.fp16): | |||
x = self.conv1(x) | |||
x = self.bn1(x) | |||
x = self.prelu(x) | |||
x = self.layer1(x) | |||
x = self.layer2(x) | |||
x = self.layer3(x) | |||
x = self.layer4(x) | |||
x = self.bn2(x) | |||
x = torch.flatten(x, 1) | |||
x = self.dropout(x) | |||
x = self.fc(x.float() if self.fp16 else x) | |||
x = self.features(x) | |||
return x | |||
def _iresnet(arch, layers): | |||
model = IResNet(IBasicBlock, layers) | |||
return model |
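if __name__ == '__main__':
    # minimal smoke-test sketch (hypothetical depths; `_iresnet` ignores `arch`
    # and only uses `layers`): four stride-2 stages reduce 112 -> 7, matching
    # fc_scale = 7 * 7, so a 112x112 face crop maps to a 512-d embedding
    model = _iresnet('iresnet18', [2, 2, 2, 2]).eval()
    emb = model(torch.randn(1, 3, 112, 112))
    assert emb.shape == (1, 512)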
@@ -0,0 +1,213 @@ | |||
# The implementation is adopted from InsightFace, made publicly available under the Apache-2.0 license at
# https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py | |||
from collections import namedtuple | |||
import torch | |||
import torch.nn.functional as F | |||
from torch import nn | |||
from torch.nn import (AdaptiveAvgPool2d, AvgPool2d, BatchNorm1d, BatchNorm2d, | |||
Conv2d, Dropout, Dropout2d, Linear, MaxPool2d, Module, | |||
Parameter, PReLU, ReLU, Sequential, Sigmoid) | |||
class Flatten(Module): | |||
def forward(self, input): | |||
return input.view(input.size(0), -1) | |||
class SEModule(Module): | |||
def __init__(self, channels, reduction): | |||
super(SEModule, self).__init__() | |||
self.avg_pool = AdaptiveAvgPool2d(1) | |||
self.fc1 = Conv2d( | |||
channels, | |||
channels // reduction, | |||
kernel_size=1, | |||
padding=0, | |||
bias=False) | |||
self.relu = ReLU(inplace=True) | |||
self.fc2 = Conv2d( | |||
channels // reduction, | |||
channels, | |||
kernel_size=1, | |||
padding=0, | |||
bias=False) | |||
self.sigmoid = Sigmoid() | |||
def forward(self, x): | |||
module_input = x | |||
x = self.avg_pool(x) | |||
x = self.fc1(x) | |||
x = self.relu(x) | |||
x = self.fc2(x) | |||
x = self.sigmoid(x) | |||
return module_input * x | |||
class BottleneckIR(Module): | |||
def __init__(self, in_channel, depth, stride): | |||
super(BottleneckIR, self).__init__() | |||
if in_channel == depth: | |||
self.shortcut_layer = MaxPool2d(1, stride) | |||
else: | |||
self.shortcut_layer = Sequential( | |||
Conv2d(in_channel, depth, (1, 1), stride, bias=False), | |||
BatchNorm2d(depth)) | |||
self.res_layer = Sequential( | |||
BatchNorm2d(in_channel), | |||
Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), | |||
PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), | |||
BatchNorm2d(depth)) | |||
def forward(self, x): | |||
shortcut = self.shortcut_layer(x) | |||
res = self.res_layer(x) | |||
return res + shortcut | |||
class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): | |||
'''A named tuple describing a ResNet block.''' | |||
def get_block(in_channel, depth, num_units, stride=2): | |||
return [Bottleneck(in_channel, depth, stride) | |||
] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] | |||
def get_blocks(num_layers): | |||
if num_layers == 50: | |||
blocks = [ | |||
get_block(in_channel=64, depth=64, num_units=3), | |||
get_block(in_channel=64, depth=128, num_units=4), | |||
get_block(in_channel=128, depth=256, num_units=14), | |||
get_block(in_channel=256, depth=512, num_units=3) | |||
] | |||
elif num_layers == 100: | |||
blocks = [ | |||
get_block(in_channel=64, depth=64, num_units=3), | |||
get_block(in_channel=64, depth=128, num_units=13), | |||
get_block(in_channel=128, depth=256, num_units=30), | |||
get_block(in_channel=256, depth=512, num_units=3) | |||
] | |||
elif num_layers == 152: | |||
blocks = [ | |||
get_block(in_channel=64, depth=64, num_units=3), | |||
get_block(in_channel=64, depth=128, num_units=8), | |||
get_block(in_channel=128, depth=256, num_units=36), | |||
get_block(in_channel=256, depth=512, num_units=3) | |||
] | |||
elif num_layers == 252: | |||
blocks = [ | |||
get_block(in_channel=64, depth=64, num_units=6), | |||
get_block(in_channel=64, depth=128, num_units=21), | |||
get_block(in_channel=128, depth=256, num_units=66), | |||
get_block(in_channel=256, depth=512, num_units=6) | |||
] | |||
return blocks | |||
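# e.g. get_blocks(100) yields 3 + 13 + 30 + 3 = 49 BottleneckIR units (the
# IR-100 layout), and get_blocks(252) yields 6 + 21 + 66 + 6 = 99 units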
class IResNet(Module): | |||
def __init__(self, | |||
dropout=0, | |||
num_features=512, | |||
zero_init_residual=False, | |||
groups=1, | |||
width_per_group=64, | |||
replace_stride_with_dilation=None, | |||
fp16=False, | |||
with_wcd=False, | |||
wrs_M=400, | |||
wrs_q=0.9): | |||
super(IResNet, self).__init__() | |||
num_layers = 252 | |||
mode = 'ir' | |||
        assert num_layers in [50, 100, 152,
                              252], 'num_layers should be 50, 100, 152 or 252'
assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se' | |||
self.fc_scale = 7 * 7 | |||
num_features = 512 | |||
self.fp16 = fp16 | |||
drop_ratio = 0.0 | |||
self.with_wcd = with_wcd | |||
if self.with_wcd: | |||
self.wrs_M = wrs_M | |||
self.wrs_q = wrs_q | |||
blocks = get_blocks(num_layers) | |||
if mode == 'ir': | |||
unit_module = BottleneckIR | |||
self.input_layer = Sequential( | |||
Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64), | |||
PReLU(64)) | |||
self.bn2 = nn.BatchNorm2d( | |||
512, | |||
eps=1e-05, | |||
) | |||
self.dropout = nn.Dropout(p=drop_ratio, inplace=True) | |||
self.fc = nn.Linear(512 * self.fc_scale, num_features) | |||
self.features = nn.BatchNorm1d(num_features, eps=1e-05) | |||
nn.init.constant_(self.features.weight, 1.0) | |||
self.features.weight.requires_grad = False | |||
modules = [] | |||
for block in blocks: | |||
for bottleneck in block: | |||
modules.append( | |||
unit_module(bottleneck.in_channel, bottleneck.depth, | |||
bottleneck.stride)) | |||
self.body = Sequential(*modules) | |||
def forward(self, x): | |||
with torch.cuda.amp.autocast(self.fp16): | |||
x = self.input_layer(x) | |||
x = self.body(x) | |||
x = self.bn2(x) | |||
if self.with_wcd: | |||
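                # weighted channel dropout: keep wrs_M of C channels per
                # sample, drawn with probability proportional to the channel's
                # mean absolute activation (weighted reservoir sampling via
                # key = r ** (1 / score)), randomly re-drop with rate
                # 1 - wrs_q, then rescale by alpha so the expected feature
                # magnitude is preserved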
B = x.size()[0] | |||
C = x.size()[1] | |||
x_abs = torch.abs(x) | |||
score = torch.nn.functional.adaptive_avg_pool2d(x_abs, | |||
1).reshape( | |||
(B, C)) | |||
r = torch.rand((B, C), device=x.device) | |||
key = torch.pow(r, 1. / score) | |||
_, topidx = torch.topk(key, self.wrs_M, dim=1) | |||
mask = torch.zeros_like(key, dtype=torch.float32) | |||
mask.scatter_(1, topidx, 1.) | |||
maskq = torch.rand((B, C), device=x.device) | |||
maskq_ones = torch.ones_like(maskq, dtype=torch.float32) | |||
maskq_zeros = torch.zeros_like(maskq, dtype=torch.float32) | |||
maskq_m = torch.where(maskq < self.wrs_q, maskq_ones, | |||
maskq_zeros) | |||
new_mask = mask * maskq_m | |||
score_sum = torch.sum(score, dim=1, keepdim=True) | |||
selected_score_sum = torch.sum( | |||
new_mask * score, dim=1, keepdim=True) | |||
alpha = score_sum / (selected_score_sum + 1e-6) | |||
alpha = alpha.reshape((B, 1, 1, 1)) | |||
new_mask = new_mask.reshape((B, C, 1, 1)) | |||
x = x * new_mask * alpha | |||
x = torch.flatten(x, 1) | |||
x = self.dropout(x) | |||
x = self.fc(x.float() if self.fp16 else x) | |||
x = self.features(x) | |||
return x | |||
def iresnet286(pretrained=False, progress=True, **kwargs): | |||
model = IResNet( | |||
dropout=0, | |||
num_features=512, | |||
zero_init_residual=False, | |||
groups=1, | |||
width_per_group=64, | |||
replace_stride_with_dilation=None, | |||
fp16=False, | |||
with_wcd=False, | |||
wrs_M=400, | |||
wrs_q=0.9) | |||
return model |
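if __name__ == '__main__':
    # minimal smoke-test sketch: the fixed 252-layer IR configuration stacks
    # 6 + 21 + 66 + 6 = 99 BottleneckIR units and maps a 112x112 face crop to
    # a 512-d embedding (112 is halved by the four stride-2 groups to 7)
    net = iresnet286().eval()
    emb = net(torch.randn(1, 3, 112, 112))
    assert emb.shape == (1, 512)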
@@ -0,0 +1,20 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .flc import FacialLandmarkConfidence | |||
else: | |||
_import_structure = {'flc': ['FacialLandmarkConfidence']} | |||
import sys | |||
sys.modules[__name__] = LazyImportModule( | |||
__name__, | |||
globals()['__file__'], | |||
_import_structure, | |||
module_spec=__spec__, | |||
extra_objects={}, | |||
) |
@@ -0,0 +1,2 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .facial_landmark_confidence import FacialLandmarkConfidence |
@@ -0,0 +1,94 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import cv2 | |||
import numpy as np | |||
import torch | |||
import torch.backends.cudnn as cudnn | |||
import torch.nn.functional as F | |||
from PIL import Image | |||
from torch.autograd import Variable | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base import Tensor, TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from .manual_landmark_net import LandmarkConfidence | |||
@MODELS.register_module( | |||
Tasks.facial_landmark_confidence, module_name=Models.flc) | |||
class FacialLandmarkConfidence(TorchModel): | |||
def __init__(self, model_path, device='cuda'): | |||
super().__init__(model_path) | |||
cudnn.benchmark = True | |||
self.model_path = model_path | |||
self.device = device | |||
self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE, | |||
ModelFile.CONFIGURATION) | |||
self.landmark_count = 5 | |||
self.net = LandmarkConfidence(landmark_count=self.landmark_count) | |||
self.load_model() | |||
self.net = self.net.to(device) | |||
def load_model(self, load_to_cpu=False): | |||
pretrained_dict = torch.load( | |||
self.model_path, map_location=torch.device('cpu'))['state_dict'] | |||
pretrained_dict['rp_net.binary_cls.weight'] = 32.0 * F.normalize( | |||
pretrained_dict['rp_net.binary_cls.weight'], dim=1).t() | |||
self.net.load_state_dict(pretrained_dict, strict=True) | |||
self.net.eval() | |||
def forward(self, input): | |||
img_org = input['orig_img'] | |||
bbox = input['bbox'] | |||
img_org = img_org.cpu().numpy() | |||
image_height = img_org.shape[0] | |||
image_width = img_org.shape[1] | |||
x1 = max(0, int(bbox[0])) | |||
y1 = max(0, int(bbox[1])) | |||
x2 = min(image_width, int(bbox[2])) | |||
y2 = min(image_height, int(bbox[3])) | |||
box_w = x2 - x1 + 1 | |||
box_h = y2 - y1 + 1 | |||
if box_h > box_w: | |||
delta = box_h - box_w | |||
dy = edy = 0 | |||
dx = delta // 2 | |||
edx = delta - dx | |||
else: | |||
dx = edx = 0 | |||
delta = box_w - box_h | |||
dy = delta // 2 | |||
edy = delta - dy | |||
cv_img = img_org[y1:y2, x1:x2] | |||
if dx > 0 or dy > 0 or edx > 0 or edy > 0: | |||
            cv_img = cv2.copyMakeBorder(cv_img, dy, edy, dx, edx,
                                        cv2.BORDER_CONSTANT, value=0)
inter_x = cv_img.shape[1] | |||
inter_y = cv_img.shape[0] | |||
cv_img = cv2.resize(cv_img, (120, 120)) | |||
cv_img = cv_img.transpose((2, 0, 1)) | |||
input_blob = torch.from_numpy(cv_img[np.newaxis, :, :, :].astype( | |||
np.float32)) | |||
tmp_conf_lms, tmp_feat, tmp_conf_resp, tmp_nose = self.net( | |||
input_blob.to(self.device)) | |||
conf_lms = tmp_conf_lms.cpu().numpy().squeeze() | |||
feat = tmp_feat.cpu().numpy().squeeze() | |||
pts5pt = [] | |||
for i in range(feat.shape[0]): | |||
if i < self.landmark_count: | |||
pts5pt.append(feat[i] * inter_x - dx + x1) | |||
else: | |||
pts5pt.append(feat[i] * inter_y - dy + y1) | |||
lm5pt = np.array(pts5pt).reshape(2, 5).T | |||
return lm5pt, conf_lms |
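# Usage sketch (hypothetical paths; `orig_img` is an HWC uint8 image tensor
# and `bbox` an [x1, y1, x2, y2] face box, normally supplied by the pipeline):
#
#   model = FacialLandmarkConfidence('/path/to/pytorch_model.pt', device='cpu')
#   lm5pt, conf = model({'orig_img': img_tensor, 'bbox': [40, 40, 200, 200]})
#   # lm5pt: (5, 2) landmarks in original-image coordinates,
#   # conf: landmark confidence score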
@@ -0,0 +1,152 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import math | |||
import torch | |||
import torch.nn.functional as F | |||
from torch.nn import (AdaptiveAvgPool2d, BatchNorm2d, Conv2d, Linear, | |||
MaxPool2d, Module, Parameter, ReLU, Sequential) | |||
class LandmarkConfidence(Module): | |||
def __init__(self, landmark_count=5): | |||
super(LandmarkConfidence, self).__init__() | |||
self.landmark_net = LandmarkNetD(landmark_count) | |||
self.landmark_net.eval() | |||
self.cls_net = ClassNet() | |||
self.cls_net.eval() | |||
self.rp_net = RespiratorNet() | |||
def forward(self, x): | |||
feat, nose_feat, lms = self.landmark_net(x) | |||
cls_respirator, nose = self.rp_net(feat, nose_feat) | |||
confidence = self.cls_net(feat) | |||
return confidence, lms, cls_respirator, nose | |||
class FC(Module): | |||
def __init__(self, feat_dim=256, num_class=2): | |||
super(FC, self).__init__() | |||
self.weight = Parameter( | |||
torch.zeros(num_class, feat_dim, dtype=torch.float32)) | |||
def forward(self, x): | |||
cos_theta = F.linear(x, self.weight) | |||
return F.softmax(cos_theta, dim=1) | |||
class Flatten(Module): | |||
def forward(self, x): | |||
return torch.flatten(x, 1) | |||
class RespiratorNet(Module): | |||
def __init__(self): | |||
super(RespiratorNet, self).__init__() | |||
self.conv1 = Sequential( | |||
Conv2d(48, 48, 3, 2, 1), BatchNorm2d(48), ReLU(True)) | |||
self.conv2 = AdaptiveAvgPool2d( | |||
(1, 1) | |||
) # Sequential(Conv2d(48, 48, 5, 1, 0), BatchNorm2d(48), ReLU(True)) | |||
self.binary_cls = FC(feat_dim=48, num_class=2) | |||
self.nose_layer = Sequential( | |||
Conv2d(48, 64, 3, 1, 0), BatchNorm2d(64), ReLU(True), | |||
Conv2d(64, 64, 3, 1, 0), BatchNorm2d(64), ReLU(True), Flatten(), | |||
Linear(64, 96), ReLU(True), Linear(96, 6)) | |||
def train(self, mode=True): | |||
self.conv1.train(mode) | |||
self.conv2.train(mode) | |||
# self.nose_feat.train(mode) | |||
self.nose_layer.train(mode) | |||
self.binary_cls.train(mode) | |||
def forward(self, x, y): | |||
x = self.conv1(x) | |||
x = self.conv2(x) | |||
cls = self.binary_cls(torch.flatten(x, 1)) | |||
# loc = self.nose_feat(y) | |||
loc = self.nose_layer(y) | |||
return cls, loc | |||
class ClassNet(Module): | |||
def __init__(self): | |||
super(ClassNet, self).__init__() | |||
self.conv1 = Sequential( | |||
Conv2d(48, 48, 3, 1, 1), BatchNorm2d(48), ReLU(True)) | |||
self.conv2 = Sequential( | |||
Conv2d(48, 54, 3, 2, 1), BatchNorm2d(54), ReLU(True)) | |||
self.conv3 = Sequential( | |||
Conv2d(54, 54, 5, 1, 0), BatchNorm2d(54), ReLU(True)) | |||
self.fc1 = Sequential(Flatten(), Linear(54, 54), ReLU(True)) | |||
self.fc2 = Linear(54, 1) | |||
def forward(self, x): | |||
y = self.conv1(x) | |||
y = self.conv2(y) | |||
y = self.conv3(y) | |||
y = self.fc1(y) | |||
y = self.fc2(y) | |||
return y | |||
class LandmarkNetD(Module): | |||
def __init__(self, landmark_count=5): | |||
super(LandmarkNetD, self).__init__() | |||
self.conv_pre = Sequential( | |||
Conv2d(3, 16, 5, 2, 0), BatchNorm2d(16), ReLU(True)) | |||
self.pool_pre = MaxPool2d(2, 2) # output is 29 | |||
self.conv1 = Sequential( | |||
Conv2d(16, 32, 3, 1, 1), BatchNorm2d(32), ReLU(True), | |||
Conv2d(32, 32, 3, 1, 1), BatchNorm2d(32), ReLU(True)) | |||
self.pool1 = MaxPool2d(2, 2) # 14 | |||
self.conv2 = Sequential( | |||
Conv2d(32, 48, 3, 1, 0), BatchNorm2d(48), ReLU(True), | |||
Conv2d(48, 48, 3, 1, 0), BatchNorm2d(48), ReLU(True)) | |||
self.pool2 = MaxPool2d(2, 2) # 5 | |||
self.conv3 = Sequential( | |||
Conv2d(48, 80, 3, 1, 0), BatchNorm2d(80), ReLU(True), | |||
Conv2d(80, 80, 3, 1, 0), BatchNorm2d(80), ReLU(True)) | |||
self.fc1 = Sequential(Linear(80, 128), ReLU(True)) | |||
self.fc2 = Sequential(Linear(128, 128), ReLU(True)) | |||
self.output = Linear(128, landmark_count * 2) | |||
def _initialize_weights(self): | |||
for m in self.modules(): | |||
if isinstance(m, Conv2d): | |||
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels | |||
m.weight.data.normal_(0, math.sqrt(2. / n)) | |||
if m.bias is not None: | |||
m.bias.data.zero_() | |||
elif isinstance(m, BatchNorm2d): | |||
m.weight.data.fill_(1) | |||
m.bias.data.zero_() | |||
elif isinstance(m, Linear): | |||
n = m.weight.size(1) | |||
m.weight.data.normal_(0, 0.01) | |||
m.bias.data.zero_() | |||
def forward(self, x): | |||
y = self.conv_pre(x) | |||
y = self.pool_pre(y) | |||
y = self.conv1(y) | |||
y = self.pool1(y[:, :, :28, :28]) | |||
feat = self.conv2(y) | |||
y2 = self.pool2(feat) | |||
y = self.conv3(y2) | |||
y = torch.flatten(y, 1) | |||
y = self.fc1(y) | |||
y = self.fc2(y) | |||
y = self.output(y) | |||
return feat, y2, y |
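if __name__ == '__main__':
    # shape-trace sketch for the expected 120x120 crop: conv_pre/pool_pre give
    # 29x29, conv1 output is cropped to 28 and pooled to 14, conv2 (two valid
    # 3x3) gives the 10x10 `feat`, pool2 the 5x5 `y2`, and conv3 collapses to
    # 1x1 before the 128-d FC head regresses 2 * landmark_count coordinates
    net = LandmarkNetD(landmark_count=5).eval()
    feat, y2, lms = net(torch.randn(1, 3, 120, 120))
    assert feat.shape == (1, 48, 10, 10)
    assert y2.shape == (1, 48, 5, 5)
    assert lms.shape == (1, 10)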
@@ -0,0 +1,2 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .nextvit import NextViT |
@@ -0,0 +1,541 @@ | |||
# Part of the implementation is borrowed and modified from Next-ViT, | |||
# publicly available at https://github.com/bytedance/Next-ViT | |||
import collections.abc | |||
import itertools | |||
import math | |||
import os | |||
import warnings | |||
from functools import partial | |||
from typing import Dict, Sequence | |||
import torch | |||
import torch.nn as nn | |||
from einops import rearrange | |||
from mmcls.models.backbones.base_backbone import BaseBackbone | |||
from mmcls.models.builder import BACKBONES | |||
from mmcv.cnn.bricks import DropPath, build_activation_layer, build_norm_layer | |||
from mmcv.runner import BaseModule | |||
from torch.nn.modules.batchnorm import _BatchNorm | |||
NORM_EPS = 1e-5 | |||
def _no_grad_trunc_normal_(tensor, mean, std, a, b): | |||
# Cut & paste from PyTorch official master until it's in a few official releases - RW | |||
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf | |||
def norm_cdf(x): | |||
# Computes standard normal cumulative distribution function | |||
return (1. + math.erf(x / math.sqrt(2.))) / 2. | |||
if (mean < a - 2 * std) or (mean > b + 2 * std): | |||
warnings.warn( | |||
'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' | |||
'The distribution of values may be incorrect.', | |||
stacklevel=2) | |||
with torch.no_grad(): | |||
# Values are generated by using a truncated uniform distribution and | |||
# then using the inverse CDF for the normal distribution. | |||
# Get upper and lower cdf values | |||
ll = norm_cdf((a - mean) / std) | |||
u = norm_cdf((b - mean) / std) | |||
# Uniformly fill tensor with values from [ll, u], then translate to | |||
# [2ll-1, 2u-1]. | |||
tensor.uniform_(2 * ll - 1, 2 * u - 1) | |||
# Use inverse cdf transform for normal distribution to get truncated | |||
# standard normal | |||
tensor.erfinv_() | |||
# Transform to proper mean, std | |||
tensor.mul_(std * math.sqrt(2.)) | |||
tensor.add_(mean) | |||
# Clamp to ensure it's in the proper range | |||
tensor.clamp_(min=a, max=b) | |||
return tensor | |||
def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): | |||
return _no_grad_trunc_normal_(tensor, mean, std, a, b) | |||
class ConvBNReLU(nn.Module): | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
kernel_size, | |||
stride, | |||
groups=1): | |||
super(ConvBNReLU, self).__init__() | |||
self.conv = nn.Conv2d( | |||
in_channels, | |||
out_channels, | |||
kernel_size=kernel_size, | |||
stride=stride, | |||
padding=1, | |||
groups=groups, | |||
bias=False) | |||
self.norm = nn.BatchNorm2d(out_channels, eps=NORM_EPS) | |||
self.act = nn.ReLU(inplace=True) | |||
def forward(self, x): | |||
x = self.conv(x) | |||
x = self.norm(x) | |||
x = self.act(x) | |||
return x | |||
def _make_divisible(v, divisor, min_value=None): | |||
if min_value is None: | |||
min_value = divisor | |||
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) | |||
# Make sure that round down does not go down by more than 10%. | |||
if new_v < 0.9 * v: | |||
new_v += divisor | |||
return new_v | |||
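# e.g. _make_divisible(48, 32) == 64 and _make_divisible(192, 32) == 192:
# round to the nearest multiple of `divisor`, then round up once more if the
# result fell below 90% of the original value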
class PatchEmbed(nn.Module): | |||
def __init__(self, in_channels, out_channels, stride=1): | |||
super(PatchEmbed, self).__init__() | |||
norm_layer = partial(nn.BatchNorm2d, eps=NORM_EPS) | |||
if stride == 2: | |||
self.avgpool = nn.AvgPool2d((2, 2), | |||
stride=2, | |||
ceil_mode=True, | |||
count_include_pad=False) | |||
self.conv = nn.Conv2d( | |||
in_channels, out_channels, kernel_size=1, stride=1, bias=False) | |||
self.norm = norm_layer(out_channels) | |||
elif in_channels != out_channels: | |||
self.avgpool = nn.Identity() | |||
self.conv = nn.Conv2d( | |||
in_channels, out_channels, kernel_size=1, stride=1, bias=False) | |||
self.norm = norm_layer(out_channels) | |||
else: | |||
self.avgpool = nn.Identity() | |||
self.conv = nn.Identity() | |||
self.norm = nn.Identity() | |||
def forward(self, x): | |||
return self.norm(self.conv(self.avgpool(x))) | |||
class MHCA(nn.Module): | |||
""" | |||
Multi-Head Convolutional Attention | |||
""" | |||
def __init__(self, out_channels, head_dim): | |||
super(MHCA, self).__init__() | |||
norm_layer = partial(nn.BatchNorm2d, eps=NORM_EPS) | |||
self.group_conv3x3 = nn.Conv2d( | |||
out_channels, | |||
out_channels, | |||
kernel_size=3, | |||
stride=1, | |||
padding=1, | |||
groups=out_channels // head_dim, | |||
bias=False) | |||
self.norm = norm_layer(out_channels) | |||
self.act = nn.ReLU(inplace=True) | |||
self.projection = nn.Conv2d( | |||
out_channels, out_channels, kernel_size=1, bias=False) | |||
def forward(self, x): | |||
out = self.group_conv3x3(x) | |||
out = self.norm(out) | |||
out = self.act(out) | |||
out = self.projection(out) | |||
return out | |||
class Mlp(nn.Module): | |||
def __init__(self, | |||
in_features, | |||
out_features=None, | |||
mlp_ratio=None, | |||
drop=0., | |||
bias=True): | |||
super().__init__() | |||
out_features = out_features or in_features | |||
hidden_dim = _make_divisible(in_features * mlp_ratio, 32) | |||
self.conv1 = nn.Conv2d( | |||
in_features, hidden_dim, kernel_size=1, bias=bias) | |||
self.act = nn.ReLU(inplace=True) | |||
self.conv2 = nn.Conv2d( | |||
hidden_dim, out_features, kernel_size=1, bias=bias) | |||
self.drop = nn.Dropout(drop) | |||
def forward(self, x): | |||
x = self.conv1(x) | |||
x = self.act(x) | |||
x = self.drop(x) | |||
x = self.conv2(x) | |||
x = self.drop(x) | |||
return x | |||
class NCB(nn.Module): | |||
""" | |||
Next Convolution Block | |||
""" | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
stride=1, | |||
path_dropout=0, | |||
drop=0, | |||
head_dim=32, | |||
mlp_ratio=3): | |||
super(NCB, self).__init__() | |||
self.in_channels = in_channels | |||
self.out_channels = out_channels | |||
norm_layer = partial(nn.BatchNorm2d, eps=NORM_EPS) | |||
assert out_channels % head_dim == 0 | |||
self.patch_embed = PatchEmbed(in_channels, out_channels, stride) | |||
self.mhca = MHCA(out_channels, head_dim) | |||
self.attention_path_dropout = DropPath(path_dropout) | |||
self.norm = norm_layer(out_channels) | |||
self.mlp = Mlp(out_channels, mlp_ratio=mlp_ratio, drop=drop, bias=True) | |||
self.mlp_path_dropout = DropPath(path_dropout) | |||
self.is_bn_merged = False | |||
def forward(self, x): | |||
x = self.patch_embed(x) | |||
x = x + self.attention_path_dropout(self.mhca(x)) | |||
if not torch.onnx.is_in_onnx_export() and not self.is_bn_merged: | |||
out = self.norm(x) | |||
else: | |||
out = x | |||
x = x + self.mlp_path_dropout(self.mlp(out)) | |||
return x | |||
class E_MHSA(nn.Module): | |||
""" | |||
Efficient Multi-Head Self Attention | |||
""" | |||
def __init__(self, | |||
dim, | |||
out_dim=None, | |||
head_dim=32, | |||
qkv_bias=True, | |||
qk_scale=None, | |||
attn_drop=0, | |||
proj_drop=0., | |||
sr_ratio=1): | |||
super().__init__() | |||
self.dim = dim | |||
self.out_dim = out_dim if out_dim is not None else dim | |||
self.num_heads = self.dim // head_dim | |||
self.scale = qk_scale or head_dim**-0.5 | |||
self.q = nn.Linear(dim, self.dim, bias=qkv_bias) | |||
self.k = nn.Linear(dim, self.dim, bias=qkv_bias) | |||
self.v = nn.Linear(dim, self.dim, bias=qkv_bias) | |||
self.proj = nn.Linear(self.dim, self.out_dim) | |||
self.attn_drop = nn.Dropout(attn_drop) | |||
self.proj_drop = nn.Dropout(proj_drop) | |||
self.sr_ratio = sr_ratio | |||
self.N_ratio = sr_ratio**2 | |||
if sr_ratio > 1: | |||
self.sr = nn.AvgPool1d( | |||
kernel_size=self.N_ratio, stride=self.N_ratio) | |||
self.norm = nn.BatchNorm1d(dim, eps=NORM_EPS) | |||
self.is_bn_merge = False | |||
def forward(self, x): | |||
B, N, C = x.shape | |||
q = self.q(x) | |||
q = q.reshape(B, N, self.num_heads, | |||
int(C // self.num_heads)).permute(0, 2, 1, 3) | |||
if self.sr_ratio > 1: | |||
x_ = x.transpose(1, 2) | |||
x_ = self.sr(x_) | |||
if not torch.onnx.is_in_onnx_export() and not self.is_bn_merge: | |||
x_ = self.norm(x_) | |||
x_ = x_.transpose(1, 2) | |||
k = self.k(x_) | |||
k = k.reshape(B, -1, self.num_heads, | |||
int(C // self.num_heads)).permute(0, 2, 3, 1) | |||
v = self.v(x_) | |||
v = v.reshape(B, -1, self.num_heads, | |||
int(C // self.num_heads)).permute(0, 2, 1, 3) | |||
else: | |||
k = self.k(x) | |||
k = k.reshape(B, -1, self.num_heads, | |||
int(C // self.num_heads)).permute(0, 2, 3, 1) | |||
v = self.v(x) | |||
v = v.reshape(B, -1, self.num_heads, | |||
int(C // self.num_heads)).permute(0, 2, 1, 3) | |||
attn = (q @ k) * self.scale | |||
attn = attn.softmax(dim=-1) | |||
attn = self.attn_drop(attn) | |||
x = (attn @ v).transpose(1, 2).reshape(B, N, C) | |||
x = self.proj(x) | |||
x = self.proj_drop(x) | |||
return x | |||
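# Sketch: with sr_ratio=2, keys/values are computed on a 4x-pooled token set.
# e.g. E_MHSA(dim=64, head_dim=32, sr_ratio=2).eval() applied to a
# (2, 196, 64) input attends 196 queries over only 49 K/V tokens, cutting the
# attention cost roughly by sr_ratio**2 while the output stays (2, 196, 64).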
class NTB(nn.Module): | |||
""" | |||
Next Transformer Block | |||
""" | |||
def __init__( | |||
self, | |||
in_channels, | |||
out_channels, | |||
path_dropout, | |||
stride=1, | |||
sr_ratio=1, | |||
mlp_ratio=2, | |||
head_dim=32, | |||
mix_block_ratio=0.75, | |||
attn_drop=0, | |||
drop=0, | |||
): | |||
super(NTB, self).__init__() | |||
self.in_channels = in_channels | |||
self.out_channels = out_channels | |||
self.mix_block_ratio = mix_block_ratio | |||
norm_func = partial(nn.BatchNorm2d, eps=NORM_EPS) | |||
self.mhsa_out_channels = _make_divisible( | |||
int(out_channels * mix_block_ratio), 32) | |||
self.mhca_out_channels = out_channels - self.mhsa_out_channels | |||
self.patch_embed = PatchEmbed(in_channels, self.mhsa_out_channels, | |||
stride) | |||
self.norm1 = norm_func(self.mhsa_out_channels) | |||
self.e_mhsa = E_MHSA( | |||
self.mhsa_out_channels, | |||
head_dim=head_dim, | |||
sr_ratio=sr_ratio, | |||
attn_drop=attn_drop, | |||
proj_drop=drop) | |||
self.mhsa_path_dropout = DropPath(path_dropout * mix_block_ratio) | |||
self.projection = PatchEmbed( | |||
self.mhsa_out_channels, self.mhca_out_channels, stride=1) | |||
self.mhca = MHCA(self.mhca_out_channels, head_dim=head_dim) | |||
self.mhca_path_dropout = DropPath(path_dropout * (1 - mix_block_ratio)) | |||
self.norm2 = norm_func(out_channels) | |||
self.mlp = Mlp(out_channels, mlp_ratio=mlp_ratio, drop=drop) | |||
self.mlp_path_dropout = DropPath(path_dropout) | |||
self.is_bn_merged = False | |||
def forward(self, x): | |||
x = self.patch_embed(x) | |||
B, C, H, W = x.shape | |||
if not torch.onnx.is_in_onnx_export() and not self.is_bn_merged: | |||
out = self.norm1(x) | |||
else: | |||
out = x | |||
out = rearrange(out, 'b c h w -> b (h w) c') # b n c | |||
out = self.mhsa_path_dropout(self.e_mhsa(out)) | |||
x = x + rearrange(out, 'b (h w) c -> b c h w', h=H) | |||
out = self.projection(x) | |||
out = out + self.mhca_path_dropout(self.mhca(out)) | |||
x = torch.cat([x, out], dim=1) | |||
if not torch.onnx.is_in_onnx_export() and not self.is_bn_merged: | |||
out = self.norm2(x) | |||
else: | |||
out = x | |||
x = x + self.mlp_path_dropout(self.mlp(out)) | |||
return x | |||
@BACKBONES.register_module() | |||
class NextViT(BaseBackbone): | |||
stem_chs = { | |||
'x_small': [64, 32, 64], | |||
'small': [64, 32, 64], | |||
'base': [64, 32, 64], | |||
'large': [64, 32, 64], | |||
} | |||
depths = { | |||
'x_small': [1, 1, 5, 1], | |||
'small': [3, 4, 10, 3], | |||
'base': [3, 4, 20, 3], | |||
'large': [3, 4, 30, 3], | |||
} | |||
def __init__(self, | |||
arch='small', | |||
path_dropout=0.2, | |||
attn_drop=0, | |||
drop=0, | |||
strides=[1, 2, 2, 2], | |||
sr_ratios=[8, 4, 2, 1], | |||
head_dim=32, | |||
mix_block_ratio=0.75, | |||
resume='', | |||
with_extra_norm=True, | |||
norm_eval=False, | |||
norm_cfg=None, | |||
out_indices=-1, | |||
frozen_stages=-1, | |||
init_cfg=None): | |||
super().__init__(init_cfg=init_cfg) | |||
stem_chs = self.stem_chs[arch] | |||
depths = self.depths[arch] | |||
self.frozen_stages = frozen_stages | |||
self.with_extra_norm = with_extra_norm | |||
self.norm_eval = norm_eval | |||
self.stage1_out_channels = [96] * (depths[0]) | |||
self.stage2_out_channels = [192] * (depths[1] - 1) + [256] | |||
self.stage3_out_channels = [384, 384, 384, 384, 512] * (depths[2] // 5) | |||
self.stage4_out_channels = [768] * (depths[3] - 1) + [1024] | |||
self.stage_out_channels = [ | |||
self.stage1_out_channels, self.stage2_out_channels, | |||
self.stage3_out_channels, self.stage4_out_channels | |||
] | |||
# Next Hybrid Strategy | |||
self.stage1_block_types = [NCB] * depths[0] | |||
self.stage2_block_types = [NCB] * (depths[1] - 1) + [NTB] | |||
self.stage3_block_types = [NCB, NCB, NCB, NCB, NTB] * (depths[2] // 5) | |||
self.stage4_block_types = [NCB] * (depths[3] - 1) + [NTB] | |||
self.stage_block_types = [ | |||
self.stage1_block_types, self.stage2_block_types, | |||
self.stage3_block_types, self.stage4_block_types | |||
] | |||
self.stem = nn.Sequential( | |||
ConvBNReLU(3, stem_chs[0], kernel_size=3, stride=2), | |||
ConvBNReLU(stem_chs[0], stem_chs[1], kernel_size=3, stride=1), | |||
ConvBNReLU(stem_chs[1], stem_chs[2], kernel_size=3, stride=1), | |||
ConvBNReLU(stem_chs[2], stem_chs[2], kernel_size=3, stride=2), | |||
) | |||
input_channel = stem_chs[-1] | |||
features = [] | |||
idx = 0 | |||
dpr = [x.item() for x in torch.linspace(0, path_dropout, sum(depths)) | |||
] # stochastic depth decay rule | |||
for stage_id in range(len(depths)): | |||
numrepeat = depths[stage_id] | |||
output_channels = self.stage_out_channels[stage_id] | |||
block_types = self.stage_block_types[stage_id] | |||
for block_id in range(numrepeat): | |||
if strides[stage_id] == 2 and block_id == 0: | |||
stride = 2 | |||
else: | |||
stride = 1 | |||
output_channel = output_channels[block_id] | |||
block_type = block_types[block_id] | |||
if block_type is NCB: | |||
layer = NCB( | |||
input_channel, | |||
output_channel, | |||
stride=stride, | |||
path_dropout=dpr[idx + block_id], | |||
drop=drop, | |||
head_dim=head_dim) | |||
features.append(layer) | |||
elif block_type is NTB: | |||
layer = NTB( | |||
input_channel, | |||
output_channel, | |||
path_dropout=dpr[idx + block_id], | |||
stride=stride, | |||
sr_ratio=sr_ratios[stage_id], | |||
head_dim=head_dim, | |||
mix_block_ratio=mix_block_ratio, | |||
attn_drop=attn_drop, | |||
drop=drop) | |||
features.append(layer) | |||
input_channel = output_channel | |||
idx += numrepeat | |||
self.features = nn.Sequential(*features) | |||
self.norm = nn.BatchNorm2d(output_channel, eps=NORM_EPS) | |||
if isinstance(out_indices, int): | |||
out_indices = [out_indices] | |||
        assert isinstance(out_indices, Sequence), \
            f'"out_indices" must be a sequence or int, ' \
            f'got {type(out_indices)} instead.'
for i, index in enumerate(out_indices): | |||
if index < 0: | |||
out_indices[i] = sum(depths) + index | |||
assert out_indices[i] >= 0, f'Invalid out_indices {index}' | |||
self.stage_out_idx = out_indices | |||
if norm_cfg is not None: | |||
self = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self) | |||
def init_weights(self): | |||
super(NextViT, self).init_weights() | |||
if (isinstance(self.init_cfg, dict) | |||
and self.init_cfg['type'] == 'Pretrained'): | |||
# Suppress default init if use pretrained model. | |||
return | |||
self._initialize_weights() | |||
def _initialize_weights(self): | |||
for n, m in self.named_modules(): | |||
if isinstance(m, (nn.BatchNorm2d, | |||
nn.BatchNorm1d)): # nn.GroupNorm, nn.LayerNorm, | |||
nn.init.constant_(m.weight, 1.0) | |||
nn.init.constant_(m.bias, 0) | |||
elif isinstance(m, nn.Linear): | |||
trunc_normal_(m.weight, std=.02) | |||
if hasattr(m, 'bias') and m.bias is not None: | |||
nn.init.constant_(m.bias, 0) | |||
elif isinstance(m, nn.Conv2d): | |||
trunc_normal_(m.weight, std=.02) | |||
if hasattr(m, 'bias') and m.bias is not None: | |||
nn.init.constant_(m.bias, 0) | |||
def forward(self, x): | |||
outputs = list() | |||
x = self.stem(x) | |||
stage_id = 0 | |||
for idx, layer in enumerate(self.features): | |||
x = layer(x) | |||
if idx == self.stage_out_idx[stage_id]: | |||
if self.with_extra_norm: | |||
x = self.norm(x) | |||
outputs.append(x) | |||
stage_id += 1 | |||
return tuple(outputs) | |||
def _freeze_stages(self): | |||
if self.frozen_stages > 0: | |||
self.stem.eval() | |||
for param in self.stem.parameters(): | |||
param.requires_grad = False | |||
for idx, layer in enumerate(self.features): | |||
if idx <= self.stage_out_idx[self.frozen_stages - 1]: | |||
layer.eval() | |||
for param in layer.parameters(): | |||
param.requires_grad = False | |||
def train(self, mode=True): | |||
super(NextViT, self).train(mode) | |||
self._freeze_stages() | |||
if mode and self.norm_eval: | |||
for m in self.modules(): | |||
                # trick: eval has effect on BatchNorm only
if isinstance(m, _BatchNorm): | |||
m.eval() |
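if __name__ == '__main__':
    # minimal smoke-test sketch, assuming the mmcls/mmcv modules imported
    # above are available: the default out_indices=-1 returns only the last
    # stage, a 1024-channel map at 1/32 resolution for the 'small' arch
    model = NextViT(arch='small')
    model.eval()
    feats = model(torch.randn(1, 3, 224, 224))
    assert feats[0].shape == (1, 1024, 7, 7)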
@@ -1,9 +1,10 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base.base_torch_model import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.constant import Tasks | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
@MODELS.register_module( | |||
@@ -13,16 +14,25 @@ class ClassificationModel(TorchModel): | |||
def __init__(self, model_dir: str, **kwargs): | |||
import mmcv | |||
from mmcls.models import build_classifier | |||
import modelscope.models.cv.image_classification.backbones | |||
from modelscope.utils.hub import read_config | |||
super().__init__(model_dir) | |||
mm_config = os.path.join(model_dir, 'config.py') | |||
if os.path.exists(mm_config): | |||
cfg = mmcv.Config.fromfile(mm_config) | |||
cfg.model.pretrained = None | |||
self.cls_model = build_classifier(cfg.model) | |||
self.config_type = 'mmcv_config' | |||
else: | |||
cfg = read_config(model_dir) | |||
cfg.model.mm_model.pretrained = None | |||
self.cls_model = build_classifier(cfg.model.mm_model) | |||
self.config_type = 'ms_config' | |||
self.cfg = cfg | |||
self.ms_model_dir = model_dir | |||
self.load_pretrained_checkpoint() | |||
@@ -33,7 +43,13 @@ class ClassificationModel(TorchModel): | |||
def load_pretrained_checkpoint(self): | |||
import mmcv | |||
if os.path.exists( | |||
os.path.join(self.ms_model_dir, ModelFile.TORCH_MODEL_FILE)): | |||
checkpoint_path = os.path.join(self.ms_model_dir, | |||
ModelFile.TORCH_MODEL_FILE) | |||
else: | |||
checkpoint_path = os.path.join(self.ms_model_dir, | |||
'checkpoints.pth') | |||
if os.path.exists(checkpoint_path): | |||
checkpoint = mmcv.runner.load_checkpoint( | |||
self.cls_model, checkpoint_path, map_location='cpu') | |||
@@ -0,0 +1,100 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os.path as osp | |||
import numpy as np | |||
from mmcls.datasets.base_dataset import BaseDataset | |||
def get_trained_checkpoints_name(work_path):
    import os
    file_list = os.listdir(work_path)
    last = 0
    model_name = None
    # prefer the best-metric checkpoint, e.g. best_xxx_12.pth
    for f_name in file_list:
        if 'best_' in f_name and f_name.endswith('.pth'):
            best_epoch = f_name.replace('.pth', '').split('_')[-1]
            if best_epoch.isdigit():
                last = int(best_epoch)
                model_name = f_name
    if model_name is not None:
        return model_name
    # otherwise fall back to the latest epoch checkpoint, e.g. epoch_12.pth
    for f_name in file_list:
        if 'epoch_' in f_name and f_name.endswith('.pth'):
            epoch_num = f_name.replace('epoch_', '').replace('.pth', '')
            if not epoch_num.isdigit():
                continue
            ind = int(epoch_num)
            if ind > last:
                last = ind
                model_name = f_name
    return model_name
def preprocess_transform(cfgs): | |||
if cfgs is None: | |||
return None | |||
for i, cfg in enumerate(cfgs): | |||
if cfg.type == 'Resize': | |||
if isinstance(cfg.size, list): | |||
cfgs[i].size = tuple(cfg.size) | |||
return cfgs | |||
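# e.g. a config entry dict(type='Resize', size=[224, 256]) becomes
# size=(224, 256): mmcv config files deserialize tuples as lists, while the
# mmcls Resize pipeline expects a tuple for fixed (h, w) sizes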
def get_ms_dataset_root(ms_dataset): | |||
if ms_dataset is None or len(ms_dataset) < 1: | |||
return None | |||
try: | |||
data_root = ms_dataset[0]['image:FILE'].split('extracted')[0] | |||
path_post = ms_dataset[0]['image:FILE'].split('extracted')[1].split( | |||
'/') | |||
extracted_data_root = osp.join(data_root, 'extracted', path_post[1], | |||
path_post[2]) | |||
return extracted_data_root | |||
    except Exception as e:
        raise ValueError(f'Dataset Error: {e}')
def get_classes(classes=None): | |||
import mmcv | |||
if isinstance(classes, str): | |||
# take it as a file path | |||
class_names = mmcv.list_from_file(classes) | |||
elif isinstance(classes, (tuple, list)): | |||
class_names = classes | |||
else: | |||
raise ValueError(f'Unsupported type {type(classes)} of classes.') | |||
return class_names | |||
class MmDataset(BaseDataset): | |||
def __init__(self, ms_dataset, pipeline, classes=None, test_mode=False): | |||
self.ms_dataset = ms_dataset | |||
if len(self.ms_dataset) < 1: | |||
raise ValueError('Dataset Error: dataset is empty') | |||
super(MmDataset, self).__init__( | |||
data_prefix='', | |||
pipeline=pipeline, | |||
classes=classes, | |||
test_mode=test_mode) | |||
def load_annotations(self): | |||
        if self.CLASSES is None:
            raise ValueError(
                f'Dataset Error: classesname.txt not found: {self.CLASSES}')
data_infos = [] | |||
for data_info in self.ms_dataset: | |||
filename = data_info['image:FILE'] | |||
gt_label = data_info['category'] | |||
info = {'img_prefix': self.data_prefix} | |||
info['img_info'] = {'filename': filename} | |||
info['gt_label'] = np.array(gt_label, dtype=np.int64) | |||
data_infos.append(info) | |||
return data_infos |
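# Usage sketch (hypothetical records; real ones come from an MsDataset dump):
#
#   records = [{'image:FILE': '/cache/extracted/xxx/train/cat/0.jpg',
#               'category': 0}]
#   ds = MmDataset(records, pipeline=[], classes=['cat', 'dog'])
#   ds.data_infos[0]['gt_label']   # -> array(0, dtype=int64)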
@@ -0,0 +1 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. |
@@ -0,0 +1 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. |
@@ -0,0 +1,215 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from .newcrf_layers import NewCRF | |||
from .swin_transformer import SwinTransformer | |||
from .uper_crf_head import PSP | |||
class NewCRFDepth(nn.Module): | |||
""" | |||
Depth network based on neural window FC-CRFs architecture. | |||
""" | |||
def __init__(self, | |||
version=None, | |||
inv_depth=False, | |||
pretrained=None, | |||
frozen_stages=-1, | |||
min_depth=0.1, | |||
max_depth=100.0, | |||
**kwargs): | |||
super().__init__() | |||
self.inv_depth = inv_depth | |||
self.with_auxiliary_head = False | |||
self.with_neck = False | |||
norm_cfg = dict(type='BN', requires_grad=True) | |||
# norm_cfg = dict(type='GN', requires_grad=True, num_groups=8) | |||
window_size = int(version[-2:]) | |||
if version[:-2] == 'base': | |||
embed_dim = 128 | |||
depths = [2, 2, 18, 2] | |||
num_heads = [4, 8, 16, 32] | |||
in_channels = [128, 256, 512, 1024] | |||
elif version[:-2] == 'large': | |||
embed_dim = 192 | |||
depths = [2, 2, 18, 2] | |||
num_heads = [6, 12, 24, 48] | |||
in_channels = [192, 384, 768, 1536] | |||
        elif version[:-2] == 'tiny':
            embed_dim = 96
            depths = [2, 2, 6, 2]
            num_heads = [3, 6, 12, 24]
            in_channels = [96, 192, 384, 768]
        else:
            raise ValueError(
                f'Unsupported version: {version}, expected one of '
                "'tiny', 'base' or 'large' plus a two-digit window size")
backbone_cfg = dict( | |||
embed_dim=embed_dim, | |||
depths=depths, | |||
num_heads=num_heads, | |||
window_size=window_size, | |||
ape=False, | |||
drop_path_rate=0.3, | |||
patch_norm=True, | |||
use_checkpoint=False, | |||
frozen_stages=frozen_stages) | |||
embed_dim = 512 | |||
decoder_cfg = dict( | |||
in_channels=in_channels, | |||
in_index=[0, 1, 2, 3], | |||
pool_scales=(1, 2, 3, 6), | |||
channels=embed_dim, | |||
dropout_ratio=0.0, | |||
num_classes=32, | |||
norm_cfg=norm_cfg, | |||
align_corners=False) | |||
self.backbone = SwinTransformer(**backbone_cfg) | |||
# v_dim = decoder_cfg['num_classes'] * 4 | |||
win = 7 | |||
crf_dims = [128, 256, 512, 1024] | |||
v_dims = [64, 128, 256, embed_dim] | |||
self.crf3 = NewCRF( | |||
input_dim=in_channels[3], | |||
embed_dim=crf_dims[3], | |||
window_size=win, | |||
v_dim=v_dims[3], | |||
num_heads=32) | |||
self.crf2 = NewCRF( | |||
input_dim=in_channels[2], | |||
embed_dim=crf_dims[2], | |||
window_size=win, | |||
v_dim=v_dims[2], | |||
num_heads=16) | |||
self.crf1 = NewCRF( | |||
input_dim=in_channels[1], | |||
embed_dim=crf_dims[1], | |||
window_size=win, | |||
v_dim=v_dims[1], | |||
num_heads=8) | |||
self.crf0 = NewCRF( | |||
input_dim=in_channels[0], | |||
embed_dim=crf_dims[0], | |||
window_size=win, | |||
v_dim=v_dims[0], | |||
num_heads=4) | |||
self.decoder = PSP(**decoder_cfg) | |||
self.disp_head1 = DispHead(input_dim=crf_dims[0]) | |||
self.up_mode = 'bilinear' | |||
if self.up_mode == 'mask': | |||
self.mask_head = nn.Sequential( | |||
nn.Conv2d(crf_dims[0], 64, 3, padding=1), | |||
nn.ReLU(inplace=True), nn.Conv2d(64, 16 * 9, 1, padding=0)) | |||
self.min_depth = min_depth | |||
self.max_depth = max_depth | |||
self.init_weights(pretrained=pretrained) | |||
def init_weights(self, pretrained=None): | |||
"""Initialize the weights in backbone and heads. | |||
Args: | |||
pretrained (str, optional): Path to pre-trained weights. | |||
Defaults to None. | |||
""" | |||
# print(f'== Load encoder backbone from: {pretrained}') | |||
self.backbone.init_weights(pretrained=pretrained) | |||
self.decoder.init_weights() | |||
if self.with_auxiliary_head: | |||
if isinstance(self.auxiliary_head, nn.ModuleList): | |||
for aux_head in self.auxiliary_head: | |||
aux_head.init_weights() | |||
else: | |||
self.auxiliary_head.init_weights() | |||
def upsample_mask(self, disp, mask): | |||
""" Upsample disp [H/4, W/4, 1] -> [H, W, 1] using convex combination """ | |||
N, _, H, W = disp.shape | |||
mask = mask.view(N, 1, 9, 4, 4, H, W) | |||
mask = torch.softmax(mask, dim=2) | |||
up_disp = F.unfold(disp, kernel_size=3, padding=1) | |||
up_disp = up_disp.view(N, 1, 9, 1, 1, H, W) | |||
up_disp = torch.sum(mask * up_disp, dim=2) | |||
up_disp = up_disp.permute(0, 1, 4, 2, 5, 3) | |||
return up_disp.reshape(N, 1, 4 * H, 4 * W) | |||
def forward(self, imgs): | |||
feats = self.backbone(imgs) | |||
if self.with_neck: | |||
feats = self.neck(feats) | |||
ppm_out = self.decoder(feats) | |||
e3 = self.crf3(feats[3], ppm_out) | |||
e3 = nn.PixelShuffle(2)(e3) | |||
e2 = self.crf2(feats[2], e3) | |||
e2 = nn.PixelShuffle(2)(e2) | |||
e1 = self.crf1(feats[1], e2) | |||
e1 = nn.PixelShuffle(2)(e1) | |||
e0 = self.crf0(feats[0], e1) | |||
if self.up_mode == 'mask': | |||
mask = self.mask_head(e0) | |||
d1 = self.disp_head1(e0, 1) | |||
d1 = self.upsample_mask(d1, mask) | |||
else: | |||
d1 = self.disp_head1(e0, 4) | |||
depth = d1 * self.max_depth | |||
return depth | |||
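    # decoder resolution trace for an H x W input: backbone features arrive at
    # [H/4, H/8, H/16, H/32]; PSP summarizes the H/32 level; each NewCRF stage
    # refines one level and PixelShuffle(2) lifts it to the next, ending at
    # H/4; DispHead then predicts a sigmoid disparity and upsamples x4 (or
    # uses convex 'mask' upsampling), scaled by max_depth to metric depth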
class DispHead(nn.Module): | |||
def __init__(self, input_dim=100): | |||
super(DispHead, self).__init__() | |||
# self.norm1 = nn.BatchNorm2d(input_dim) | |||
self.conv1 = nn.Conv2d(input_dim, 1, 3, padding=1) | |||
# self.relu = nn.ReLU(inplace=True) | |||
self.sigmoid = nn.Sigmoid() | |||
def forward(self, x, scale): | |||
# x = self.relu(self.norm1(x)) | |||
x = self.sigmoid(self.conv1(x)) | |||
if scale > 1: | |||
x = upsample(x, scale_factor=scale) | |||
return x | |||
class DispUnpack(nn.Module): | |||
def __init__(self, input_dim=100, hidden_dim=128): | |||
super(DispUnpack, self).__init__() | |||
self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) | |||
self.conv2 = nn.Conv2d(hidden_dim, 16, 3, padding=1) | |||
self.relu = nn.ReLU(inplace=True) | |||
self.sigmoid = nn.Sigmoid() | |||
self.pixel_shuffle = nn.PixelShuffle(4) | |||
def forward(self, x, output_size): | |||
x = self.relu(self.conv1(x)) | |||
x = self.sigmoid(self.conv2(x)) # [b, 16, h/4, w/4] | |||
# x = torch.reshape(x, [x.shape[0], 1, x.shape[2]*4, x.shape[3]*4]) | |||
x = self.pixel_shuffle(x) | |||
return x | |||
def upsample(x, scale_factor=2, mode='bilinear', align_corners=False):
    """Upsample input tensor by the given scale factor (default: 2).
    """
return F.interpolate( | |||
x, scale_factor=scale_factor, mode=mode, align_corners=align_corners) |
@@ -0,0 +1,504 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import torch.utils.checkpoint as checkpoint | |||
from timm.models.layers import DropPath, to_2tuple, trunc_normal_ | |||
class Mlp(nn.Module): | |||
""" Multilayer perceptron.""" | |||
def __init__(self, | |||
in_features, | |||
hidden_features=None, | |||
out_features=None, | |||
act_layer=nn.GELU, | |||
drop=0.): | |||
super().__init__() | |||
out_features = out_features or in_features | |||
hidden_features = hidden_features or in_features | |||
self.fc1 = nn.Linear(in_features, hidden_features) | |||
self.act = act_layer() | |||
self.fc2 = nn.Linear(hidden_features, out_features) | |||
self.drop = nn.Dropout(drop) | |||
def forward(self, x): | |||
x = self.fc1(x) | |||
x = self.act(x) | |||
x = self.drop(x) | |||
x = self.fc2(x) | |||
x = self.drop(x) | |||
return x | |||
def window_partition(x, window_size): | |||
""" | |||
Args: | |||
x: (B, H, W, C) | |||
window_size (int): window size | |||
Returns: | |||
windows: (num_windows*B, window_size, window_size, C) | |||
""" | |||
B, H, W, C = x.shape | |||
x = x.view(B, H // window_size, window_size, W // window_size, window_size, | |||
C) | |||
windows = x.permute(0, 1, 3, 2, 4, | |||
5).contiguous().view(-1, window_size, window_size, C) | |||
return windows | |||
def window_reverse(windows, window_size, H, W): | |||
""" | |||
Args: | |||
windows: (num_windows*B, window_size, window_size, C) | |||
window_size (int): Window size | |||
H (int): Height of image | |||
W (int): Width of image | |||
Returns: | |||
x: (B, H, W, C) | |||
""" | |||
B = int(windows.shape[0] / (H * W / window_size / window_size)) | |||
x = windows.view(B, H // window_size, W // window_size, window_size, | |||
window_size, -1) | |||
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) | |||
return x | |||
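if __name__ == '__main__':
    # sketch: window_partition and window_reverse are exact inverses when H
    # and W are multiples of the window size
    x = torch.randn(2, 14, 14, 32)   # (B, H, W, C)
    w = window_partition(x, 7)       # -> (8, 7, 7, 32): four windows per image
    assert torch.equal(window_reverse(w, 7, 14, 14), x)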
class WindowAttention(nn.Module): | |||
""" Window based multi-head self attention (W-MSA) module with relative position bias. | |||
It supports both of shifted and non-shifted window. | |||
Args: | |||
dim (int): Number of input channels. | |||
window_size (tuple[int]): The height and width of the window. | |||
num_heads (int): Number of attention heads. | |||
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set | |||
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 | |||
proj_drop (float, optional): Dropout ratio of output. Default: 0.0 | |||
""" | |||
def __init__(self, | |||
dim, | |||
window_size, | |||
num_heads, | |||
v_dim, | |||
qkv_bias=True, | |||
qk_scale=None, | |||
attn_drop=0., | |||
proj_drop=0.): | |||
super().__init__() | |||
self.dim = dim | |||
self.window_size = window_size # Wh, Ww | |||
self.num_heads = num_heads | |||
head_dim = dim // num_heads | |||
self.scale = qk_scale or head_dim**-0.5 | |||
# define a parameter table of relative position bias | |||
self.relative_position_bias_table = nn.Parameter( | |||
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), | |||
num_heads)) # 2*Wh-1 * 2*Ww-1, nH | |||
# get pair-wise relative position index for each token inside the window | |||
coords_h = torch.arange(self.window_size[0]) | |||
coords_w = torch.arange(self.window_size[1]) | |||
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww | |||
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww | |||
relative_coords = coords_flatten[:, :, | |||
None] - coords_flatten[:, | |||
None, :] # 2, Wh*Ww, Wh*Ww | |||
relative_coords = relative_coords.permute( | |||
1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 | |||
relative_coords[:, :, | |||
0] += self.window_size[0] - 1 # shift to start from 0 | |||
relative_coords[:, :, 1] += self.window_size[1] - 1 | |||
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 | |||
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww | |||
self.register_buffer('relative_position_index', | |||
relative_position_index) | |||
self.qk = nn.Linear(dim, dim * 2, bias=qkv_bias) | |||
self.attn_drop = nn.Dropout(attn_drop) | |||
self.proj = nn.Linear(v_dim, v_dim) | |||
self.proj_drop = nn.Dropout(proj_drop) | |||
trunc_normal_(self.relative_position_bias_table, std=.02) | |||
self.softmax = nn.Softmax(dim=-1) | |||
def forward(self, x, v, mask=None): | |||
""" Forward function. | |||
Args: | |||
x: input features with shape of (num_windows*B, N, C) | |||
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None | |||
""" | |||
B_, N, C = x.shape | |||
qk = self.qk(x).reshape(B_, N, 2, self.num_heads, | |||
C // self.num_heads).permute(2, 0, 3, 1, 4) | |||
q, k = qk[0], qk[ | |||
1] # make torchscript happy (cannot use tensor as tuple) | |||
q = q * self.scale | |||
attn = (q @ k.transpose(-2, -1)) | |||
relative_position_bias = self.relative_position_bias_table[ | |||
self.relative_position_index.view(-1)].view( | |||
self.window_size[0] * self.window_size[1], | |||
self.window_size[0] * self.window_size[1], | |||
-1) # Wh*Ww,Wh*Ww,nH | |||
relative_position_bias = relative_position_bias.permute( | |||
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww | |||
attn = attn + relative_position_bias.unsqueeze(0) | |||
if mask is not None: | |||
nW = mask.shape[0] | |||
attn = attn.view(B_ // nW, nW, self.num_heads, N, | |||
N) + mask.unsqueeze(1).unsqueeze(0) | |||
attn = attn.view(-1, self.num_heads, N, N) | |||
attn = self.softmax(attn) | |||
else: | |||
attn = self.softmax(attn) | |||
attn = self.attn_drop(attn) | |||
# assert self.dim % v.shape[-1] == 0, "self.dim % v.shape[-1] != 0" | |||
# repeat_num = self.dim // v.shape[-1] | |||
# v = v.view(B_, N, self.num_heads // repeat_num, -1).transpose(1, 2).repeat(1, repeat_num, 1, 1) | |||
assert self.dim == v.shape[-1], 'self.dim != v.shape[-1]' | |||
v = v.view(B_, N, self.num_heads, -1).transpose(1, 2) | |||
x = (attn @ v).transpose(1, 2).reshape(B_, N, C) | |||
x = self.proj(x) | |||
x = self.proj_drop(x) | |||
return x | |||
class CRFBlock(nn.Module): | |||
""" CRF Block. | |||
Args: | |||
dim (int): Number of input channels. | |||
num_heads (int): Number of attention heads. | |||
window_size (int): Window size. | |||
shift_size (int): Shift size for SW-MSA. | |||
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. | |||
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. | |||
drop (float, optional): Dropout rate. Default: 0.0 | |||
attn_drop (float, optional): Attention dropout rate. Default: 0.0 | |||
drop_path (float, optional): Stochastic depth rate. Default: 0.0 | |||
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU | |||
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
""" | |||
def __init__(self, | |||
dim, | |||
num_heads, | |||
v_dim, | |||
window_size=7, | |||
shift_size=0, | |||
mlp_ratio=4., | |||
qkv_bias=True, | |||
qk_scale=None, | |||
drop=0., | |||
attn_drop=0., | |||
drop_path=0., | |||
act_layer=nn.GELU, | |||
norm_layer=nn.LayerNorm): | |||
super().__init__() | |||
self.dim = dim | |||
self.num_heads = num_heads | |||
self.v_dim = v_dim | |||
self.window_size = window_size | |||
self.shift_size = shift_size | |||
self.mlp_ratio = mlp_ratio | |||
        assert 0 <= self.shift_size < self.window_size, 'shift_size must be in [0, window_size)'
self.norm1 = norm_layer(dim) | |||
self.attn = WindowAttention( | |||
dim, | |||
window_size=to_2tuple(self.window_size), | |||
num_heads=num_heads, | |||
v_dim=v_dim, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
attn_drop=attn_drop, | |||
proj_drop=drop) | |||
self.drop_path = DropPath( | |||
drop_path) if drop_path > 0. else nn.Identity() | |||
self.norm2 = norm_layer(v_dim) | |||
mlp_hidden_dim = int(v_dim * mlp_ratio) | |||
self.mlp = Mlp( | |||
in_features=v_dim, | |||
hidden_features=mlp_hidden_dim, | |||
act_layer=act_layer, | |||
drop=drop) | |||
self.H = None | |||
self.W = None | |||
def forward(self, x, v, mask_matrix): | |||
""" Forward function. | |||
        Args:
            x: Input feature, tensor size (B, H*W, C).
            v: Value feature, tensor size (B, H, W, C).
            H, W: Spatial resolution of the input feature.
            mask_matrix: Attention mask for cyclic shift.
""" | |||
B, L, C = x.shape | |||
H, W = self.H, self.W | |||
assert L == H * W, 'input feature has wrong size' | |||
shortcut = x | |||
x = self.norm1(x) | |||
x = x.view(B, H, W, C) | |||
# pad feature maps to multiples of window size | |||
pad_l = pad_t = 0 | |||
pad_r = (self.window_size - W % self.window_size) % self.window_size | |||
pad_b = (self.window_size - H % self.window_size) % self.window_size | |||
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) | |||
v = F.pad(v, (0, 0, pad_l, pad_r, pad_t, pad_b)) | |||
_, Hp, Wp, _ = x.shape | |||
# cyclic shift | |||
if self.shift_size > 0: | |||
shifted_x = torch.roll( | |||
x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) | |||
shifted_v = torch.roll( | |||
v, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) | |||
attn_mask = mask_matrix | |||
else: | |||
shifted_x = x | |||
shifted_v = v | |||
attn_mask = None | |||
# partition windows | |||
x_windows = window_partition( | |||
shifted_x, self.window_size) # nW*B, window_size, window_size, C | |||
x_windows = x_windows.view(-1, self.window_size * self.window_size, | |||
C) # nW*B, window_size*window_size, C | |||
v_windows = window_partition( | |||
shifted_v, self.window_size) # nW*B, window_size, window_size, C | |||
v_windows = v_windows.view( | |||
-1, self.window_size * self.window_size, | |||
v_windows.shape[-1]) # nW*B, window_size*window_size, C | |||
# W-MSA/SW-MSA | |||
attn_windows = self.attn( | |||
x_windows, v_windows, | |||
mask=attn_mask) # nW*B, window_size*window_size, C | |||
# merge windows | |||
attn_windows = attn_windows.view(-1, self.window_size, | |||
self.window_size, self.v_dim) | |||
shifted_x = window_reverse(attn_windows, self.window_size, Hp, | |||
Wp) # B H' W' C | |||
# reverse cyclic shift | |||
if self.shift_size > 0: | |||
x = torch.roll( | |||
shifted_x, | |||
shifts=(self.shift_size, self.shift_size), | |||
dims=(1, 2)) | |||
else: | |||
x = shifted_x | |||
if pad_r > 0 or pad_b > 0: | |||
x = x[:, :H, :W, :].contiguous() | |||
x = x.view(B, H * W, self.v_dim) | |||
# FFN | |||
x = shortcut + self.drop_path(x) | |||
x = x + self.drop_path(self.mlp(self.norm2(x))) | |||
return x | |||
class BasicCRFLayer(nn.Module): | |||
""" A basic NeWCRFs layer for one stage. | |||
Args: | |||
dim (int): Number of feature channels | |||
depth (int): Depths of this stage. | |||
        num_heads (int): Number of attention heads.
        v_dim (int): Number of channels of the value (v) branch.
        window_size (int): Local window size. Default: 7.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. | |||
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. | |||
drop (float, optional): Dropout rate. Default: 0.0 | |||
attn_drop (float, optional): Attention dropout rate. Default: 0.0 | |||
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 | |||
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None | |||
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. | |||
""" | |||
def __init__(self, | |||
dim, | |||
depth, | |||
num_heads, | |||
v_dim, | |||
window_size=7, | |||
mlp_ratio=4., | |||
qkv_bias=True, | |||
qk_scale=None, | |||
drop=0., | |||
attn_drop=0., | |||
drop_path=0., | |||
norm_layer=nn.LayerNorm, | |||
downsample=None, | |||
use_checkpoint=False): | |||
super().__init__() | |||
self.window_size = window_size | |||
self.shift_size = window_size // 2 | |||
self.depth = depth | |||
self.use_checkpoint = use_checkpoint | |||
# build blocks | |||
self.blocks = nn.ModuleList([ | |||
CRFBlock( | |||
dim=dim, | |||
num_heads=num_heads, | |||
v_dim=v_dim, | |||
window_size=window_size, | |||
shift_size=0 if (i % 2 == 0) else window_size // 2, | |||
mlp_ratio=mlp_ratio, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
drop=drop, | |||
attn_drop=attn_drop, | |||
drop_path=drop_path[i] | |||
if isinstance(drop_path, list) else drop_path, | |||
norm_layer=norm_layer) for i in range(depth) | |||
]) | |||
# patch merging layer | |||
if downsample is not None: | |||
self.downsample = downsample(dim=dim, norm_layer=norm_layer) | |||
else: | |||
self.downsample = None | |||
def forward(self, x, v, H, W): | |||
""" Forward function. | |||
Args: | |||
            x: Input feature, tensor size (B, H*W, C).
            v: Value feature, tensor size (B, H, W, C).
            H, W: Spatial resolution of the input feature.
""" | |||
# calculate attention mask for SW-MSA | |||
Hp = int(np.ceil(H / self.window_size)) * self.window_size | |||
Wp = int(np.ceil(W / self.window_size)) * self.window_size | |||
img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 | |||
h_slices = (slice(0, -self.window_size), | |||
slice(-self.window_size, | |||
-self.shift_size), slice(-self.shift_size, None)) | |||
w_slices = (slice(0, -self.window_size), | |||
slice(-self.window_size, | |||
-self.shift_size), slice(-self.shift_size, None)) | |||
cnt = 0 | |||
for h in h_slices: | |||
for w in w_slices: | |||
img_mask[:, h, w, :] = cnt | |||
cnt += 1 | |||
mask_windows = window_partition( | |||
img_mask, self.window_size) # nW, window_size, window_size, 1 | |||
mask_windows = mask_windows.view(-1, | |||
self.window_size * self.window_size) | |||
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) | |||
attn_mask = attn_mask.masked_fill(attn_mask != 0, | |||
float(-100.0)).masked_fill( | |||
attn_mask == 0, float(0.0)) | |||
for blk in self.blocks: | |||
blk.H, blk.W = H, W | |||
if self.use_checkpoint: | |||
                # the block also needs the value feature v when checkpointing
                x = checkpoint.checkpoint(blk, x, v, attn_mask)
else: | |||
x = blk(x, v, attn_mask) | |||
if self.downsample is not None: | |||
x_down = self.downsample(x, H, W) | |||
Wh, Ww = (H + 1) // 2, (W + 1) // 2 | |||
return x, H, W, x_down, Wh, Ww | |||
else: | |||
return x, H, W, x, H, W | |||
class NewCRF(nn.Module): | |||
def __init__(self, | |||
input_dim=96, | |||
embed_dim=96, | |||
v_dim=64, | |||
window_size=7, | |||
num_heads=4, | |||
depth=2, | |||
patch_size=4, | |||
in_chans=3, | |||
norm_layer=nn.LayerNorm, | |||
patch_norm=True): | |||
super().__init__() | |||
self.embed_dim = embed_dim | |||
self.patch_norm = patch_norm | |||
if input_dim != embed_dim: | |||
self.proj_x = nn.Conv2d(input_dim, embed_dim, 3, padding=1) | |||
else: | |||
self.proj_x = None | |||
        if v_dim != embed_dim:
            # project the value branch to the embedding dimension as well,
            # so the assert below holds for any v_dim
            self.proj_v = nn.Conv2d(v_dim, embed_dim, 3, padding=1)
            v_dim = embed_dim
        else:
            self.proj_v = None
        assert v_dim == embed_dim
self.crf_layer = BasicCRFLayer( | |||
dim=embed_dim, | |||
depth=depth, | |||
num_heads=num_heads, | |||
v_dim=v_dim, | |||
window_size=window_size, | |||
mlp_ratio=4., | |||
qkv_bias=True, | |||
qk_scale=None, | |||
drop=0., | |||
attn_drop=0., | |||
drop_path=0., | |||
norm_layer=norm_layer, | |||
downsample=None, | |||
use_checkpoint=False) | |||
layer = norm_layer(embed_dim) | |||
layer_name = 'norm_crf' | |||
self.add_module(layer_name, layer) | |||
def forward(self, x, v): | |||
if self.proj_x is not None: | |||
x = self.proj_x(x) | |||
if self.proj_v is not None: | |||
v = self.proj_v(v) | |||
Wh, Ww = x.size(2), x.size(3) | |||
x = x.flatten(2).transpose(1, 2) | |||
v = v.transpose(1, 2).transpose(2, 3) | |||
x_out, H, W, x, Wh, Ww = self.crf_layer(x, v, Wh, Ww) | |||
norm_layer = getattr(self, 'norm_crf') | |||
x_out = norm_layer(x_out) | |||
out = x_out.view(-1, H, W, self.embed_dim).permute(0, 3, 1, | |||
2).contiguous() | |||
return out |
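# A minimal shape-check sketch (added for illustration, not part of the
# original file): NewCRF fuses an input feature map x with a value feature
# map v and returns a map with `embed_dim` channels at the input resolution.
# The dimensions below are arbitrary; proj_x/proj_v handle the channel
# mismatches.
def _demo_newcrf_shapes():
    crf = NewCRF(input_dim=192, embed_dim=96, v_dim=64, window_size=7,
                 num_heads=4, depth=2)
    x = torch.randn(2, 192, 28, 28)  # backbone feature, (B, input_dim, H, W)
    v = torch.randn(2, 64, 28, 28)   # value feature, (B, v_dim, H, W)
    out = crf(x, v)
    assert out.shape == (2, 96, 28, 28)  # (B, embed_dim, H, W)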
@@ -0,0 +1,272 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import os.path as osp | |||
import pkgutil | |||
import warnings | |||
from collections import OrderedDict | |||
from importlib import import_module | |||
import torch | |||
import torch.nn as nn | |||
import torchvision | |||
from torch import distributed as dist | |||
from torch.nn import functional as F | |||
from torch.nn.parallel import DataParallel, DistributedDataParallel | |||
from torch.utils import model_zoo | |||
TORCH_VERSION = torch.__version__ | |||
def resize(input, | |||
size=None, | |||
scale_factor=None, | |||
mode='nearest', | |||
align_corners=None, | |||
warning=True): | |||
if warning: | |||
if size is not None and align_corners: | |||
input_h, input_w = tuple(int(x) for x in input.shape[2:]) | |||
output_h, output_w = tuple(int(x) for x in size) | |||
            if output_h > input_h or output_w > input_w:
if ((output_h > 1 and output_w > 1 and input_h > 1 | |||
and input_w > 1) and (output_h - 1) % (input_h - 1) | |||
and (output_w - 1) % (input_w - 1)): | |||
warnings.warn( | |||
f'When align_corners={align_corners}, ' | |||
                        'the output would be more aligned if '
f'input size {(input_h, input_w)} is `x+1` and ' | |||
f'out size {(output_h, output_w)} is `nx+1`') | |||
if isinstance(size, torch.Size): | |||
size = tuple(int(x) for x in size) | |||
return F.interpolate(input, size, scale_factor, mode, align_corners) | |||
def normal_init(module, mean=0, std=1, bias=0): | |||
if hasattr(module, 'weight') and module.weight is not None: | |||
nn.init.normal_(module.weight, mean, std) | |||
if hasattr(module, 'bias') and module.bias is not None: | |||
nn.init.constant_(module.bias, bias) | |||
def is_module_wrapper(module): | |||
module_wrappers = (DataParallel, DistributedDataParallel) | |||
return isinstance(module, module_wrappers) | |||
def get_dist_info(): | |||
if TORCH_VERSION < '1.0': | |||
initialized = dist._initialized | |||
else: | |||
if dist.is_available(): | |||
initialized = dist.is_initialized() | |||
else: | |||
initialized = False | |||
if initialized: | |||
rank = dist.get_rank() | |||
world_size = dist.get_world_size() | |||
else: | |||
rank = 0 | |||
world_size = 1 | |||
return rank, world_size | |||
def load_state_dict(module, state_dict, strict=False, logger=None): | |||
"""Load state_dict to a module. | |||
This method is modified from :meth:`torch.nn.Module.load_state_dict`. | |||
Default value for ``strict`` is set to ``False`` and the message for | |||
param mismatch will be shown even if strict is False. | |||
Args: | |||
module (Module): Module that receives the state_dict. | |||
state_dict (OrderedDict): Weights. | |||
strict (bool): whether to strictly enforce that the keys | |||
in :attr:`state_dict` match the keys returned by this module's | |||
:meth:`~torch.nn.Module.state_dict` function. Default: ``False``. | |||
logger (:obj:`logging.Logger`, optional): Logger to log the error | |||
message. If not specified, print function will be used. | |||
""" | |||
unexpected_keys = [] | |||
all_missing_keys = [] | |||
err_msg = [] | |||
metadata = getattr(state_dict, '_metadata', None) | |||
state_dict = state_dict.copy() | |||
if metadata is not None: | |||
state_dict._metadata = metadata | |||
# use _load_from_state_dict to enable checkpoint version control | |||
def load(module, prefix=''): | |||
# recursively check parallel module in case that the model has a | |||
# complicated structure, e.g., nn.Module(nn.Module(DDP)) | |||
if is_module_wrapper(module): | |||
module = module.module | |||
local_metadata = {} if metadata is None else metadata.get( | |||
prefix[:-1], {}) | |||
module._load_from_state_dict(state_dict, prefix, local_metadata, True, | |||
all_missing_keys, unexpected_keys, | |||
err_msg) | |||
for name, child in module._modules.items(): | |||
if child is not None: | |||
load(child, prefix + name + '.') | |||
load(module) | |||
load = None # break load->load reference cycle | |||
# ignore "num_batches_tracked" of BN layers | |||
missing_keys = [ | |||
key for key in all_missing_keys if 'num_batches_tracked' not in key | |||
] | |||
if unexpected_keys: | |||
err_msg.append('unexpected key in source ' | |||
f'state_dict: {", ".join(unexpected_keys)}\n') | |||
if missing_keys: | |||
err_msg.append( | |||
f'missing keys in source state_dict: {", ".join(missing_keys)}\n') | |||
rank, _ = get_dist_info() | |||
if len(err_msg) > 0 and rank == 0: | |||
err_msg.insert( | |||
0, 'The model and loaded state dict do not match exactly\n') | |||
err_msg = '\n'.join(err_msg) | |||
if strict: | |||
raise RuntimeError(err_msg) | |||
elif logger is not None: | |||
logger.warning(err_msg) | |||
else: | |||
print(err_msg) | |||
def load_url_dist(url, model_dir=None): | |||
"""In distributed setting, this function only download checkpoint at local | |||
rank 0.""" | |||
rank, world_size = get_dist_info() | |||
rank = int(os.environ.get('LOCAL_RANK', rank)) | |||
if rank == 0: | |||
checkpoint = model_zoo.load_url(url, model_dir=model_dir) | |||
if world_size > 1: | |||
torch.distributed.barrier() | |||
if rank > 0: | |||
checkpoint = model_zoo.load_url(url, model_dir=model_dir) | |||
return checkpoint | |||
def get_torchvision_models(): | |||
model_urls = dict() | |||
for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): | |||
if ispkg: | |||
continue | |||
_zoo = import_module(f'torchvision.models.{name}') | |||
if hasattr(_zoo, 'model_urls'): | |||
_urls = getattr(_zoo, 'model_urls') | |||
model_urls.update(_urls) | |||
return model_urls | |||
def _load_checkpoint(filename, map_location=None): | |||
"""Load checkpoint from somewhere (modelzoo, file, url). | |||
Args: | |||
filename (str): Accept local filepath, URL, ``torchvision://xxx``, | |||
``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for | |||
details. | |||
map_location (str | None): Same as :func:`torch.load`. Default: None. | |||
Returns: | |||
dict | OrderedDict: The loaded checkpoint. It can be either an | |||
OrderedDict storing model weights or a dict containing other | |||
information, which depends on the checkpoint. | |||
""" | |||
if filename.startswith('modelzoo://'): | |||
warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' | |||
'use "torchvision://" instead') | |||
model_urls = get_torchvision_models() | |||
model_name = filename[11:] | |||
checkpoint = load_url_dist(model_urls[model_name]) | |||
else: | |||
if not osp.isfile(filename): | |||
raise IOError(f'{filename} is not a checkpoint file') | |||
checkpoint = torch.load(filename, map_location=map_location) | |||
return checkpoint | |||
def load_checkpoint(model, | |||
filename, | |||
map_location='cpu', | |||
strict=False, | |||
logger=None): | |||
"""Load checkpoint from a file or URI. | |||
Args: | |||
model (Module): Module to load checkpoint. | |||
filename (str): Accept local filepath, URL, ``torchvision://xxx``, | |||
``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for | |||
details. | |||
map_location (str): Same as :func:`torch.load`. | |||
strict (bool): Whether to allow different params for the model and | |||
checkpoint. | |||
logger (:mod:`logging.Logger` or None): The logger for error message. | |||
Returns: | |||
dict or OrderedDict: The loaded checkpoint. | |||
""" | |||
checkpoint = _load_checkpoint(filename, map_location) | |||
# OrderedDict is a subclass of dict | |||
if not isinstance(checkpoint, dict): | |||
raise RuntimeError( | |||
f'No state_dict found in checkpoint file {filename}') | |||
# get state_dict from checkpoint | |||
if 'state_dict' in checkpoint: | |||
state_dict = checkpoint['state_dict'] | |||
elif 'model' in checkpoint: | |||
state_dict = checkpoint['model'] | |||
else: | |||
state_dict = checkpoint | |||
# strip prefix of state_dict | |||
if list(state_dict.keys())[0].startswith('module.'): | |||
state_dict = {k[7:]: v for k, v in state_dict.items()} | |||
# for MoBY, load model of online branch | |||
if sorted(list(state_dict.keys()))[0].startswith('encoder'): | |||
state_dict = { | |||
k.replace('encoder.', ''): v | |||
for k, v in state_dict.items() if k.startswith('encoder.') | |||
} | |||
# reshape absolute position embedding | |||
if state_dict.get('absolute_pos_embed') is not None: | |||
absolute_pos_embed = state_dict['absolute_pos_embed'] | |||
N1, L, C1 = absolute_pos_embed.size() | |||
N2, C2, H, W = model.absolute_pos_embed.size() | |||
        if N1 != N2 or C1 != C2 or L != H * W:
            # logger may be None here, so fall back to a plain warning
            if logger is not None:
                logger.warning('Error in loading absolute_pos_embed, pass')
            else:
                warnings.warn('Error in loading absolute_pos_embed, pass')
else: | |||
state_dict['absolute_pos_embed'] = absolute_pos_embed.view( | |||
N2, H, W, C2).permute(0, 3, 1, 2) | |||
# interpolate position bias table if needed | |||
relative_position_bias_table_keys = [ | |||
k for k in state_dict.keys() if 'relative_position_bias_table' in k | |||
] | |||
for table_key in relative_position_bias_table_keys: | |||
table_pretrained = state_dict[table_key] | |||
table_current = model.state_dict()[table_key] | |||
L1, nH1 = table_pretrained.size() | |||
L2, nH2 = table_current.size() | |||
        if nH1 != nH2:
            # logger may be None here, so fall back to a plain warning
            if logger is not None:
                logger.warning(f'Error in loading {table_key}, pass')
            else:
                warnings.warn(f'Error in loading {table_key}, pass')
else: | |||
if L1 != L2: | |||
S1 = int(L1**0.5) | |||
S2 = int(L2**0.5) | |||
table_pretrained_resized = F.interpolate( | |||
table_pretrained.permute(1, 0).view(1, nH1, S1, S1), | |||
size=(S2, S2), | |||
mode='bicubic') | |||
state_dict[table_key] = table_pretrained_resized.view( | |||
nH2, L2).permute(1, 0) | |||
# load state_dict | |||
load_state_dict(model, state_dict, strict, logger) | |||
return checkpoint |
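# A runnable self-check sketch (added for illustration, not in the original
# file): round-trip a state_dict through a temporary file and load it back
# with the tolerant loader above. Assumes a POSIX tempfile that can be
# reopened while still open.
def _demo_load_checkpoint():
    import tempfile
    model = nn.Linear(4, 2)
    with tempfile.NamedTemporaryFile(suffix='.pth') as f:
        torch.save({'state_dict': model.state_dict()}, f.name)
        loaded = load_checkpoint(model, f.name, map_location='cpu')
    assert 'state_dict' in loaded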
@@ -0,0 +1,706 @@ | |||
# The implementation is adopted from Swin Transformer | |||
# made publicly available under the MIT License at https://github.com/microsoft/Swin-Transformer | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import torch.utils.checkpoint as checkpoint | |||
from timm.models.layers import DropPath, to_2tuple, trunc_normal_ | |||
from .newcrf_utils import load_checkpoint | |||
class Mlp(nn.Module): | |||
""" Multilayer perceptron.""" | |||
def __init__(self, | |||
in_features, | |||
hidden_features=None, | |||
out_features=None, | |||
act_layer=nn.GELU, | |||
drop=0.): | |||
super().__init__() | |||
out_features = out_features or in_features | |||
hidden_features = hidden_features or in_features | |||
self.fc1 = nn.Linear(in_features, hidden_features) | |||
self.act = act_layer() | |||
self.fc2 = nn.Linear(hidden_features, out_features) | |||
self.drop = nn.Dropout(drop) | |||
def forward(self, x): | |||
x = self.fc1(x) | |||
x = self.act(x) | |||
x = self.drop(x) | |||
x = self.fc2(x) | |||
x = self.drop(x) | |||
return x | |||
def window_partition(x, window_size): | |||
""" | |||
Args: | |||
x: (B, H, W, C) | |||
window_size (int): window size | |||
Returns: | |||
windows: (num_windows*B, window_size, window_size, C) | |||
""" | |||
B, H, W, C = x.shape | |||
x = x.view(B, H // window_size, window_size, W // window_size, window_size, | |||
C) | |||
windows = x.permute(0, 1, 3, 2, 4, | |||
5).contiguous().view(-1, window_size, window_size, C) | |||
return windows | |||
def window_reverse(windows, window_size, H, W): | |||
""" | |||
Args: | |||
windows: (num_windows*B, window_size, window_size, C) | |||
window_size (int): Window size | |||
H (int): Height of image | |||
W (int): Width of image | |||
Returns: | |||
x: (B, H, W, C) | |||
""" | |||
B = int(windows.shape[0] / (H * W / window_size / window_size)) | |||
x = windows.view(B, H // window_size, W // window_size, window_size, | |||
window_size, -1) | |||
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) | |||
return x | |||
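# A tiny self-check sketch (added for illustration, not in the original
# file): window_partition and window_reverse are exact inverses once H and W
# are multiples of window_size, which is what the padding in the blocks
# guarantees.
def _demo_window_roundtrip():
    x = torch.randn(2, 14, 14, 96)      # (B, H, W, C)
    windows = window_partition(x, 7)    # (2 * 2 * 2, 7, 7, 96)
    assert windows.shape == (8, 7, 7, 96)
    assert torch.equal(window_reverse(windows, 7, 14, 14), x)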
class WindowAttention(nn.Module): | |||
""" Window based multi-head self attention (W-MSA) module with relative position bias. | |||
It supports both of shifted and non-shifted window. | |||
Args: | |||
dim (int): Number of input channels. | |||
window_size (tuple[int]): The height and width of the window. | |||
num_heads (int): Number of attention heads. | |||
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set | |||
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 | |||
proj_drop (float, optional): Dropout ratio of output. Default: 0.0 | |||
""" | |||
def __init__(self, | |||
dim, | |||
window_size, | |||
num_heads, | |||
qkv_bias=True, | |||
qk_scale=None, | |||
attn_drop=0., | |||
proj_drop=0.): | |||
super().__init__() | |||
self.dim = dim | |||
self.window_size = window_size # Wh, Ww | |||
self.num_heads = num_heads | |||
head_dim = dim // num_heads | |||
self.scale = qk_scale or head_dim**-0.5 | |||
# define a parameter table of relative position bias | |||
self.relative_position_bias_table = nn.Parameter( | |||
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), | |||
num_heads)) # 2*Wh-1 * 2*Ww-1, nH | |||
# get pair-wise relative position index for each token inside the window | |||
coords_h = torch.arange(self.window_size[0]) | |||
coords_w = torch.arange(self.window_size[1]) | |||
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww | |||
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww | |||
relative_coords = coords_flatten[:, :, | |||
None] - coords_flatten[:, | |||
None, :] # 2, Wh*Ww, Wh*Ww | |||
relative_coords = relative_coords.permute( | |||
1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 | |||
relative_coords[:, :, | |||
0] += self.window_size[0] - 1 # shift to start from 0 | |||
relative_coords[:, :, 1] += self.window_size[1] - 1 | |||
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 | |||
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww | |||
self.register_buffer('relative_position_index', | |||
relative_position_index) | |||
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) | |||
self.attn_drop = nn.Dropout(attn_drop) | |||
self.proj = nn.Linear(dim, dim) | |||
self.proj_drop = nn.Dropout(proj_drop) | |||
trunc_normal_(self.relative_position_bias_table, std=.02) | |||
self.softmax = nn.Softmax(dim=-1) | |||
def forward(self, x, mask=None): | |||
""" Forward function. | |||
Args: | |||
x: input features with shape of (num_windows*B, N, C) | |||
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None | |||
""" | |||
B_, N, C = x.shape | |||
qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, | |||
C // self.num_heads).permute(2, 0, 3, 1, 4) | |||
q, k, v = qkv[0], qkv[1], qkv[ | |||
2] # make torchscript happy (cannot use tensor as tuple) | |||
q = q * self.scale | |||
attn = (q @ k.transpose(-2, -1)) | |||
relative_position_bias = self.relative_position_bias_table[ | |||
self.relative_position_index.view(-1)].view( | |||
self.window_size[0] * self.window_size[1], | |||
self.window_size[0] * self.window_size[1], | |||
-1) # Wh*Ww,Wh*Ww,nH | |||
relative_position_bias = relative_position_bias.permute( | |||
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww | |||
attn = attn + relative_position_bias.unsqueeze(0) | |||
if mask is not None: | |||
nW = mask.shape[0] | |||
attn = attn.view(B_ // nW, nW, self.num_heads, N, | |||
N) + mask.unsqueeze(1).unsqueeze(0) | |||
attn = attn.view(-1, self.num_heads, N, N) | |||
attn = self.softmax(attn) | |||
else: | |||
attn = self.softmax(attn) | |||
attn = self.attn_drop(attn) | |||
x = (attn @ v).transpose(1, 2).reshape(B_, N, C) | |||
x = self.proj(x) | |||
x = self.proj_drop(x) | |||
return x | |||
class SwinTransformerBlock(nn.Module): | |||
""" Swin Transformer Block. | |||
Args: | |||
dim (int): Number of input channels. | |||
num_heads (int): Number of attention heads. | |||
window_size (int): Window size. | |||
shift_size (int): Shift size for SW-MSA. | |||
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. | |||
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. | |||
drop (float, optional): Dropout rate. Default: 0.0 | |||
attn_drop (float, optional): Attention dropout rate. Default: 0.0 | |||
drop_path (float, optional): Stochastic depth rate. Default: 0.0 | |||
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU | |||
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
""" | |||
def __init__(self, | |||
dim, | |||
num_heads, | |||
window_size=7, | |||
shift_size=0, | |||
mlp_ratio=4., | |||
qkv_bias=True, | |||
qk_scale=None, | |||
drop=0., | |||
attn_drop=0., | |||
drop_path=0., | |||
act_layer=nn.GELU, | |||
norm_layer=nn.LayerNorm): | |||
super().__init__() | |||
self.dim = dim | |||
self.num_heads = num_heads | |||
self.window_size = window_size | |||
self.shift_size = shift_size | |||
self.mlp_ratio = mlp_ratio | |||
        assert 0 <= self.shift_size < self.window_size, 'shift_size must be in [0, window_size)'
self.norm1 = norm_layer(dim) | |||
self.attn = WindowAttention( | |||
dim, | |||
window_size=to_2tuple(self.window_size), | |||
num_heads=num_heads, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
attn_drop=attn_drop, | |||
proj_drop=drop) | |||
self.drop_path = DropPath( | |||
drop_path) if drop_path > 0. else nn.Identity() | |||
self.norm2 = norm_layer(dim) | |||
mlp_hidden_dim = int(dim * mlp_ratio) | |||
self.mlp = Mlp( | |||
in_features=dim, | |||
hidden_features=mlp_hidden_dim, | |||
act_layer=act_layer, | |||
drop=drop) | |||
self.H = None | |||
self.W = None | |||
def forward(self, x, mask_matrix): | |||
""" Forward function. | |||
Args: | |||
x: Input feature, tensor size (B, H*W, C). | |||
H, W: Spatial resolution of the input feature. | |||
mask_matrix: Attention mask for cyclic shift. | |||
""" | |||
B, L, C = x.shape | |||
H, W = self.H, self.W | |||
assert L == H * W, 'input feature has wrong size' | |||
shortcut = x | |||
x = self.norm1(x) | |||
x = x.view(B, H, W, C) | |||
# pad feature maps to multiples of window size | |||
pad_l = pad_t = 0 | |||
pad_r = (self.window_size - W % self.window_size) % self.window_size | |||
pad_b = (self.window_size - H % self.window_size) % self.window_size | |||
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) | |||
_, Hp, Wp, _ = x.shape | |||
# cyclic shift | |||
if self.shift_size > 0: | |||
shifted_x = torch.roll( | |||
x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) | |||
attn_mask = mask_matrix | |||
else: | |||
shifted_x = x | |||
attn_mask = None | |||
# partition windows | |||
x_windows = window_partition( | |||
shifted_x, self.window_size) # nW*B, window_size, window_size, C | |||
x_windows = x_windows.view(-1, self.window_size * self.window_size, | |||
C) # nW*B, window_size*window_size, C | |||
# W-MSA/SW-MSA | |||
attn_windows = self.attn( | |||
x_windows, mask=attn_mask) # nW*B, window_size*window_size, C | |||
# merge windows | |||
attn_windows = attn_windows.view(-1, self.window_size, | |||
self.window_size, C) | |||
shifted_x = window_reverse(attn_windows, self.window_size, Hp, | |||
Wp) # B H' W' C | |||
# reverse cyclic shift | |||
if self.shift_size > 0: | |||
x = torch.roll( | |||
shifted_x, | |||
shifts=(self.shift_size, self.shift_size), | |||
dims=(1, 2)) | |||
else: | |||
x = shifted_x | |||
if pad_r > 0 or pad_b > 0: | |||
x = x[:, :H, :W, :].contiguous() | |||
x = x.view(B, H * W, C) | |||
# FFN | |||
x = shortcut + self.drop_path(x) | |||
x = x + self.drop_path(self.mlp(self.norm2(x))) | |||
return x | |||
class PatchMerging(nn.Module): | |||
""" Patch Merging Layer | |||
Args: | |||
dim (int): Number of input channels. | |||
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
""" | |||
def __init__(self, dim, norm_layer=nn.LayerNorm): | |||
super().__init__() | |||
self.dim = dim | |||
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) | |||
self.norm = norm_layer(4 * dim) | |||
def forward(self, x, H, W): | |||
""" Forward function. | |||
Args: | |||
x: Input feature, tensor size (B, H*W, C). | |||
H, W: Spatial resolution of the input feature. | |||
""" | |||
B, L, C = x.shape | |||
assert L == H * W, 'input feature has wrong size' | |||
x = x.view(B, H, W, C) | |||
# padding | |||
pad_input = (H % 2 == 1) or (W % 2 == 1) | |||
if pad_input: | |||
x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) | |||
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C | |||
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C | |||
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C | |||
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C | |||
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C | |||
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C | |||
x = self.norm(x) | |||
x = self.reduction(x) | |||
return x | |||
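# Illustrative sketch (added here, not part of the original file):
# PatchMerging halves the spatial resolution and doubles the channels by
# concatenating each 2x2 neighbourhood and reducing 4C -> 2C linearly.
def _demo_patch_merging():
    merge = PatchMerging(dim=96)
    x = torch.randn(2, 14 * 14, 96)        # (B, H*W, C)
    out = merge(x, 14, 14)
    assert out.shape == (2, 7 * 7, 192)    # (B, H/2*W/2, 2C)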
class BasicLayer(nn.Module): | |||
""" A basic Swin Transformer layer for one stage. | |||
Args: | |||
dim (int): Number of feature channels | |||
depth (int): Depths of this stage. | |||
num_heads (int): Number of attention head. | |||
window_size (int): Local window size. Default: 7. | |||
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. | |||
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. | |||
drop (float, optional): Dropout rate. Default: 0.0 | |||
attn_drop (float, optional): Attention dropout rate. Default: 0.0 | |||
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 | |||
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm | |||
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None | |||
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. | |||
""" | |||
def __init__(self, | |||
dim, | |||
depth, | |||
num_heads, | |||
window_size=7, | |||
mlp_ratio=4., | |||
qkv_bias=True, | |||
qk_scale=None, | |||
drop=0., | |||
attn_drop=0., | |||
drop_path=0., | |||
norm_layer=nn.LayerNorm, | |||
downsample=None, | |||
use_checkpoint=False): | |||
super().__init__() | |||
self.window_size = window_size | |||
self.shift_size = window_size // 2 | |||
self.depth = depth | |||
self.use_checkpoint = use_checkpoint | |||
# build blocks | |||
self.blocks = nn.ModuleList([ | |||
SwinTransformerBlock( | |||
dim=dim, | |||
num_heads=num_heads, | |||
window_size=window_size, | |||
shift_size=0 if (i % 2 == 0) else window_size // 2, | |||
mlp_ratio=mlp_ratio, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
drop=drop, | |||
attn_drop=attn_drop, | |||
drop_path=drop_path[i] | |||
if isinstance(drop_path, list) else drop_path, | |||
norm_layer=norm_layer) for i in range(depth) | |||
]) | |||
# patch merging layer | |||
if downsample is not None: | |||
self.downsample = downsample(dim=dim, norm_layer=norm_layer) | |||
else: | |||
self.downsample = None | |||
def forward(self, x, H, W): | |||
""" Forward function. | |||
Args: | |||
x: Input feature, tensor size (B, H*W, C). | |||
H, W: Spatial resolution of the input feature. | |||
""" | |||
# calculate attention mask for SW-MSA | |||
Hp = int(np.ceil(H / self.window_size)) * self.window_size | |||
Wp = int(np.ceil(W / self.window_size)) * self.window_size | |||
img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 | |||
h_slices = (slice(0, -self.window_size), | |||
slice(-self.window_size, | |||
-self.shift_size), slice(-self.shift_size, None)) | |||
w_slices = (slice(0, -self.window_size), | |||
slice(-self.window_size, | |||
-self.shift_size), slice(-self.shift_size, None)) | |||
cnt = 0 | |||
for h in h_slices: | |||
for w in w_slices: | |||
img_mask[:, h, w, :] = cnt | |||
cnt += 1 | |||
mask_windows = window_partition( | |||
img_mask, self.window_size) # nW, window_size, window_size, 1 | |||
mask_windows = mask_windows.view(-1, | |||
self.window_size * self.window_size) | |||
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) | |||
attn_mask = attn_mask.masked_fill(attn_mask != 0, | |||
float(-100.0)).masked_fill( | |||
attn_mask == 0, float(0.0)) | |||
for blk in self.blocks: | |||
blk.H, blk.W = H, W | |||
if self.use_checkpoint: | |||
x = checkpoint.checkpoint(blk, x, attn_mask) | |||
else: | |||
x = blk(x, attn_mask) | |||
if self.downsample is not None: | |||
x_down = self.downsample(x, H, W) | |||
Wh, Ww = (H + 1) // 2, (W + 1) // 2 | |||
return x, H, W, x_down, Wh, Ww | |||
else: | |||
return x, H, W, x, H, W | |||
class PatchEmbed(nn.Module): | |||
""" Image to Patch Embedding | |||
Args: | |||
patch_size (int): Patch token size. Default: 4. | |||
in_chans (int): Number of input image channels. Default: 3. | |||
embed_dim (int): Number of linear projection output channels. Default: 96. | |||
norm_layer (nn.Module, optional): Normalization layer. Default: None | |||
""" | |||
def __init__(self, | |||
patch_size=4, | |||
in_chans=3, | |||
embed_dim=96, | |||
norm_layer=None): | |||
super().__init__() | |||
patch_size = to_2tuple(patch_size) | |||
self.patch_size = patch_size | |||
self.in_chans = in_chans | |||
self.embed_dim = embed_dim | |||
self.proj = nn.Conv2d( | |||
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) | |||
if norm_layer is not None: | |||
self.norm = norm_layer(embed_dim) | |||
else: | |||
self.norm = None | |||
def forward(self, x): | |||
"""Forward function.""" | |||
# padding | |||
_, _, H, W = x.size() | |||
if W % self.patch_size[1] != 0: | |||
x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) | |||
if H % self.patch_size[0] != 0: | |||
x = F.pad(x, | |||
(0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) | |||
x = self.proj(x) # B C Wh Ww | |||
if self.norm is not None: | |||
Wh, Ww = x.size(2), x.size(3) | |||
x = x.flatten(2).transpose(1, 2) | |||
x = self.norm(x) | |||
x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) | |||
return x | |||
class SwinTransformer(nn.Module): | |||
""" Swin Transformer backbone. | |||
    A PyTorch impl of: `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
https://arxiv.org/pdf/2103.14030 | |||
Args: | |||
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute position embedding. Default 224.
patch_size (int | tuple(int)): Patch size. Default: 4. | |||
in_chans (int): Number of input image channels. Default: 3. | |||
embed_dim (int): Number of linear projection output channels. Default: 96. | |||
depths (tuple[int]): Depths of each Swin Transformer stage. | |||
num_heads (tuple[int]): Number of attention head of each stage. | |||
window_size (int): Window size. Default: 7. | |||
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. | |||
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True | |||
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. | |||
drop_rate (float): Dropout rate. | |||
attn_drop_rate (float): Attention dropout rate. Default: 0. | |||
drop_path_rate (float): Stochastic depth rate. Default: 0.2. | |||
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. | |||
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. | |||
patch_norm (bool): If True, add normalization after patch embedding. Default: True. | |||
out_indices (Sequence[int]): Output from which stages. | |||
frozen_stages (int): Stages to be frozen (stop grad and set eval mode). | |||
-1 means not freezing any parameters. | |||
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. | |||
""" | |||
def __init__(self, | |||
pretrain_img_size=224, | |||
patch_size=4, | |||
in_chans=3, | |||
embed_dim=96, | |||
depths=[2, 2, 6, 2], | |||
num_heads=[3, 6, 12, 24], | |||
window_size=7, | |||
mlp_ratio=4., | |||
qkv_bias=True, | |||
qk_scale=None, | |||
drop_rate=0., | |||
attn_drop_rate=0., | |||
drop_path_rate=0.2, | |||
norm_layer=nn.LayerNorm, | |||
ape=False, | |||
patch_norm=True, | |||
out_indices=(0, 1, 2, 3), | |||
frozen_stages=-1, | |||
use_checkpoint=False): | |||
super().__init__() | |||
self.pretrain_img_size = pretrain_img_size | |||
self.num_layers = len(depths) | |||
self.embed_dim = embed_dim | |||
self.ape = ape | |||
self.patch_norm = patch_norm | |||
self.out_indices = out_indices | |||
self.frozen_stages = frozen_stages | |||
# split image into non-overlapping patches | |||
self.patch_embed = PatchEmbed( | |||
patch_size=patch_size, | |||
in_chans=in_chans, | |||
embed_dim=embed_dim, | |||
norm_layer=norm_layer if self.patch_norm else None) | |||
# absolute position embedding | |||
if self.ape: | |||
pretrain_img_size = to_2tuple(pretrain_img_size) | |||
patch_size = to_2tuple(patch_size) | |||
patches_resolution = [ | |||
pretrain_img_size[0] // patch_size[0], | |||
pretrain_img_size[1] // patch_size[1] | |||
] | |||
self.absolute_pos_embed = nn.Parameter( | |||
torch.zeros(1, embed_dim, patches_resolution[0], | |||
patches_resolution[1])) | |||
trunc_normal_(self.absolute_pos_embed, std=.02) | |||
self.pos_drop = nn.Dropout(p=drop_rate) | |||
# stochastic depth | |||
dpr = [ | |||
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) | |||
] # stochastic depth decay rule | |||
# build layers | |||
self.layers = nn.ModuleList() | |||
for i_layer in range(self.num_layers): | |||
layer = BasicLayer( | |||
dim=int(embed_dim * 2**i_layer), | |||
depth=depths[i_layer], | |||
num_heads=num_heads[i_layer], | |||
window_size=window_size, | |||
mlp_ratio=mlp_ratio, | |||
qkv_bias=qkv_bias, | |||
qk_scale=qk_scale, | |||
drop=drop_rate, | |||
attn_drop=attn_drop_rate, | |||
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], | |||
norm_layer=norm_layer, | |||
downsample=PatchMerging if | |||
(i_layer < self.num_layers - 1) else None, | |||
use_checkpoint=use_checkpoint) | |||
self.layers.append(layer) | |||
num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] | |||
self.num_features = num_features | |||
# add a norm layer for each output | |||
for i_layer in out_indices: | |||
layer = norm_layer(num_features[i_layer]) | |||
layer_name = f'norm{i_layer}' | |||
self.add_module(layer_name, layer) | |||
self._freeze_stages() | |||
def _freeze_stages(self): | |||
if self.frozen_stages >= 0: | |||
self.patch_embed.eval() | |||
for param in self.patch_embed.parameters(): | |||
param.requires_grad = False | |||
if self.frozen_stages >= 1 and self.ape: | |||
self.absolute_pos_embed.requires_grad = False | |||
if self.frozen_stages >= 2: | |||
self.pos_drop.eval() | |||
for i in range(0, self.frozen_stages - 1): | |||
m = self.layers[i] | |||
m.eval() | |||
for param in m.parameters(): | |||
param.requires_grad = False | |||
def init_weights(self, pretrained=None): | |||
"""Initialize the weights in backbone. | |||
Args: | |||
pretrained (str, optional): Path to pre-trained weights. | |||
Defaults to None. | |||
""" | |||
def _init_weights(m): | |||
if isinstance(m, nn.Linear): | |||
trunc_normal_(m.weight, std=.02) | |||
if isinstance(m, nn.Linear) and m.bias is not None: | |||
nn.init.constant_(m.bias, 0) | |||
elif isinstance(m, nn.LayerNorm): | |||
nn.init.constant_(m.bias, 0) | |||
nn.init.constant_(m.weight, 1.0) | |||
if isinstance(pretrained, str): | |||
self.apply(_init_weights) | |||
# logger = get_root_logger() | |||
load_checkpoint(self, pretrained, strict=False) | |||
elif pretrained is None: | |||
self.apply(_init_weights) | |||
else: | |||
raise TypeError('pretrained must be a str or None') | |||
def forward(self, x): | |||
"""Forward function.""" | |||
x = self.patch_embed(x) | |||
Wh, Ww = x.size(2), x.size(3) | |||
if self.ape: | |||
# interpolate the position embedding to the corresponding size | |||
absolute_pos_embed = F.interpolate( | |||
self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic') | |||
x = (x + absolute_pos_embed).flatten(2).transpose(1, | |||
2) # B Wh*Ww C | |||
else: | |||
x = x.flatten(2).transpose(1, 2) | |||
x = self.pos_drop(x) | |||
outs = [] | |||
for i in range(self.num_layers): | |||
layer = self.layers[i] | |||
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) | |||
if i in self.out_indices: | |||
norm_layer = getattr(self, f'norm{i}') | |||
x_out = norm_layer(x_out) | |||
out = x_out.view(-1, H, W, | |||
self.num_features[i]).permute(0, 3, 1, | |||
2).contiguous() | |||
outs.append(out) | |||
return tuple(outs) | |||
def train(self, mode=True): | |||
"""Convert the model into training mode while keep layers freezed.""" | |||
super(SwinTransformer, self).train(mode) | |||
self._freeze_stages() |
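# A minimal usage sketch (added for illustration, not in the original file):
# the backbone returns a feature pyramid with strides 4/8/16/32 and channel
# counts embed_dim * 2**i for the requested stages.
def _demo_swin_backbone():
    model = SwinTransformer(embed_dim=96, depths=[2, 2, 6, 2],
                            num_heads=[3, 6, 12, 24], out_indices=(0, 1, 2, 3))
    model.init_weights()                  # random init when `pretrained` is None
    outs = model(torch.randn(1, 3, 224, 224))
    assert [o.shape[1] for o in outs] == [96, 192, 384, 768]
    assert outs[0].shape[-2:] == (56, 56)  # stride-4 feature map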
@@ -0,0 +1,365 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from mmcv.cnn import ConvModule | |||
from .newcrf_utils import normal_init, resize | |||
class PPM(nn.ModuleList): | |||
"""Pooling Pyramid Module used in PSPNet. | |||
Args: | |||
pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid | |||
Module. | |||
in_channels (int): Input channels. | |||
channels (int): Channels after modules, before conv_seg. | |||
conv_cfg (dict|None): Config of conv layers. | |||
norm_cfg (dict|None): Config of norm layers. | |||
act_cfg (dict): Config of activation layers. | |||
align_corners (bool): align_corners argument of F.interpolate. | |||
""" | |||
def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_cfg, | |||
act_cfg, align_corners): | |||
super(PPM, self).__init__() | |||
self.pool_scales = pool_scales | |||
self.align_corners = align_corners | |||
self.in_channels = in_channels | |||
self.channels = channels | |||
self.conv_cfg = conv_cfg | |||
self.norm_cfg = norm_cfg | |||
self.act_cfg = act_cfg | |||
for pool_scale in pool_scales: | |||
            # if batch size = 1, BN is not supported, so change to GN
            # (note: once triggered, GN is kept for the remaining pool scales)
if pool_scale == 1: | |||
norm_cfg = dict(type='GN', requires_grad=True, num_groups=256) | |||
self.append( | |||
nn.Sequential( | |||
nn.AdaptiveAvgPool2d(pool_scale), | |||
ConvModule( | |||
self.in_channels, | |||
self.channels, | |||
1, | |||
conv_cfg=self.conv_cfg, | |||
norm_cfg=norm_cfg, | |||
act_cfg=self.act_cfg))) | |||
def forward(self, x): | |||
"""Forward function.""" | |||
ppm_outs = [] | |||
for ppm in self: | |||
ppm_out = ppm(x) | |||
upsampled_ppm_out = resize( | |||
ppm_out, | |||
size=x.size()[2:], | |||
mode='bilinear', | |||
align_corners=self.align_corners) | |||
ppm_outs.append(upsampled_ppm_out) | |||
return ppm_outs | |||
class BaseDecodeHead(nn.Module): | |||
"""Base class for BaseDecodeHead. | |||
Args: | |||
in_channels (int|Sequence[int]): Input channels. | |||
channels (int): Channels after modules, before conv_seg. | |||
num_classes (int): Number of classes. | |||
dropout_ratio (float): Ratio of dropout layer. Default: 0.1. | |||
conv_cfg (dict|None): Config of conv layers. Default: None. | |||
norm_cfg (dict|None): Config of norm layers. Default: None. | |||
act_cfg (dict): Config of activation layers. | |||
Default: dict(type='ReLU') | |||
in_index (int|Sequence[int]): Input feature index. Default: -1 | |||
input_transform (str|None): Transformation type of input features. | |||
Options: 'resize_concat', 'multiple_select', None. | |||
            'resize_concat': Multiple feature maps will be resized to the
                same size as the first one and then concatenated together.
                Usually used in FCN head of HRNet.
            'multiple_select': Multiple feature maps will be bundled into
                a list and passed into decode head.
            None: Only one select feature map is allowed.
Default: None. | |||
loss_decode (dict): Config of decode loss. | |||
Default: dict(type='CrossEntropyLoss'). | |||
ignore_index (int | None): The label index to be ignored. When using | |||
masked BCE loss, ignore_index should be set to None. Default: 255 | |||
sampler (dict|None): The config of segmentation map sampler. | |||
Default: None. | |||
align_corners (bool): align_corners argument of F.interpolate. | |||
Default: False. | |||
""" | |||
def __init__(self, | |||
in_channels, | |||
channels, | |||
*, | |||
num_classes, | |||
dropout_ratio=0.1, | |||
conv_cfg=None, | |||
norm_cfg=None, | |||
act_cfg=dict(type='ReLU'), | |||
in_index=-1, | |||
input_transform=None, | |||
loss_decode=dict( | |||
type='CrossEntropyLoss', | |||
use_sigmoid=False, | |||
loss_weight=1.0), | |||
ignore_index=255, | |||
sampler=None, | |||
align_corners=False): | |||
super(BaseDecodeHead, self).__init__() | |||
self._init_inputs(in_channels, in_index, input_transform) | |||
self.channels = channels | |||
self.num_classes = num_classes | |||
self.dropout_ratio = dropout_ratio | |||
self.conv_cfg = conv_cfg | |||
self.norm_cfg = norm_cfg | |||
self.act_cfg = act_cfg | |||
self.in_index = in_index | |||
# self.loss_decode = build_loss(loss_decode) | |||
self.ignore_index = ignore_index | |||
self.align_corners = align_corners | |||
# if sampler is not None: | |||
# self.sampler = build_pixel_sampler(sampler, context=self) | |||
# else: | |||
# self.sampler = None | |||
# self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) | |||
# self.conv1 = nn.Conv2d(channels, num_classes, 3, padding=1) | |||
if dropout_ratio > 0: | |||
self.dropout = nn.Dropout2d(dropout_ratio) | |||
else: | |||
self.dropout = None | |||
self.fp16_enabled = False | |||
def extra_repr(self): | |||
"""Extra repr.""" | |||
s = f'input_transform={self.input_transform}, ' \ | |||
f'ignore_index={self.ignore_index}, ' \ | |||
f'align_corners={self.align_corners}' | |||
return s | |||
def _init_inputs(self, in_channels, in_index, input_transform): | |||
"""Check and initialize input transforms. | |||
The in_channels, in_index and input_transform must match. | |||
        Specifically, when input_transform is None, only single feature map
        will be selected. So in_channels and in_index must be of type int.
        When input_transform is not None, in_channels and in_index must be
        list or tuple, with the same length.
Args: | |||
in_channels (int|Sequence[int]): Input channels. | |||
in_index (int|Sequence[int]): Input feature index. | |||
input_transform (str|None): Transformation type of input features. | |||
Options: 'resize_concat', 'multiple_select', None. | |||
                'resize_concat': Multiple feature maps will be resized to the
                    same size as the first one and then concatenated together.
                    Usually used in FCN head of HRNet.
                'multiple_select': Multiple feature maps will be bundled into
                    a list and passed into decode head.
                None: Only one select feature map is allowed.
""" | |||
if input_transform is not None: | |||
assert input_transform in ['resize_concat', 'multiple_select'] | |||
self.input_transform = input_transform | |||
self.in_index = in_index | |||
if input_transform is not None: | |||
assert isinstance(in_channels, (list, tuple)) | |||
assert isinstance(in_index, (list, tuple)) | |||
assert len(in_channels) == len(in_index) | |||
if input_transform == 'resize_concat': | |||
self.in_channels = sum(in_channels) | |||
else: | |||
self.in_channels = in_channels | |||
else: | |||
assert isinstance(in_channels, int) | |||
assert isinstance(in_index, int) | |||
self.in_channels = in_channels | |||
def init_weights(self): | |||
"""Initialize weights of classification layer.""" | |||
# normal_init(self.conv_seg, mean=0, std=0.01) | |||
# normal_init(self.conv1, mean=0, std=0.01) | |||
def _transform_inputs(self, inputs): | |||
"""Transform inputs for decoder. | |||
Args: | |||
inputs (list[Tensor]): List of multi-level img features. | |||
Returns: | |||
Tensor: The transformed inputs | |||
""" | |||
if self.input_transform == 'resize_concat': | |||
inputs = [inputs[i] for i in self.in_index] | |||
upsampled_inputs = [ | |||
resize( | |||
input=x, | |||
size=inputs[0].shape[2:], | |||
mode='bilinear', | |||
align_corners=self.align_corners) for x in inputs | |||
] | |||
inputs = torch.cat(upsampled_inputs, dim=1) | |||
elif self.input_transform == 'multiple_select': | |||
inputs = [inputs[i] for i in self.in_index] | |||
else: | |||
inputs = inputs[self.in_index] | |||
return inputs | |||
def forward(self, inputs): | |||
"""Placeholder of forward function.""" | |||
pass | |||
def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg): | |||
"""Forward function for training. | |||
Args: | |||
inputs (list[Tensor]): List of multi-level img features. | |||
img_metas (list[dict]): List of image info dict where each dict | |||
has: 'img_shape', 'scale_factor', 'flip', and may also contain | |||
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
For details on the values of these keys see | |||
`mmseg/datasets/pipelines/formatting.py:Collect`. | |||
gt_semantic_seg (Tensor): Semantic segmentation masks | |||
used if the architecture supports semantic segmentation task. | |||
train_cfg (dict): The training config. | |||
Returns: | |||
dict[str, Tensor]: a dictionary of loss components | |||
""" | |||
seg_logits = self.forward(inputs) | |||
losses = self.losses(seg_logits, gt_semantic_seg) | |||
return losses | |||
def forward_test(self, inputs, img_metas, test_cfg): | |||
"""Forward function for testing. | |||
Args: | |||
inputs (list[Tensor]): List of multi-level img features. | |||
img_metas (list[dict]): List of image info dict where each dict | |||
has: 'img_shape', 'scale_factor', 'flip', and may also contain | |||
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. | |||
For details on the values of these keys see | |||
`mmseg/datasets/pipelines/formatting.py:Collect`. | |||
test_cfg (dict): The testing config. | |||
Returns: | |||
Tensor: Output segmentation map. | |||
""" | |||
return self.forward(inputs) | |||
class UPerHead(BaseDecodeHead): | |||
def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): | |||
super(UPerHead, self).__init__( | |||
input_transform='multiple_select', **kwargs) | |||
# FPN Module | |||
self.lateral_convs = nn.ModuleList() | |||
self.fpn_convs = nn.ModuleList() | |||
        for in_channels in self.in_channels:  # build one lateral/FPN conv per input level
l_conv = ConvModule( | |||
in_channels, | |||
self.channels, | |||
1, | |||
conv_cfg=self.conv_cfg, | |||
norm_cfg=self.norm_cfg, | |||
act_cfg=self.act_cfg, | |||
inplace=True) | |||
fpn_conv = ConvModule( | |||
self.channels, | |||
self.channels, | |||
3, | |||
padding=1, | |||
conv_cfg=self.conv_cfg, | |||
norm_cfg=self.norm_cfg, | |||
act_cfg=self.act_cfg, | |||
inplace=True) | |||
self.lateral_convs.append(l_conv) | |||
self.fpn_convs.append(fpn_conv) | |||
def forward(self, inputs): | |||
"""Forward function.""" | |||
inputs = self._transform_inputs(inputs) | |||
# build laterals | |||
laterals = [ | |||
lateral_conv(inputs[i]) | |||
for i, lateral_conv in enumerate(self.lateral_convs) | |||
] | |||
# laterals.append(self.psp_forward(inputs)) | |||
# build top-down path | |||
used_backbone_levels = len(laterals) | |||
for i in range(used_backbone_levels - 1, 0, -1): | |||
prev_shape = laterals[i - 1].shape[2:] | |||
laterals[i - 1] += resize( | |||
laterals[i], | |||
size=prev_shape, | |||
mode='bilinear', | |||
align_corners=self.align_corners) | |||
# build outputs | |||
fpn_outs = [ | |||
self.fpn_convs[i](laterals[i]) | |||
for i in range(used_backbone_levels - 1) | |||
] | |||
        # append the top-level lateral feature
fpn_outs.append(laterals[-1]) | |||
return fpn_outs[0] | |||
class PSP(BaseDecodeHead): | |||
"""Unified Perceptual Parsing for Scene Understanding. | |||
This head is the implementation of `UPerNet | |||
<https://arxiv.org/abs/1807.10221>`_. | |||
Args: | |||
pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid | |||
Module applied on the last feature. Default: (1, 2, 3, 6). | |||
""" | |||
def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): | |||
super(PSP, self).__init__(input_transform='multiple_select', **kwargs) | |||
# PSP Module | |||
self.psp_modules = PPM( | |||
pool_scales, | |||
self.in_channels[-1], | |||
self.channels, | |||
conv_cfg=self.conv_cfg, | |||
norm_cfg=self.norm_cfg, | |||
act_cfg=self.act_cfg, | |||
align_corners=self.align_corners) | |||
self.bottleneck = ConvModule( | |||
self.in_channels[-1] + len(pool_scales) * self.channels, | |||
self.channels, | |||
3, | |||
padding=1, | |||
conv_cfg=self.conv_cfg, | |||
norm_cfg=self.norm_cfg, | |||
act_cfg=self.act_cfg) | |||
def psp_forward(self, inputs): | |||
"""Forward function of PSP module.""" | |||
x = inputs[-1] | |||
psp_outs = [x] | |||
psp_outs.extend(self.psp_modules(x)) | |||
psp_outs = torch.cat(psp_outs, dim=1) | |||
output = self.bottleneck(psp_outs) | |||
return output | |||
def forward(self, inputs): | |||
"""Forward function.""" | |||
inputs = self._transform_inputs(inputs) | |||
return self.psp_forward(inputs) |
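# A hedged usage sketch (added for illustration, not in the original file):
# PSP pools the last feature map at several scales, upsamples each pooled map
# back, concatenates everything with the input and fuses it with a 3x3
# bottleneck conv. The channel sizes and norm_cfg below are assumptions.
def _demo_psp_head():
    head = PSP(in_channels=[96, 192, 384, 768], in_index=[0, 1, 2, 3],
               channels=512, num_classes=1,
               norm_cfg=dict(type='BN', requires_grad=True))
    feats = [torch.randn(2, c, 56 // 2**i, 56 // 2**i)
             for i, c in enumerate([96, 192, 384, 768])]
    out = head(feats)
    assert out.shape == (2, 512, 7, 7)  # fused map at the coarsest scale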
@@ -0,0 +1,53 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os.path as osp | |||
import numpy as np | |||
import torch | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base.base_torch_model import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.models.cv.image_depth_estimation.networks.newcrf_depth import \ | |||
NewCRFDepth | |||
from modelscope.outputs import OutputKeys | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
@MODELS.register_module( | |||
Tasks.image_depth_estimation, module_name=Models.newcrfs_depth_estimation) | |||
class DepthEstimation(TorchModel): | |||
    def __init__(self, model_dir: str, **kwargs):
        """Build the NeWCRFs depth estimation model and load its weights.
        Args:
            model_dir (str): Root directory of the model files.
        """
        super().__init__(model_dir, **kwargs)
# build model | |||
self.model = NewCRFDepth( | |||
version='large07', inv_depth=False, max_depth=10) | |||
# load model | |||
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
checkpoint = torch.load(model_path) | |||
state_dict = {} | |||
for k in checkpoint['model'].keys(): | |||
if k.startswith('module.'): | |||
state_dict[k[7:]] = checkpoint['model'][k] | |||
else: | |||
state_dict[k] = checkpoint['model'][k] | |||
self.model.load_state_dict(state_dict) | |||
self.model.eval() | |||
    def forward(self, inputs):
        return self.model(inputs['imgs'])
    def postprocess(self, inputs):
        results = {OutputKeys.DEPTHS: inputs}
        return results
def inference(self, data): | |||
results = self.forward(data) | |||
return results |
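# A hedged usage sketch (added for illustration, not part of the original
# file). The model is normally built through the modelscope pipeline; calling
# it directly assumes a local `model_dir` containing ModelFile.TORCH_MODEL_FILE
# (the path below is a placeholder):
#
#   model = DepthEstimation('/path/to/model_dir')
#   out = model.postprocess(model.inference({'imgs': torch.randn(1, 3, 480, 640)}))
#   print(out[OutputKeys.DEPTHS].shape)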
@@ -25,7 +25,14 @@ def seg_resize(input, | |||
                        'the output would be more aligned if '
f'input size {(input_h, input_w)} is `x+1` and ' | |||
f'out size {(output_h, output_w)} is `nx+1`') | |||
return F.interpolate(input, size, scale_factor, mode, align_corners) | |||
    try:
        return F.interpolate(input, size, scale_factor, mode, align_corners)
    except ValueError:
        # some callers pass a 3-element size (e.g. including a channel dim);
        # drop the extra entry and retry, otherwise re-raise
        if isinstance(size, tuple) and len(size) == 3:
            size = size[:2]
            return F.interpolate(input, size, scale_factor, mode, align_corners)
        raise
def add_prefix(inputs, prefix): | |||
@@ -1,3 +1,4 @@ | |||
# The implementation is adopted from U-2-Net, made publicly available under the Apache 2.0 License | |||
# source code available via https://github.com/xuebinqin/U-2-Net
from .senet import SENet | |||
from .u2net import U2NET |
@@ -0,0 +1,187 @@ | |||
# Implementation in this file is modified based on Res2Net-PretrainedModels | |||
# Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License | |||
# publicly available at https://github.com/Res2Net/Res2Net-PretrainedModels/blob/master/res2net_v1b.py
import math | |||
import torch | |||
import torch.nn as nn | |||
__all__ = ['Res2Net', 'res2net50_v1b_26w_4s'] | |||
class Bottle2neck(nn.Module): | |||
expansion = 4 | |||
def __init__(self, | |||
inplanes, | |||
planes, | |||
stride=1, | |||
downsample=None, | |||
baseWidth=26, | |||
scale=4, | |||
stype='normal'): | |||
""" Constructor | |||
Args: | |||
inplanes: input channel dimensionality | |||
planes: output channel dimensionality | |||
stride: conv stride. Replaces pooling layer. | |||
downsample: None when stride = 1 | |||
baseWidth: basic width of conv3x3 | |||
            scale: number of scales.
            stype: 'normal': normal set. 'stage': first block of a new stage.
""" | |||
super(Bottle2neck, self).__init__() | |||
width = int(math.floor(planes * (baseWidth / 64.0))) | |||
self.conv1 = nn.Conv2d( | |||
inplanes, width * scale, kernel_size=1, bias=False) | |||
self.bn1 = nn.BatchNorm2d(width * scale) | |||
if scale == 1: | |||
self.nums = 1 | |||
else: | |||
self.nums = scale - 1 | |||
if stype == 'stage': | |||
self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1) | |||
convs = [] | |||
bns = [] | |||
for i in range(self.nums): | |||
convs.append( | |||
nn.Conv2d( | |||
width, | |||
width, | |||
kernel_size=3, | |||
stride=stride, | |||
padding=1, | |||
bias=False)) | |||
bns.append(nn.BatchNorm2d(width)) | |||
self.convs = nn.ModuleList(convs) | |||
self.bns = nn.ModuleList(bns) | |||
self.conv3 = nn.Conv2d( | |||
width * scale, planes * self.expansion, kernel_size=1, bias=False) | |||
self.bn3 = nn.BatchNorm2d(planes * self.expansion) | |||
self.relu = nn.ReLU(inplace=True) | |||
self.downsample = downsample | |||
self.stype = stype | |||
self.scale = scale | |||
self.width = width | |||
def forward(self, x): | |||
residual = x | |||
out = self.conv1(x) | |||
out = self.bn1(out) | |||
out = self.relu(out) | |||
spx = torch.split(out, self.width, 1) | |||
for i in range(self.nums): | |||
if i == 0 or self.stype == 'stage': | |||
sp = spx[i] | |||
else: | |||
sp = sp + spx[i] | |||
sp = self.convs[i](sp) | |||
sp = self.relu(self.bns[i](sp)) | |||
if i == 0: | |||
out = sp | |||
else: | |||
out = torch.cat((out, sp), 1) | |||
if self.scale != 1 and self.stype == 'normal': | |||
out = torch.cat((out, spx[self.nums]), 1) | |||
elif self.scale != 1 and self.stype == 'stage': | |||
out = torch.cat((out, self.pool(spx[self.nums])), 1) | |||
out = self.conv3(out) | |||
out = self.bn3(out) | |||
if self.downsample is not None: | |||
residual = self.downsample(x) | |||
out += residual | |||
out = self.relu(out) | |||
return out | |||
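For concreteness, the channel bookkeeping of `Bottle2neck` with the defaults used below (`baseWidth=26`, `scale=4`) and a typical first-stage block:

```python
import math

inplanes, planes, baseWidth, scale = 256, 64, 26, 4
width = int(math.floor(planes * (baseWidth / 64.0)))
print(width)          # 26: channels per split branch
print(width * scale)  # 104: conv1 output, split into 4 chunks of 26
print(planes * 4)     # 256: conv3 restores planes * expansion
```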
class Res2Net(nn.Module): | |||
def __init__(self, block, layers, baseWidth=26, scale=4, num_classes=1000): | |||
self.inplanes = 64 | |||
super(Res2Net, self).__init__() | |||
self.baseWidth = baseWidth | |||
self.scale = scale | |||
self.conv1 = nn.Sequential( | |||
nn.Conv2d(3, 32, 3, 2, 1, bias=False), nn.BatchNorm2d(32), | |||
nn.ReLU(inplace=True), nn.Conv2d(32, 32, 3, 1, 1, bias=False), | |||
nn.BatchNorm2d(32), nn.ReLU(inplace=True), | |||
nn.Conv2d(32, 64, 3, 1, 1, bias=False)) | |||
self.bn1 = nn.BatchNorm2d(64) | |||
self.relu = nn.ReLU() | |||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) | |||
self.layer1 = self._make_layer(block, 64, layers[0]) | |||
self.layer2 = self._make_layer(block, 128, layers[1], stride=2) | |||
self.layer3 = self._make_layer(block, 256, layers[2], stride=2) | |||
self.layer4 = self._make_layer(block, 512, layers[3], stride=2) | |||
self.avgpool = nn.AdaptiveAvgPool2d(1) | |||
self.fc = nn.Linear(512 * block.expansion, num_classes) | |||
for m in self.modules(): | |||
if isinstance(m, nn.Conv2d): | |||
nn.init.kaiming_normal_( | |||
m.weight, mode='fan_out', nonlinearity='relu') | |||
elif isinstance(m, nn.BatchNorm2d): | |||
nn.init.constant_(m.weight, 1) | |||
nn.init.constant_(m.bias, 0) | |||
def _make_layer(self, block, planes, blocks, stride=1): | |||
downsample = None | |||
if stride != 1 or self.inplanes != planes * block.expansion: | |||
downsample = nn.Sequential( | |||
nn.AvgPool2d( | |||
kernel_size=stride, | |||
stride=stride, | |||
ceil_mode=True, | |||
count_include_pad=False), | |||
nn.Conv2d( | |||
self.inplanes, | |||
planes * block.expansion, | |||
kernel_size=1, | |||
stride=1, | |||
bias=False), | |||
nn.BatchNorm2d(planes * block.expansion), | |||
) | |||
layers = [] | |||
layers.append( | |||
block( | |||
self.inplanes, | |||
planes, | |||
stride, | |||
downsample=downsample, | |||
stype='stage', | |||
baseWidth=self.baseWidth, | |||
scale=self.scale)) | |||
self.inplanes = planes * block.expansion | |||
for i in range(1, blocks): | |||
layers.append( | |||
block( | |||
self.inplanes, | |||
planes, | |||
baseWidth=self.baseWidth, | |||
scale=self.scale)) | |||
return nn.Sequential(*layers) | |||
def forward(self, x): | |||
x = self.conv1(x) | |||
x = self.bn1(x) | |||
x = self.relu(x) | |||
x = self.maxpool(x) | |||
x = self.layer1(x) | |||
x = self.layer2(x) | |||
x = self.layer3(x) | |||
x = self.layer4(x) | |||
x = self.avgpool(x) | |||
x = x.view(x.size(0), -1) | |||
x = self.fc(x) | |||
return x | |||
def res2net50_v1b_26w_4s(backbone_path, pretrained=False, **kwargs): | |||
"""Constructs a Res2Net-50_v1b_26w_4s lib. | |||
Args: | |||
pretrained (bool): If True, returns a lib pre-trained on ImageNet | |||
""" | |||
model = Res2Net(Bottle2neck, [3, 4, 6, 3], baseWidth=26, scale=4, **kwargs) | |||
if pretrained: | |||
model_state = torch.load(backbone_path) | |||
model.load_state_dict(model_state) | |||
return model |
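A random-weight smoke test of the builder above; with `pretrained=False` the checkpoint load is skipped, so no `backbone_path` is needed:

```python
import torch

model = res2net50_v1b_26w_4s(backbone_path=None, pretrained=False)
model.eval()
with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # torch.Size([1, 1000])
```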
@@ -0,0 +1,6 @@ | |||
# Implementation in this file is modified based on Res2Net-PretrainedModels | |||
# Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License | |||
# publicly available at https://github.com/Res2Net/Res2Net-PretrainedModels/blob/master/res2net_v1b.py | |||
from .Res2Net_v1b import res2net50_v1b_26w_4s | |||
__all__ = ['res2net50_v1b_26w_4s'] |
@@ -0,0 +1,178 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from .utils import ConvBNReLU | |||
class AreaLayer(nn.Module): | |||
def __init__(self, in_channel, out_channel): | |||
super(AreaLayer, self).__init__() | |||
self.lbody = nn.Sequential( | |||
nn.Conv2d(out_channel, out_channel, 1), | |||
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True)) | |||
self.hbody = nn.Sequential( | |||
nn.Conv2d(in_channel, out_channel, 1), nn.BatchNorm2d(out_channel), | |||
nn.ReLU(inplace=True)) | |||
self.body = nn.Sequential( | |||
nn.Conv2d(2 * out_channel, out_channel, 3, 1, 1), | |||
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True), | |||
nn.Conv2d(out_channel, out_channel, 3, 1, 1), | |||
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True), | |||
nn.Conv2d(out_channel, 1, 1)) | |||
def forward(self, xl, xh): | |||
xl1 = self.lbody(xl) | |||
xl1 = F.interpolate( | |||
xl1, size=xh.size()[2:], mode='bilinear', align_corners=True) | |||
xh1 = self.hbody(xh) | |||
x = torch.cat((xl1, xh1), dim=1) | |||
x_out = self.body(x) | |||
return x_out | |||
class EdgeLayer(nn.Module): | |||
def __init__(self, in_channel, out_channel): | |||
super(EdgeLayer, self).__init__() | |||
self.lbody = nn.Sequential( | |||
nn.Conv2d(out_channel, out_channel, 1), | |||
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True)) | |||
self.hbody = nn.Sequential( | |||
nn.Conv2d(in_channel, out_channel, 1), nn.BatchNorm2d(out_channel), | |||
nn.ReLU(inplace=True)) | |||
self.bodye = nn.Sequential( | |||
nn.Conv2d(2 * out_channel, out_channel, 3, 1, 1), | |||
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True), | |||
nn.Conv2d(out_channel, out_channel, 3, 1, 1), | |||
nn.BatchNorm2d(out_channel), nn.ReLU(inplace=True), | |||
nn.Conv2d(out_channel, 1, 1)) | |||
def forward(self, xl, xh): | |||
xl1 = self.lbody(xl) | |||
xh1 = self.hbody(xh) | |||
xh1 = F.interpolate( | |||
xh1, size=xl.size()[2:], mode='bilinear', align_corners=True) | |||
x = torch.cat((xl1, xh1), dim=1) | |||
x_out = self.bodye(x) | |||
return x_out | |||
class EBlock(nn.Module): | |||
def __init__(self, inchs, outchs): | |||
super(EBlock, self).__init__() | |||
self.elayer = nn.Sequential( | |||
ConvBNReLU(inchs + 1, outchs, kernel_size=3, padding=1, stride=1), | |||
ConvBNReLU(outchs, outchs, 1)) | |||
self.salayer = nn.Sequential( | |||
nn.Conv2d(2, 1, 3, 1, 1, bias=False), | |||
nn.BatchNorm2d(1, momentum=0.01), nn.Sigmoid()) | |||
def forward(self, x, edgeAtten): | |||
x = torch.cat((x, edgeAtten), dim=1) | |||
ex = self.elayer(x) | |||
ex_max = torch.max(ex, 1, keepdim=True)[0] | |||
ex_mean = torch.mean(ex, dim=1, keepdim=True) | |||
xei_compress = torch.cat((ex_max, ex_mean), dim=1) | |||
scale = self.salayer(xei_compress) | |||
x_out = ex * scale | |||
return x_out | |||
class StructureE(nn.Module): | |||
def __init__(self, inchs, outchs, EM): | |||
super(StructureE, self).__init__() | |||
self.ne_modules = int(inchs / EM) | |||
NM = int(outchs / self.ne_modules) | |||
elayes = [] | |||
for i in range(self.ne_modules): | |||
emblock = EBlock(EM, NM) | |||
elayes.append(emblock) | |||
self.emlayes = nn.ModuleList(elayes) | |||
self.body = nn.Sequential( | |||
ConvBNReLU(outchs, outchs, 3, 1, 1), ConvBNReLU(outchs, outchs, 1)) | |||
def forward(self, x, edgeAtten): | |||
if edgeAtten.size() != x.size(): | |||
edgeAtten = F.interpolate( | |||
edgeAtten, x.size()[2:], mode='bilinear', align_corners=False) | |||
xx = torch.chunk(x, self.ne_modules, dim=1) | |||
efeas = [] | |||
for i in range(self.ne_modules): | |||
xei = self.emlayes[i](xx[i], edgeAtten) | |||
efeas.append(xei) | |||
efeas = torch.cat(efeas, dim=1) | |||
x_out = self.body(efeas) | |||
return x_out | |||
class ABlock(nn.Module): | |||
def __init__(self, inchs, outchs, k): | |||
super(ABlock, self).__init__() | |||
self.alayer = nn.Sequential( | |||
ConvBNReLU(inchs, outchs, k, 1, k // 2), | |||
ConvBNReLU(outchs, outchs, 1)) | |||
self.arlayer = nn.Sequential( | |||
ConvBNReLU(inchs, outchs, k, 1, k // 2), | |||
ConvBNReLU(outchs, outchs, 1)) | |||
self.fusion = ConvBNReLU(2 * outchs, outchs, 1) | |||
def forward(self, x, areaAtten): | |||
xa = x * areaAtten | |||
xra = x * (1 - areaAtten) | |||
xout = self.fusion(torch.cat((xa, xra), dim=1)) | |||
return xout | |||
class AMFusion(nn.Module): | |||
def __init__(self, inchs, outchs, AM): | |||
super(AMFusion, self).__init__() | |||
self.k = [3, 3, 5, 5] | |||
self.conv_up = ConvBNReLU(inchs, outchs, 3, 1, 1) | |||
self.up = nn.Upsample( | |||
scale_factor=2, mode='bilinear', align_corners=True) | |||
self.na_modules = int(outchs / AM) | |||
alayers = [] | |||
for i in range(self.na_modules): | |||
layer = ABlock(AM, AM, self.k[i]) | |||
alayers.append(layer) | |||
self.alayers = nn.ModuleList(alayers) | |||
self.fusion_0 = ConvBNReLU(outchs, outchs, 3, 1, 1) | |||
self.fusion_e = nn.Sequential( | |||
nn.Conv2d( | |||
outchs, outchs, kernel_size=(3, 1), padding=(1, 0), | |||
bias=False), nn.BatchNorm2d(outchs), nn.ReLU(inplace=True), | |||
nn.Conv2d( | |||
outchs, outchs, kernel_size=(1, 3), padding=(0, 1), | |||
bias=False), nn.BatchNorm2d(outchs), nn.ReLU(inplace=True)) | |||
self.fusion_e1 = nn.Sequential( | |||
nn.Conv2d( | |||
outchs, outchs, kernel_size=(5, 1), padding=(2, 0), | |||
bias=False), nn.BatchNorm2d(outchs), nn.ReLU(inplace=True), | |||
nn.Conv2d( | |||
outchs, outchs, kernel_size=(1, 5), padding=(0, 2), | |||
bias=False), nn.BatchNorm2d(outchs), nn.ReLU(inplace=True)) | |||
self.fusion = ConvBNReLU(3 * outchs, outchs, 1) | |||
def forward(self, xl, xh, xhm): | |||
xh1 = self.up(self.conv_up(xh)) | |||
x = xh1 + xl | |||
xm = self.up(torch.sigmoid(xhm)) | |||
xx = torch.chunk(x, self.na_modules, dim=1) | |||
xxmids = [] | |||
for i in range(self.na_modules): | |||
xi = self.alayers[i](xx[i], xm) | |||
xxmids.append(xi) | |||
xfea = torch.cat(xxmids, dim=1) | |||
x0 = self.fusion_0(xfea) | |||
x1 = self.fusion_e(xfea) | |||
x2 = self.fusion_e1(xfea) | |||
x_out = self.fusion(torch.cat((x0, x1, x2), dim=1)) | |||
return x_out |
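The grouped design of `StructureE` is easiest to see with the configuration SENet uses below: `StructureE(256, 128, 128)` chunks its 256-channel input into 256/128 = 2 groups, each `EBlock` maps 128 (+1 edge-attention) channels down to 128/2 = 64, and the concatenation restores 128. A shape check, assuming the classes above are in scope:

```python
import torch

se = StructureE(inchs=256, outchs=128, EM=128)
x = torch.randn(1, 256, 80, 80)
edge_atten = torch.randn(1, 1, 80, 80)  # single-channel edge attention map
print(se(x, edge_atten).shape)          # torch.Size([1, 128, 80, 80])
```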
@@ -0,0 +1,74 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from .backbone import res2net50_v1b_26w_4s as res2net | |||
from .modules import AMFusion, AreaLayer, EdgeLayer, StructureE | |||
from .utils import ASPP, CBAM, ConvBNReLU | |||
class SENet(nn.Module): | |||
def __init__(self, backbone_path=None, pretrained=False): | |||
super(SENet, self).__init__() | |||
resnet50 = res2net(backbone_path, pretrained) | |||
self.layer0_1 = nn.Sequential(resnet50.conv1, resnet50.bn1, | |||
resnet50.relu) | |||
self.maxpool = resnet50.maxpool | |||
self.layer1 = resnet50.layer1 | |||
self.layer2 = resnet50.layer2 | |||
self.layer3 = resnet50.layer3 | |||
self.layer4 = resnet50.layer4 | |||
self.aspp3 = ASPP(1024, 256) | |||
self.aspp4 = ASPP(2048, 256) | |||
self.cbblock3 = CBAM(inchs=256, kernel_size=5) | |||
self.cbblock4 = CBAM(inchs=256, kernel_size=5) | |||
self.up = nn.Upsample( | |||
mode='bilinear', scale_factor=2, align_corners=False) | |||
self.conv_up = ConvBNReLU(512, 512, 1) | |||
self.aux_edge = EdgeLayer(512, 256) | |||
self.aux_area = AreaLayer(512, 256) | |||
self.layer1_enhance = StructureE(256, 128, 128) | |||
self.layer2_enhance = StructureE(512, 256, 128) | |||
self.layer3_decoder = AMFusion(512, 256, 128) | |||
self.layer2_decoder = AMFusion(256, 128, 128) | |||
self.out_conv_8 = nn.Conv2d(256, 1, 1) | |||
self.out_conv_4 = nn.Conv2d(128, 1, 1) | |||
def forward(self, x): | |||
layer0 = self.layer0_1(x) | |||
layer0s = self.maxpool(layer0) | |||
layer1 = self.layer1(layer0s) | |||
layer2 = self.layer2(layer1) | |||
layer3 = self.layer3(layer2) | |||
layer4 = self.layer4(layer3) | |||
layer3_eh = self.cbblock3(self.aspp3(layer3)) | |||
layer4_eh = self.cbblock4(self.aspp4(layer4)) | |||
layer34 = self.conv_up( | |||
torch.cat((self.up(layer4_eh), layer3_eh), dim=1)) | |||
edge_atten = self.aux_edge(layer1, layer34) | |||
area_atten = self.aux_area(layer1, layer34) | |||
edge_atten_ = torch.sigmoid(edge_atten) | |||
layer1_eh = self.layer1_enhance(layer1, edge_atten_) | |||
layer2_eh = self.layer2_enhance(layer2, edge_atten_) | |||
layer2_fu = self.layer3_decoder(layer2_eh, layer34, area_atten) | |||
out_8 = self.out_conv_8(layer2_fu) | |||
layer1_fu = self.layer2_decoder(layer1_eh, layer2_fu, out_8) | |||
out_4 = self.out_conv_4(layer1_fu) | |||
out_16 = F.interpolate( | |||
area_atten, | |||
size=x.size()[2:], | |||
mode='bilinear', | |||
align_corners=False) | |||
out_8 = F.interpolate( | |||
out_8, size=x.size()[2:], mode='bilinear', align_corners=False) | |||
out_4 = F.interpolate( | |||
out_4, size=x.size()[2:], mode='bilinear', align_corners=False) | |||
edge_out = F.interpolate( | |||
edge_atten_, | |||
size=x.size()[2:], | |||
mode='bilinear', | |||
align_corners=False) | |||
return out_4.sigmoid(), out_8.sigmoid(), out_16.sigmoid(), edge_out |
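A forward-shape check of the `SENet` above with random weights; all four outputs are upsampled to the input resolution and sigmoid-activated (the input side should be divisible by 32 for the feature pyramid):

```python
import torch

net = SENet(backbone_path=None, pretrained=False).eval()
with torch.no_grad():
    out_4, out_8, out_16, edge = net(torch.randn(1, 3, 320, 320))
print(out_4.shape, out_8.shape, out_16.shape, edge.shape)
# each: torch.Size([1, 1, 320, 320])
```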
@@ -0,0 +1,105 @@ | |||
# Implementation in this file is modified based on deeplabv3 | |||
# Originally MIT-licensed, publicly available at https://github.com/fregu856/deeplabv3/blob/master/model/aspp.py | |||
# Implementation in this file is modified based on attention-module | |||
# Originally MIT-licensed, publicly available at https://github.com/Jongchan/attention-module/blob/master/MODELS/cbam.py | |||
import torch | |||
import torch.nn as nn | |||
class ConvBNReLU(nn.Module): | |||
def __init__(self, | |||
inplanes, | |||
planes, | |||
kernel_size=3, | |||
stride=1, | |||
padding=0, | |||
dilation=1, | |||
bias=False): | |||
super(ConvBNReLU, self).__init__() | |||
self.block = nn.Sequential( | |||
nn.Conv2d( | |||
inplanes, | |||
planes, | |||
kernel_size, | |||
stride=stride, | |||
padding=padding, | |||
dilation=dilation, | |||
bias=bias), nn.BatchNorm2d(planes), nn.ReLU(inplace=True)) | |||
def forward(self, x): | |||
return self.block(x) | |||
class ASPP(nn.Module): | |||
def __init__(self, in_dim, out_dim): | |||
super(ASPP, self).__init__() | |||
mid_dim = 128 | |||
self.conv1 = ConvBNReLU(in_dim, mid_dim, kernel_size=1) | |||
self.conv2 = ConvBNReLU( | |||
in_dim, mid_dim, kernel_size=3, padding=2, dilation=2) | |||
self.conv3 = ConvBNReLU( | |||
in_dim, mid_dim, kernel_size=3, padding=5, dilation=5) | |||
self.conv4 = ConvBNReLU( | |||
in_dim, mid_dim, kernel_size=3, padding=7, dilation=7) | |||
self.conv5 = ConvBNReLU(in_dim, mid_dim, kernel_size=1, padding=0) | |||
self.fuse = ConvBNReLU(5 * mid_dim, out_dim, 3, 1, 1) | |||
self.global_pooling = nn.AdaptiveAvgPool2d(1) | |||
def forward(self, x): | |||
conv1 = self.conv1(x) | |||
conv2 = self.conv2(x) | |||
conv3 = self.conv3(x) | |||
conv4 = self.conv4(x) | |||
xg = self.conv5(self.global_pooling(x)) | |||
conv5 = nn.Upsample((x.shape[2], x.shape[3]), mode='nearest')(xg) | |||
return self.fuse(torch.cat((conv1, conv2, conv3, conv4, conv5), 1)) | |||
class ChannelAttention(nn.Module): | |||
def __init__(self, inchs, ratio=16): | |||
super(ChannelAttention, self).__init__() | |||
self.avg_pool = nn.AdaptiveAvgPool2d(1) | |||
self.max_pool = nn.AdaptiveMaxPool2d(1) | |||
self.fc = nn.Sequential( | |||
    nn.Conv2d(inchs, inchs // ratio, 1, bias=False), nn.ReLU(), | |||
    nn.Conv2d(inchs // ratio, inchs, 1, bias=False)) | |||
self.sigmoid = nn.Sigmoid() | |||
def forward(self, x): | |||
avg_out = self.fc(self.avg_pool(x)) | |||
max_out = self.fc(self.max_pool(x)) | |||
out = avg_out + max_out | |||
return self.sigmoid(out) | |||
class SpatialAttention(nn.Module): | |||
def __init__(self, kernel_size=7): | |||
super(SpatialAttention, self).__init__() | |||
self.conv1 = nn.Conv2d( | |||
2, 1, kernel_size, padding=kernel_size // 2, bias=False) | |||
self.sigmoid = nn.Sigmoid() | |||
def forward(self, x): | |||
avg_out = torch.mean(x, dim=1, keepdim=True) | |||
max_out, _ = torch.max(x, dim=1, keepdim=True) | |||
x = torch.cat([avg_out, max_out], dim=1) | |||
x = self.conv1(x) | |||
return self.sigmoid(x) | |||
class CBAM(nn.Module): | |||
def __init__(self, inchs, kernel_size=7): | |||
super().__init__() | |||
self.calayer = ChannelAttention(inchs=inchs) | |||
self.saLayer = SpatialAttention(kernel_size=kernel_size) | |||
def forward(self, x): | |||
xca = self.calayer(x) * x | |||
xsa = self.saLayer(xca) * xca | |||
return xsa |
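ASPP maps its input to `out_dim` channels at the same resolution, and CBAM preserves its input shape entirely, which is what lets `SENet` drop them in after the backbone stages. A minimal check, assuming the classes above are in scope:

```python
import torch

aspp = ASPP(in_dim=1024, out_dim=256)
cbam = CBAM(inchs=256, kernel_size=5)
print(aspp(torch.randn(2, 1024, 20, 20)).shape)  # torch.Size([2, 256, 20, 20])
print(cbam(torch.randn(2, 256, 20, 20)).shape)   # torch.Size([2, 256, 20, 20])
```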
@@ -2,7 +2,6 @@ | |||
import os.path as osp | |||
import cv2 | |||
import numpy as np | |||
import torch | |||
from PIL import Image | |||
from torchvision import transforms | |||
@@ -10,8 +9,9 @@ from torchvision import transforms | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base.base_torch_model import TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.config import Config | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from .models import U2NET | |||
from .models import U2NET, SENet | |||
@MODELS.register_module( | |||
@@ -22,13 +22,25 @@ class SalientDetection(TorchModel): | |||
"""str -- model file root.""" | |||
super().__init__(model_dir, *args, **kwargs) | |||
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
self.model = U2NET(3, 1) | |||
self.norm_mean = [0.485, 0.456, 0.406] | |||
self.norm_std = [0.229, 0.224, 0.225] | |||
self.norm_size = (320, 320) | |||
config_path = osp.join(model_dir, 'config.py') | |||
if not osp.exists(config_path): | |||
self.model = U2NET(3, 1) | |||
else: | |||
self.model = SENet(backbone_path=None, pretrained=False) | |||
config = Config.from_file(config_path) | |||
self.norm_mean = config.norm_mean | |||
self.norm_std = config.norm_std | |||
self.norm_size = config.norm_size | |||
checkpoint = torch.load(model_path, map_location='cpu') | |||
self.transform_input = transforms.Compose([ | |||
transforms.Resize((320, 320)), | |||
transforms.Resize(self.norm_size), | |||
transforms.ToTensor(), | |||
transforms.Normalize( | |||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |||
transforms.Normalize(mean=self.norm_mean, std=self.norm_std) | |||
]) | |||
self.model.load_state_dict(checkpoint) | |||
self.model.eval() | |||
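The fallback keeps the previously hard-coded U2NET defaults, which are the standard ImageNet normalization statistics; models that ship a `config.py` override them. A standalone version of the resulting transform for reference (the image path is hypothetical):

```python
from PIL import Image
from torchvision import transforms

transform_input = transforms.Compose([
    transforms.Resize((320, 320)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
# batch = transform_input(Image.open('demo.jpg').convert('RGB')).unsqueeze(0)
```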
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
from typing import TYPE_CHECKING | |||
@@ -1,10 +1,11 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import copy | |||
from .darknet import CSPDarknet | |||
from .tinynas import load_tinynas_net | |||
from .tinynas_csp import load_tinynas_net as load_tinynas_net_csp | |||
from .tinynas_res import load_tinynas_net as load_tinynas_net_res | |||
def build_backbone(cfg): | |||
@@ -12,5 +13,7 @@ def build_backbone(cfg): | |||
name = backbone_cfg.pop('name') | |||
if name == 'CSPDarknet': | |||
return CSPDarknet(**backbone_cfg) | |||
elif name == 'TinyNAS': | |||
return load_tinynas_net(backbone_cfg) | |||
elif name == 'TinyNAS_csp': | |||
return load_tinynas_net_csp(backbone_cfg) | |||
elif name == 'TinyNAS_res': | |||
return load_tinynas_net_res(backbone_cfg) |
@@ -1,12 +1,11 @@ | |||
# Copyright (c) Megvii Inc. All rights reserved. | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
import torch | |||
from torch import nn | |||
from ..core.base_ops import (BaseConv, CSPLayer, DWConv, Focus, ResLayer, | |||
SPPBottleneck) | |||
from modelscope.models.cv.tinynas_detection.core.base_ops import ( | |||
BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck) | |||
class CSPDarknet(nn.Module): | |||
@@ -1,359 +0,0 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
import torch | |||
import torch.nn as nn | |||
from modelscope.utils.file_utils import read_file | |||
from ..core.base_ops import Focus, SPPBottleneck, get_activation | |||
from ..core.repvgg_block import RepVggBlock | |||
class ConvKXBN(nn.Module): | |||
def __init__(self, in_c, out_c, kernel_size, stride): | |||
super(ConvKXBN, self).__init__() | |||
self.conv1 = nn.Conv2d( | |||
in_c, | |||
out_c, | |||
kernel_size, | |||
stride, (kernel_size - 1) // 2, | |||
groups=1, | |||
bias=False) | |||
self.bn1 = nn.BatchNorm2d(out_c) | |||
def forward(self, x): | |||
return self.bn1(self.conv1(x)) | |||
class ConvKXBNRELU(nn.Module): | |||
def __init__(self, in_c, out_c, kernel_size, stride, act='silu'): | |||
super(ConvKXBNRELU, self).__init__() | |||
self.conv = ConvKXBN(in_c, out_c, kernel_size, stride) | |||
if act is None: | |||
self.activation_function = torch.relu | |||
else: | |||
self.activation_function = get_activation(act) | |||
def forward(self, x): | |||
output = self.conv(x) | |||
return self.activation_function(output) | |||
class ResConvK1KX(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
force_resproj=False, | |||
act='silu', | |||
reparam=False): | |||
super(ResConvK1KX, self).__init__() | |||
self.stride = stride | |||
self.conv1 = ConvKXBN(in_c, btn_c, 1, 1) | |||
if not reparam: | |||
self.conv2 = ConvKXBN(btn_c, out_c, 3, stride) | |||
else: | |||
self.conv2 = RepVggBlock( | |||
btn_c, out_c, kernel_size, stride, act='identity') | |||
if act is None: | |||
self.activation_function = torch.relu | |||
else: | |||
self.activation_function = get_activation(act) | |||
if stride == 2: | |||
self.residual_downsample = nn.AvgPool2d(kernel_size=2, stride=2) | |||
else: | |||
self.residual_downsample = nn.Identity() | |||
if in_c != out_c or force_resproj: | |||
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1) | |||
else: | |||
self.residual_proj = nn.Identity() | |||
def forward(self, x): | |||
if self.stride != 2: | |||
reslink = self.residual_downsample(x) | |||
reslink = self.residual_proj(reslink) | |||
output = x | |||
output = self.conv1(output) | |||
output = self.activation_function(output) | |||
output = self.conv2(output) | |||
if self.stride != 2: | |||
output = output + reslink | |||
output = self.activation_function(output) | |||
return output | |||
class SuperResConvK1KX(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
num_blocks, | |||
with_spp=False, | |||
act='silu', | |||
reparam=False): | |||
super(SuperResConvK1KX, self).__init__() | |||
if act is None: | |||
self.act = torch.relu | |||
else: | |||
self.act = get_activation(act) | |||
self.block_list = nn.ModuleList() | |||
for block_id in range(num_blocks): | |||
if block_id == 0: | |||
in_channels = in_c | |||
out_channels = out_c | |||
this_stride = stride | |||
force_resproj = False  # not needed when used as part of a CSPLayer | |||
this_kernel_size = kernel_size | |||
else: | |||
in_channels = out_c | |||
out_channels = out_c | |||
this_stride = 1 | |||
force_resproj = False | |||
this_kernel_size = kernel_size | |||
the_block = ResConvK1KX( | |||
in_channels, | |||
out_channels, | |||
btn_c, | |||
this_kernel_size, | |||
this_stride, | |||
force_resproj, | |||
act=act, | |||
reparam=reparam) | |||
self.block_list.append(the_block) | |||
if block_id == 0 and with_spp: | |||
self.block_list.append( | |||
SPPBottleneck(out_channels, out_channels)) | |||
def forward(self, x): | |||
output = x | |||
for block in self.block_list: | |||
output = block(output) | |||
return output | |||
class ResConvKXKX(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
force_resproj=False, | |||
act='silu'): | |||
super(ResConvKXKX, self).__init__() | |||
self.stride = stride | |||
if self.stride == 2: | |||
self.downsampler = ConvKXBNRELU(in_c, out_c, 3, 2, act=act) | |||
else: | |||
self.conv1 = ConvKXBN(in_c, btn_c, kernel_size, 1) | |||
self.conv2 = RepVggBlock( | |||
btn_c, out_c, kernel_size, stride, act='identity') | |||
if act is None: | |||
self.activation_function = torch.relu | |||
else: | |||
self.activation_function = get_activation(act) | |||
if stride == 2: | |||
self.residual_downsample = nn.AvgPool2d( | |||
kernel_size=2, stride=2) | |||
else: | |||
self.residual_downsample = nn.Identity() | |||
if in_c != out_c or force_resproj: | |||
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1) | |||
else: | |||
self.residual_proj = nn.Identity() | |||
def forward(self, x): | |||
if self.stride == 2: | |||
return self.downsampler(x) | |||
reslink = self.residual_downsample(x) | |||
reslink = self.residual_proj(reslink) | |||
output = x | |||
output = self.conv1(output) | |||
output = self.activation_function(output) | |||
output = self.conv2(output) | |||
output = output + reslink | |||
output = self.activation_function(output) | |||
return output | |||
class SuperResConvKXKX(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
num_blocks, | |||
with_spp=False, | |||
act='silu'): | |||
super(SuperResConvKXKX, self).__init__() | |||
if act is None: | |||
self.act = torch.relu | |||
else: | |||
self.act = get_activation(act) | |||
self.block_list = nn.ModuleList() | |||
for block_id in range(num_blocks): | |||
if block_id == 0: | |||
in_channels = in_c | |||
out_channels = out_c | |||
this_stride = stride | |||
force_resproj = False  # not needed when used as part of a CSPLayer | |||
this_kernel_size = kernel_size | |||
else: | |||
in_channels = out_c | |||
out_channels = out_c | |||
this_stride = 1 | |||
force_resproj = False | |||
this_kernel_size = kernel_size | |||
the_block = ResConvKXKX( | |||
in_channels, | |||
out_channels, | |||
btn_c, | |||
this_kernel_size, | |||
this_stride, | |||
force_resproj, | |||
act=act) | |||
self.block_list.append(the_block) | |||
if block_id == 0 and with_spp: | |||
self.block_list.append( | |||
SPPBottleneck(out_channels, out_channels)) | |||
def forward(self, x): | |||
output = x | |||
for block in self.block_list: | |||
output = block(output) | |||
return output | |||
class TinyNAS(nn.Module): | |||
def __init__(self, | |||
structure_info=None, | |||
out_indices=[0, 1, 2, 4, 5], | |||
out_channels=[None, None, 128, 256, 512], | |||
with_spp=False, | |||
use_focus=False, | |||
need_conv1=True, | |||
act='silu', | |||
reparam=False): | |||
super(TinyNAS, self).__init__() | |||
assert len(out_indices) == len(out_channels) | |||
self.out_indices = out_indices | |||
self.need_conv1 = need_conv1 | |||
self.block_list = nn.ModuleList() | |||
if need_conv1: | |||
self.conv1_list = nn.ModuleList() | |||
for idx, block_info in enumerate(structure_info): | |||
the_block_class = block_info['class'] | |||
if the_block_class == 'ConvKXBNRELU': | |||
if use_focus: | |||
the_block = Focus( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['k'], | |||
act=act) | |||
else: | |||
the_block = ConvKXBNRELU( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['k'], | |||
block_info['s'], | |||
act=act) | |||
self.block_list.append(the_block) | |||
elif the_block_class == 'SuperResConvK1KX': | |||
spp = with_spp if idx == len(structure_info) - 1 else False | |||
the_block = SuperResConvK1KX( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['btn'], | |||
block_info['k'], | |||
block_info['s'], | |||
block_info['L'], | |||
spp, | |||
act=act, | |||
reparam=reparam) | |||
self.block_list.append(the_block) | |||
elif the_block_class == 'SuperResConvKXKX': | |||
spp = with_spp if idx == len(structure_info) - 1 else False | |||
the_block = SuperResConvKXKX( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['btn'], | |||
block_info['k'], | |||
block_info['s'], | |||
block_info['L'], | |||
spp, | |||
act=act) | |||
self.block_list.append(the_block) | |||
if need_conv1: | |||
if idx in self.out_indices and out_channels[ | |||
self.out_indices.index(idx)] is not None: | |||
self.conv1_list.append( | |||
nn.Conv2d(block_info['out'], | |||
out_channels[self.out_indices.index(idx)], | |||
1)) | |||
else: | |||
self.conv1_list.append(None) | |||
def init_weights(self, pretrain=None): | |||
pass | |||
def forward(self, x): | |||
output = x | |||
stage_feature_list = [] | |||
for idx, block in enumerate(self.block_list): | |||
output = block(output) | |||
if idx in self.out_indices: | |||
if self.need_conv1 and self.conv1_list[idx] is not None: | |||
true_out = self.conv1_list[idx](output) | |||
stage_feature_list.append(true_out) | |||
else: | |||
stage_feature_list.append(output) | |||
return stage_feature_list | |||
def load_tinynas_net(backbone_cfg): | |||
# parse the master-net structure file and build the TinyNAS backbone | |||
import ast | |||
net_structure_str = read_file(backbone_cfg.structure_file) | |||
struct_str = ''.join([x.strip() for x in net_structure_str]) | |||
struct_info = ast.literal_eval(struct_str) | |||
for layer in struct_info: | |||
if 'nbitsA' in layer: | |||
del layer['nbitsA'] | |||
if 'nbitsW' in layer: | |||
del layer['nbitsW'] | |||
model = TinyNAS( | |||
structure_info=struct_info, | |||
out_indices=backbone_cfg.out_indices, | |||
out_channels=backbone_cfg.out_channels, | |||
with_spp=backbone_cfg.with_spp, | |||
use_focus=backbone_cfg.use_focus, | |||
act=backbone_cfg.act, | |||
need_conv1=backbone_cfg.need_conv1, | |||
reparam=backbone_cfg.reparam) | |||
return model |
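`load_tinynas_net` expects the structure file to hold a Python literal: a list of block dicts with the keys consumed by `TinyNAS` above (`class`, `in`, `out`, `k`, `s`, plus `btn` and `L` for the super-res blocks); the quantization fields `nbitsA`/`nbitsW` are discarded if present. A sketch of parsing such a file (the concrete block values are made up):

```python
import ast

net_structure_str = """[
    {'class': 'ConvKXBNRELU', 'in': 3, 'out': 32, 'k': 3, 's': 2},
    {'class': 'SuperResConvK1KX', 'in': 32, 'out': 64, 'btn': 32,
     'k': 3, 's': 2, 'L': 1, 'nbitsA': 8, 'nbitsW': 8},
]"""
struct_info = ast.literal_eval(net_structure_str)
for layer in struct_info:
    layer.pop('nbitsA', None)  # quantization metadata, unused at inference
    layer.pop('nbitsW', None)
print([layer['class'] for layer in struct_info])
# ['ConvKXBNRELU', 'SuperResConvK1KX']
```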
@@ -0,0 +1,295 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors, and available | |||
# at https://github.com/tinyvision/damo-yolo. | |||
import torch | |||
import torch.nn as nn | |||
from modelscope.models.cv.tinynas_detection.core.ops import (Focus, RepConv, | |||
SPPBottleneck, | |||
get_activation) | |||
from modelscope.utils.file_utils import read_file | |||
class ConvKXBN(nn.Module): | |||
def __init__(self, in_c, out_c, kernel_size, stride): | |||
super(ConvKXBN, self).__init__() | |||
self.conv1 = nn.Conv2d( | |||
in_c, | |||
out_c, | |||
kernel_size, | |||
stride, (kernel_size - 1) // 2, | |||
groups=1, | |||
bias=False) | |||
self.bn1 = nn.BatchNorm2d(out_c) | |||
def forward(self, x): | |||
return self.bn1(self.conv1(x)) | |||
class ConvKXBNRELU(nn.Module): | |||
def __init__(self, in_c, out_c, kernel_size, stride, act='silu'): | |||
super(ConvKXBNRELU, self).__init__() | |||
self.conv = ConvKXBN(in_c, out_c, kernel_size, stride) | |||
if act is None: | |||
self.activation_function = torch.relu | |||
else: | |||
self.activation_function = get_activation(act) | |||
def forward(self, x): | |||
output = self.conv(x) | |||
return self.activation_function(output) | |||
class ResConvBlock(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
act='silu', | |||
reparam=False, | |||
block_type='k1kx'): | |||
super(ResConvBlock, self).__init__() | |||
self.stride = stride | |||
if block_type == 'k1kx': | |||
self.conv1 = ConvKXBN(in_c, btn_c, kernel_size=1, stride=1) | |||
else: | |||
self.conv1 = ConvKXBN( | |||
in_c, btn_c, kernel_size=kernel_size, stride=1) | |||
if not reparam: | |||
self.conv2 = ConvKXBN(btn_c, out_c, kernel_size, stride) | |||
else: | |||
self.conv2 = RepConv( | |||
btn_c, out_c, kernel_size, stride, act='identity') | |||
self.activation_function = get_activation(act) | |||
if in_c != out_c and stride != 2: | |||
self.residual_proj = ConvKXBN(in_c, out_c, kernel_size=1, stride=1) | |||
else: | |||
self.residual_proj = None | |||
def forward(self, x): | |||
if self.residual_proj is not None: | |||
reslink = self.residual_proj(x) | |||
else: | |||
reslink = x | |||
x = self.conv1(x) | |||
x = self.activation_function(x) | |||
x = self.conv2(x) | |||
if self.stride != 2: | |||
x = x + reslink | |||
x = self.activation_function(x) | |||
return x | |||
class CSPStem(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
stride, | |||
kernel_size, | |||
num_blocks, | |||
act='silu', | |||
reparam=False, | |||
block_type='k1kx'): | |||
super(CSPStem, self).__init__() | |||
self.in_channels = in_c | |||
self.out_channels = out_c | |||
self.stride = stride | |||
if self.stride == 2: | |||
self.num_blocks = num_blocks - 1 | |||
else: | |||
self.num_blocks = num_blocks | |||
self.kernel_size = kernel_size | |||
self.block_type = block_type | |||
out_c = out_c // 2 | |||
if act is None: | |||
self.act = torch.relu | |||
else: | |||
self.act = get_activation(act) | |||
self.block_list = nn.ModuleList() | |||
for block_id in range(self.num_blocks): | |||
if self.stride == 1 and block_id == 0: | |||
in_c = in_c // 2 | |||
else: | |||
in_c = out_c | |||
the_block = ResConvBlock( | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride=1, | |||
act=act, | |||
reparam=reparam, | |||
block_type=block_type) | |||
self.block_list.append(the_block) | |||
def forward(self, x): | |||
output = x | |||
for block in self.block_list: | |||
output = block(output) | |||
return output | |||
class TinyNAS(nn.Module): | |||
def __init__(self, | |||
structure_info=None, | |||
out_indices=[2, 3, 4], | |||
with_spp=False, | |||
use_focus=False, | |||
act='silu', | |||
reparam=False): | |||
super(TinyNAS, self).__init__() | |||
self.out_indices = out_indices | |||
self.block_list = nn.ModuleList() | |||
self.stride_list = [] | |||
for idx, block_info in enumerate(structure_info): | |||
the_block_class = block_info['class'] | |||
if the_block_class == 'ConvKXBNRELU': | |||
if use_focus and idx == 0: | |||
the_block = Focus( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['k'], | |||
act=act) | |||
else: | |||
the_block = ConvKXBNRELU( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['k'], | |||
block_info['s'], | |||
act=act) | |||
elif the_block_class == 'SuperResConvK1KX': | |||
the_block = CSPStem( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['btn'], | |||
block_info['s'], | |||
block_info['k'], | |||
block_info['L'], | |||
act=act, | |||
reparam=reparam, | |||
block_type='k1kx') | |||
elif the_block_class == 'SuperResConvKXKX': | |||
the_block = CSPStem( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['btn'], | |||
block_info['s'], | |||
block_info['k'], | |||
block_info['L'], | |||
act=act, | |||
reparam=reparam, | |||
block_type='kxkx') | |||
else: | |||
raise NotImplementedError | |||
self.block_list.append(the_block) | |||
self.csp_stage = nn.ModuleList() | |||
self.csp_stage.append(self.block_list[0]) | |||
self.csp_stage.append(CSPWrapper(self.block_list[1])) | |||
self.csp_stage.append(CSPWrapper(self.block_list[2])) | |||
self.csp_stage.append( | |||
CSPWrapper((self.block_list[3], self.block_list[4]))) | |||
self.csp_stage.append( | |||
CSPWrapper(self.block_list[5], with_spp=with_spp)) | |||
del self.block_list | |||
def init_weights(self, pretrain=None): | |||
pass | |||
def forward(self, x): | |||
output = x | |||
stage_feature_list = [] | |||
for idx, block in enumerate(self.csp_stage): | |||
output = block(output) | |||
if idx in self.out_indices: | |||
stage_feature_list.append(output) | |||
return stage_feature_list | |||
class CSPWrapper(nn.Module): | |||
def __init__(self, convstem, act='relu', reparam=False, with_spp=False): | |||
super(CSPWrapper, self).__init__() | |||
self.with_spp = with_spp | |||
if isinstance(convstem, tuple): | |||
in_c = convstem[0].in_channels | |||
out_c = convstem[-1].out_channels | |||
hidden_dim = convstem[0].out_channels // 2 | |||
_convstem = nn.ModuleList() | |||
for modulelist in convstem: | |||
for layer in modulelist.block_list: | |||
_convstem.append(layer) | |||
else: | |||
in_c = convstem.in_channels | |||
out_c = convstem.out_channels | |||
hidden_dim = out_c // 2 | |||
_convstem = convstem.block_list | |||
self.convstem = nn.ModuleList() | |||
for layer in _convstem: | |||
self.convstem.append(layer) | |||
self.act = get_activation(act) | |||
self.downsampler = ConvKXBNRELU( | |||
in_c, hidden_dim * 2, 3, 2, act=self.act) | |||
if self.with_spp: | |||
self.spp = SPPBottleneck(hidden_dim * 2, hidden_dim * 2) | |||
if len(self.convstem) > 0: | |||
self.conv_start = ConvKXBNRELU( | |||
hidden_dim * 2, hidden_dim, 1, 1, act=self.act) | |||
self.conv_shortcut = ConvKXBNRELU( | |||
hidden_dim * 2, out_c // 2, 1, 1, act=self.act) | |||
self.conv_fuse = ConvKXBNRELU(out_c, out_c, 1, 1, act=self.act) | |||
def forward(self, x): | |||
x = self.downsampler(x) | |||
if self.with_spp: | |||
x = self.spp(x) | |||
if len(self.convstem) > 0: | |||
shortcut = self.conv_shortcut(x) | |||
x = self.conv_start(x) | |||
for block in self.convstem: | |||
x = block(x) | |||
x = torch.cat((x, shortcut), dim=1) | |||
x = self.conv_fuse(x) | |||
return x | |||
def load_tinynas_net(backbone_cfg): | |||
# parse the master-net structure file and build the TinyNAS backbone | |||
import ast | |||
net_structure_str = read_file(backbone_cfg.structure_file) | |||
struct_str = ''.join([x.strip() for x in net_structure_str]) | |||
struct_info = ast.literal_eval(struct_str) | |||
for layer in struct_info: | |||
if 'nbitsA' in layer: | |||
del layer['nbitsA'] | |||
if 'nbitsW' in layer: | |||
del layer['nbitsW'] | |||
model = TinyNAS( | |||
structure_info=struct_info, | |||
out_indices=backbone_cfg.out_indices, | |||
with_spp=backbone_cfg.with_spp, | |||
use_focus=backbone_cfg.use_focus, | |||
act=backbone_cfg.act, | |||
reparam=backbone_cfg.reparam) | |||
return model |
@@ -0,0 +1,238 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors, and available | |||
# at https://github.com/tinyvision/damo-yolo. | |||
import torch | |||
import torch.nn as nn | |||
from modelscope.models.cv.tinynas_detection.core.ops import (Focus, RepConv, | |||
SPPBottleneck, | |||
get_activation) | |||
from modelscope.utils.file_utils import read_file | |||
class ConvKXBN(nn.Module): | |||
def __init__(self, in_c, out_c, kernel_size, stride): | |||
super(ConvKXBN, self).__init__() | |||
self.conv1 = nn.Conv2d( | |||
in_c, | |||
out_c, | |||
kernel_size, | |||
stride, (kernel_size - 1) // 2, | |||
groups=1, | |||
bias=False) | |||
self.bn1 = nn.BatchNorm2d(out_c) | |||
def forward(self, x): | |||
return self.bn1(self.conv1(x)) | |||
class ConvKXBNRELU(nn.Module): | |||
def __init__(self, in_c, out_c, kernel_size, stride, act='silu'): | |||
super(ConvKXBNRELU, self).__init__() | |||
self.conv = ConvKXBN(in_c, out_c, kernel_size, stride) | |||
if act is None: | |||
self.activation_function = torch.relu | |||
else: | |||
self.activation_function = get_activation(act) | |||
def forward(self, x): | |||
output = self.conv(x) | |||
return self.activation_function(output) | |||
class ResConvBlock(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
act='silu', | |||
reparam=False, | |||
block_type='k1kx'): | |||
super(ResConvBlock, self).__init__() | |||
self.stride = stride | |||
if block_type == 'k1kx': | |||
self.conv1 = ConvKXBN(in_c, btn_c, kernel_size=1, stride=1) | |||
else: | |||
self.conv1 = ConvKXBN( | |||
in_c, btn_c, kernel_size=kernel_size, stride=1) | |||
if not reparam: | |||
self.conv2 = ConvKXBN(btn_c, out_c, kernel_size, stride) | |||
else: | |||
self.conv2 = RepConv( | |||
btn_c, out_c, kernel_size, stride, act='identity') | |||
self.activation_function = get_activation(act) | |||
if in_c != out_c and stride != 2: | |||
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1) | |||
else: | |||
self.residual_proj = None | |||
def forward(self, x): | |||
if self.residual_proj is not None: | |||
reslink = self.residual_proj(x) | |||
else: | |||
reslink = x | |||
x = self.conv1(x) | |||
x = self.activation_function(x) | |||
x = self.conv2(x) | |||
if self.stride != 2: | |||
x = x + reslink | |||
x = self.activation_function(x) | |||
return x | |||
class SuperResStem(nn.Module): | |||
def __init__(self, | |||
in_c, | |||
out_c, | |||
btn_c, | |||
kernel_size, | |||
stride, | |||
num_blocks, | |||
with_spp=False, | |||
act='silu', | |||
reparam=False, | |||
block_type='k1kx'): | |||
super(SuperResStem, self).__init__() | |||
if act is None: | |||
self.act = torch.relu | |||
else: | |||
self.act = get_activation(act) | |||
self.block_list = nn.ModuleList() | |||
for block_id in range(num_blocks): | |||
if block_id == 0: | |||
in_channels = in_c | |||
out_channels = out_c | |||
this_stride = stride | |||
this_kernel_size = kernel_size | |||
else: | |||
in_channels = out_c | |||
out_channels = out_c | |||
this_stride = 1 | |||
this_kernel_size = kernel_size | |||
the_block = ResConvBlock( | |||
in_channels, | |||
out_channels, | |||
btn_c, | |||
this_kernel_size, | |||
this_stride, | |||
act=act, | |||
reparam=reparam, | |||
block_type=block_type) | |||
self.block_list.append(the_block) | |||
if block_id == 0 and with_spp: | |||
self.block_list.append( | |||
SPPBottleneck(out_channels, out_channels)) | |||
def forward(self, x): | |||
output = x | |||
for block in self.block_list: | |||
output = block(output) | |||
return output | |||
class TinyNAS(nn.Module): | |||
def __init__(self, | |||
structure_info=None, | |||
out_indices=[2, 4, 5], | |||
with_spp=False, | |||
use_focus=False, | |||
act='silu', | |||
reparam=False): | |||
super(TinyNAS, self).__init__() | |||
self.out_indices = out_indices | |||
self.block_list = nn.ModuleList() | |||
for idx, block_info in enumerate(structure_info): | |||
the_block_class = block_info['class'] | |||
if the_block_class == 'ConvKXBNRELU': | |||
if use_focus: | |||
the_block = Focus( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['k'], | |||
act=act) | |||
else: | |||
the_block = ConvKXBNRELU( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['k'], | |||
block_info['s'], | |||
act=act) | |||
self.block_list.append(the_block) | |||
elif the_block_class == 'SuperResConvK1KX': | |||
spp = with_spp if idx == len(structure_info) - 1 else False | |||
the_block = SuperResStem( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['btn'], | |||
block_info['k'], | |||
block_info['s'], | |||
block_info['L'], | |||
spp, | |||
act=act, | |||
reparam=reparam, | |||
block_type='k1kx') | |||
self.block_list.append(the_block) | |||
elif the_block_class == 'SuperResConvKXKX': | |||
spp = with_spp if idx == len(structure_info) - 1 else False | |||
the_block = SuperResStem( | |||
block_info['in'], | |||
block_info['out'], | |||
block_info['btn'], | |||
block_info['k'], | |||
block_info['s'], | |||
block_info['L'], | |||
spp, | |||
act=act, | |||
reparam=reparam, | |||
block_type='kxkx') | |||
self.block_list.append(the_block) | |||
else: | |||
raise NotImplementedError | |||
def init_weights(self, pretrain=None): | |||
pass | |||
def forward(self, x): | |||
output = x | |||
stage_feature_list = [] | |||
for idx, block in enumerate(self.block_list): | |||
output = block(output) | |||
if idx in self.out_indices: | |||
stage_feature_list.append(output) | |||
return stage_feature_list | |||
def load_tinynas_net(backbone_cfg): | |||
# parse the master-net structure file and build the TinyNAS backbone | |||
import ast | |||
net_structure_str = read_file(backbone_cfg.structure_file) | |||
struct_str = ''.join([x.strip() for x in net_structure_str]) | |||
struct_info = ast.literal_eval(struct_str) | |||
for layer in struct_info: | |||
if 'nbitsA' in layer: | |||
del layer['nbitsA'] | |||
if 'nbitsW' in layer: | |||
del layer['nbitsW'] | |||
model = TinyNAS( | |||
structure_info=struct_info, | |||
out_indices=backbone_cfg.out_indices, | |||
with_spp=backbone_cfg.with_spp, | |||
use_focus=backbone_cfg.use_focus, | |||
act=backbone_cfg.act, | |||
reparam=backbone_cfg.reparam) | |||
return model |
@@ -1,2 +1,2 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. |
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import math | |||
import torch | |||
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import numpy as np | |||
import torch | |||
@@ -0,0 +1,435 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
class SiLU(nn.Module): | |||
"""export-friendly version of nn.SiLU()""" | |||
@staticmethod | |||
def forward(x): | |||
return x * torch.sigmoid(x) | |||
class Swish(nn.Module): | |||
def __init__(self, inplace=True): | |||
super(Swish, self).__init__() | |||
self.inplace = inplace | |||
def forward(self, x): | |||
    if self.inplace: | |||
        x.mul_(torch.sigmoid(x)) | |||
        return x | |||
    else: | |||
        return x * torch.sigmoid(x) | |||
def get_activation(name='silu', inplace=True): | |||
if name is None: | |||
return nn.Identity() | |||
if isinstance(name, str): | |||
if name == 'silu': | |||
module = nn.SiLU(inplace=inplace) | |||
elif name == 'relu': | |||
module = nn.ReLU(inplace=inplace) | |||
elif name == 'lrelu': | |||
module = nn.LeakyReLU(0.1, inplace=inplace) | |||
elif name == 'swish': | |||
module = Swish(inplace=inplace) | |||
elif name == 'hardsigmoid': | |||
module = nn.Hardsigmoid(inplace=inplace) | |||
elif name == 'identity': | |||
module = nn.Identity() | |||
else: | |||
raise AttributeError('Unsupported act type: {}'.format(name)) | |||
return module | |||
elif isinstance(name, nn.Module): | |||
return name | |||
else: | |||
raise AttributeError('Unsupported act type: {}'.format(name)) | |||
def get_norm(name, out_channels, inplace=True): | |||
if name == 'bn': | |||
module = nn.BatchNorm2d(out_channels) | |||
else: | |||
raise NotImplementedError | |||
return module | |||
class ConvBNAct(nn.Module): | |||
"""A Conv2d -> Batchnorm -> silu/leaky relu block""" | |||
def __init__( | |||
self, | |||
in_channels, | |||
out_channels, | |||
ksize, | |||
stride=1, | |||
groups=1, | |||
bias=False, | |||
act='silu', | |||
norm='bn', | |||
reparam=False, | |||
): | |||
super().__init__() | |||
# same padding | |||
pad = (ksize - 1) // 2 | |||
self.conv = nn.Conv2d( | |||
in_channels, | |||
out_channels, | |||
kernel_size=ksize, | |||
stride=stride, | |||
padding=pad, | |||
groups=groups, | |||
bias=bias, | |||
) | |||
if norm is not None: | |||
self.bn = get_norm(norm, out_channels, inplace=True) | |||
if act is not None: | |||
self.act = get_activation(act, inplace=True) | |||
self.with_norm = norm is not None | |||
self.with_act = act is not None | |||
def forward(self, x): | |||
x = self.conv(x) | |||
if self.with_norm: | |||
x = self.bn(x) | |||
if self.with_act: | |||
x = self.act(x) | |||
return x | |||
def fuseforward(self, x): | |||
return self.act(self.conv(x)) | |||
class SPPBottleneck(nn.Module): | |||
"""Spatial pyramid pooling layer used in YOLOv3-SPP""" | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
kernel_sizes=(5, 9, 13), | |||
activation='silu'): | |||
super().__init__() | |||
hidden_channels = in_channels // 2 | |||
self.conv1 = ConvBNAct( | |||
in_channels, hidden_channels, 1, stride=1, act=activation) | |||
self.m = nn.ModuleList([ | |||
nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) | |||
for ks in kernel_sizes | |||
]) | |||
conv2_channels = hidden_channels * (len(kernel_sizes) + 1) | |||
self.conv2 = ConvBNAct( | |||
conv2_channels, out_channels, 1, stride=1, act=activation) | |||
def forward(self, x): | |||
x = self.conv1(x) | |||
x = torch.cat([x] + [m(x) for m in self.m], dim=1) | |||
x = self.conv2(x) | |||
return x | |||
class Focus(nn.Module): | |||
"""Focus width and height information into channel space.""" | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
ksize=1, | |||
stride=1, | |||
act='silu'): | |||
super().__init__() | |||
self.conv = ConvBNAct( | |||
in_channels * 4, out_channels, ksize, stride, act=act) | |||
def forward(self, x): | |||
# shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) | |||
patch_top_left = x[..., ::2, ::2] | |||
patch_top_right = x[..., ::2, 1::2] | |||
patch_bot_left = x[..., 1::2, ::2] | |||
patch_bot_right = x[..., 1::2, 1::2] | |||
x = torch.cat( | |||
( | |||
patch_top_left, | |||
patch_bot_left, | |||
patch_top_right, | |||
patch_bot_right, | |||
), | |||
dim=1, | |||
) | |||
return self.conv(x) | |||
class BasicBlock_3x3_Reverse(nn.Module): | |||
def __init__(self, | |||
ch_in, | |||
ch_hidden_ratio, | |||
ch_out, | |||
act='relu', | |||
shortcut=True): | |||
super(BasicBlock_3x3_Reverse, self).__init__() | |||
assert ch_in == ch_out | |||
ch_hidden = int(ch_in * ch_hidden_ratio) | |||
self.conv1 = ConvBNAct(ch_hidden, ch_out, 3, stride=1, act=act) | |||
self.conv2 = RepConv(ch_in, ch_hidden, 3, stride=1, act=act) | |||
self.shortcut = shortcut | |||
def forward(self, x): | |||
y = self.conv2(x) | |||
y = self.conv1(y) | |||
if self.shortcut: | |||
return x + y | |||
else: | |||
return y | |||
class SPP(nn.Module): | |||
def __init__( | |||
self, | |||
ch_in, | |||
ch_out, | |||
k, | |||
pool_size, | |||
act='swish', | |||
): | |||
super(SPP, self).__init__() | |||
self.pool = [] | |||
for i, size in enumerate(pool_size): | |||
pool = nn.MaxPool2d( | |||
kernel_size=size, stride=1, padding=size // 2, ceil_mode=False) | |||
self.add_module('pool{}'.format(i), pool) | |||
self.pool.append(pool) | |||
self.conv = ConvBNAct(ch_in, ch_out, k, act=act) | |||
def forward(self, x): | |||
outs = [x] | |||
for pool in self.pool: | |||
outs.append(pool(x)) | |||
y = torch.cat(outs, dim=1) | |||
y = self.conv(y) | |||
return y | |||
class CSPStage(nn.Module): | |||
def __init__(self, | |||
block_fn, | |||
ch_in, | |||
ch_hidden_ratio, | |||
ch_out, | |||
n, | |||
act='swish', | |||
spp=False): | |||
super(CSPStage, self).__init__() | |||
split_ratio = 2 | |||
ch_first = int(ch_out // split_ratio) | |||
ch_mid = int(ch_out - ch_first) | |||
self.conv1 = ConvBNAct(ch_in, ch_first, 1, act=act) | |||
self.conv2 = ConvBNAct(ch_in, ch_mid, 1, act=act) | |||
self.convs = nn.Sequential() | |||
next_ch_in = ch_mid | |||
for i in range(n): | |||
if block_fn == 'BasicBlock_3x3_Reverse': | |||
self.convs.add_module( | |||
str(i), | |||
BasicBlock_3x3_Reverse( | |||
next_ch_in, | |||
ch_hidden_ratio, | |||
ch_mid, | |||
act=act, | |||
shortcut=True)) | |||
else: | |||
raise NotImplementedError | |||
if i == (n - 1) // 2 and spp: | |||
self.convs.add_module( | |||
'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act)) | |||
next_ch_in = ch_mid | |||
self.conv3 = ConvBNAct(ch_mid * n + ch_first, ch_out, 1, act=act) | |||
def forward(self, x): | |||
y1 = self.conv1(x) | |||
y2 = self.conv2(x) | |||
mid_out = [y1] | |||
for conv in self.convs: | |||
y2 = conv(y2) | |||
mid_out.append(y2) | |||
y = torch.cat(mid_out, dim=1) | |||
y = self.conv3(y) | |||
return y | |||
def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1): | |||
'''Basic cell for rep-style block, including conv and bn''' | |||
result = nn.Sequential() | |||
result.add_module( | |||
'conv', | |||
nn.Conv2d( | |||
in_channels=in_channels, | |||
out_channels=out_channels, | |||
kernel_size=kernel_size, | |||
stride=stride, | |||
padding=padding, | |||
groups=groups, | |||
bias=False)) | |||
result.add_module('bn', nn.BatchNorm2d(num_features=out_channels)) | |||
return result | |||
class RepConv(nn.Module): | |||
'''RepConv is a basic rep-style block with separate training-time and deploy-time structures. | |||
Code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py | |||
''' | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
kernel_size=3, | |||
stride=1, | |||
padding=1, | |||
dilation=1, | |||
groups=1, | |||
padding_mode='zeros', | |||
deploy=False, | |||
act='relu', | |||
norm=None): | |||
super(RepConv, self).__init__() | |||
self.deploy = deploy | |||
self.groups = groups | |||
self.in_channels = in_channels | |||
self.out_channels = out_channels | |||
assert kernel_size == 3 | |||
assert padding == 1 | |||
padding_11 = padding - kernel_size // 2 | |||
if isinstance(act, str): | |||
self.nonlinearity = get_activation(act) | |||
else: | |||
self.nonlinearity = act | |||
if deploy: | |||
self.rbr_reparam = nn.Conv2d( | |||
in_channels=in_channels, | |||
out_channels=out_channels, | |||
kernel_size=kernel_size, | |||
stride=stride, | |||
padding=padding, | |||
dilation=dilation, | |||
groups=groups, | |||
bias=True, | |||
padding_mode=padding_mode) | |||
else: | |||
self.rbr_identity = None | |||
self.rbr_dense = conv_bn( | |||
in_channels=in_channels, | |||
out_channels=out_channels, | |||
kernel_size=kernel_size, | |||
stride=stride, | |||
padding=padding, | |||
groups=groups) | |||
self.rbr_1x1 = conv_bn( | |||
in_channels=in_channels, | |||
out_channels=out_channels, | |||
kernel_size=1, | |||
stride=stride, | |||
padding=padding_11, | |||
groups=groups) | |||
def forward(self, inputs): | |||
'''Forward process''' | |||
if hasattr(self, 'rbr_reparam'): | |||
return self.nonlinearity(self.rbr_reparam(inputs)) | |||
if self.rbr_identity is None: | |||
id_out = 0 | |||
else: | |||
id_out = self.rbr_identity(inputs) | |||
return self.nonlinearity( | |||
self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out) | |||
def get_equivalent_kernel_bias(self): | |||
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) | |||
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) | |||
kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) | |||
return kernel3x3 + self._pad_1x1_to_3x3_tensor( | |||
kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid | |||
def _pad_1x1_to_3x3_tensor(self, kernel1x1): | |||
if kernel1x1 is None: | |||
return 0 | |||
else: | |||
return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) | |||
def _fuse_bn_tensor(self, branch): | |||
if branch is None: | |||
return 0, 0 | |||
if isinstance(branch, nn.Sequential): | |||
kernel = branch.conv.weight | |||
running_mean = branch.bn.running_mean | |||
running_var = branch.bn.running_var | |||
gamma = branch.bn.weight | |||
beta = branch.bn.bias | |||
eps = branch.bn.eps | |||
else: | |||
assert isinstance(branch, nn.BatchNorm2d) | |||
if not hasattr(self, 'id_tensor'): | |||
input_dim = self.in_channels // self.groups | |||
kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), | |||
dtype=np.float32) | |||
for i in range(self.in_channels): | |||
kernel_value[i, i % input_dim, 1, 1] = 1 | |||
self.id_tensor = torch.from_numpy(kernel_value).to( | |||
branch.weight.device) | |||
kernel = self.id_tensor | |||
running_mean = branch.running_mean | |||
running_var = branch.running_var | |||
gamma = branch.weight | |||
beta = branch.bias | |||
eps = branch.eps | |||
std = (running_var + eps).sqrt() | |||
t = (gamma / std).reshape(-1, 1, 1, 1) | |||
return kernel * t, beta - running_mean * gamma / std | |||
def switch_to_deploy(self): | |||
if hasattr(self, 'rbr_reparam'): | |||
return | |||
kernel, bias = self.get_equivalent_kernel_bias() | |||
self.rbr_reparam = nn.Conv2d( | |||
in_channels=self.rbr_dense.conv.in_channels, | |||
out_channels=self.rbr_dense.conv.out_channels, | |||
kernel_size=self.rbr_dense.conv.kernel_size, | |||
stride=self.rbr_dense.conv.stride, | |||
padding=self.rbr_dense.conv.padding, | |||
dilation=self.rbr_dense.conv.dilation, | |||
groups=self.rbr_dense.conv.groups, | |||
bias=True) | |||
self.rbr_reparam.weight.data = kernel | |||
self.rbr_reparam.bias.data = bias | |||
for para in self.parameters(): | |||
para.detach_() | |||
self.__delattr__('rbr_dense') | |||
self.__delattr__('rbr_1x1') | |||
if hasattr(self, 'rbr_identity'): | |||
self.__delattr__('rbr_identity') | |||
if hasattr(self, 'id_tensor'): | |||
self.__delattr__('id_tensor') | |||
self.deploy = True |
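A quick equivalence check for the structural re-parameterization above — a minimal sketch, assuming the enclosing block class is exported as `RepVGGBlock` (the class name and constructor arguments here are illustrative, not confirmed by this diff):

import torch

# Hypothetical instantiation; the actual class/constructor may differ.
block = RepVGGBlock(in_channels=32, out_channels=32, kernel_size=3,
                    stride=1, padding=1, act='relu')
block.eval()
x = torch.randn(1, 32, 56, 56)
with torch.no_grad():
    y_multi = block(x)            # 3x3 + 1x1 (+ identity) branches
    block.switch_to_deploy()      # fuse BN and branches into one conv
    y_fused = block(x)            # single re-parameterized 3x3 conv
assert torch.allclose(y_multi, y_fused, atol=1e-5)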
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import numpy as np | |||
import torch | |||
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import numpy as np | |||
import torch | |||
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import os.path as osp | |||
import pickle | |||
@@ -42,7 +42,7 @@ class SingleStageDetector(TorchModel): | |||
self.conf_thre = config.model.head.nms_conf_thre | |||
self.nms_thre = config.model.head.nms_iou_thre | |||
if self.cfg.model.backbone.name == 'TinyNAS': | |||
if 'TinyNAS' in self.cfg.model.backbone.name: | |||
self.cfg.model.backbone.structure_file = osp.join( | |||
model_dir, self.cfg.model.backbone.structure_file) | |||
self.backbone = build_backbone(self.cfg.model.backbone) | |||
@@ -1,9 +1,10 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import copy | |||
from .gfocal_v2_tiny import GFocalHead_Tiny | |||
from .zero_head import ZeroHead | |||
def build_head(cfg): | |||
@@ -12,5 +13,7 @@ def build_head(cfg): | |||
name = head_cfg.pop('name') | |||
if name == 'GFocalV2': | |||
return GFocalHead_Tiny(**head_cfg) | |||
elif name == 'ZeroHead': | |||
return ZeroHead(**head_cfg) | |||
else: | |||
raise NotImplementedError |
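For reference, a hedged sketch of how the new `ZeroHead` branch is reached (field values are illustrative; `easydict` and an installed `modelscope` are assumed):

from easydict import EasyDict

head_cfg = EasyDict(
    name='ZeroHead',        # dispatch key popped by build_head
    num_classes=80,
    in_channels=[64, 128, 256],
    stacked_convs=0,        # 0 -> feat_channels falls back to in_channels
)
head = build_head(head_cfg)  # returns a ZeroHead instance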
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import functools | |||
from functools import partial | |||
@@ -9,7 +9,8 @@ import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from ..core.base_ops import BaseConv, DWConv | |||
from modelscope.models.cv.tinynas_detection.core.base_ops import (BaseConv, | |||
DWConv) | |||
class Scale(nn.Module): | |||
@@ -0,0 +1,288 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors, and available | |||
# at https://github.com/tinyvision/damo-yolo. | |||
from functools import partial | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from modelscope.models.cv.tinynas_detection.core.ops import ConvBNAct | |||
class Scale(nn.Module): | |||
def __init__(self, scale=1.0): | |||
super(Scale, self).__init__() | |||
self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) | |||
def forward(self, x): | |||
return x * self.scale | |||
def multi_apply(func, *args, **kwargs): | |||
pfunc = partial(func, **kwargs) if kwargs else func | |||
map_results = map(pfunc, *args) | |||
return tuple(map(list, zip(*map_results))) | |||
def distance2bbox(points, distance, max_shape=None): | |||
"""Decode distance prediction to bounding box. | |||
""" | |||
x1 = points[..., 0] - distance[..., 0] | |||
y1 = points[..., 1] - distance[..., 1] | |||
x2 = points[..., 0] + distance[..., 2] | |||
y2 = points[..., 1] + distance[..., 3] | |||
if max_shape is not None: | |||
x1 = x1.clamp(min=0, max=max_shape[1]) | |||
y1 = y1.clamp(min=0, max=max_shape[0]) | |||
x2 = x2.clamp(min=0, max=max_shape[1]) | |||
y2 = y2.clamp(min=0, max=max_shape[0]) | |||
return torch.stack([x1, y1, x2, y2], -1) | |||
def bbox2distance(points, bbox, max_dis=None, eps=0.1): | |||
"""Decode bounding box based on distances. | |||
""" | |||
left = points[:, 0] - bbox[:, 0] | |||
top = points[:, 1] - bbox[:, 1] | |||
right = bbox[:, 2] - points[:, 0] | |||
bottom = bbox[:, 3] - points[:, 1] | |||
if max_dis is not None: | |||
left = left.clamp(min=0, max=max_dis - eps) | |||
top = top.clamp(min=0, max=max_dis - eps) | |||
right = right.clamp(min=0, max=max_dis - eps) | |||
bottom = bottom.clamp(min=0, max=max_dis - eps) | |||
return torch.stack([left, top, right, bottom], -1) | |||
class Integral(nn.Module): | |||
"""A fixed layer for calculating integral result from distribution. | |||
""" | |||
def __init__(self, reg_max=16): | |||
super(Integral, self).__init__() | |||
self.reg_max = reg_max | |||
self.register_buffer('project', | |||
torch.linspace(0, self.reg_max, self.reg_max + 1)) | |||
def forward(self, x): | |||
"""Forward feature from the regression head to get integral result of | |||
bounding box location. | |||
""" | |||
b, hw, _, _ = x.size() | |||
x = x.reshape(b * hw * 4, self.reg_max + 1) | |||
y = self.project.type_as(x).unsqueeze(1) | |||
x = torch.matmul(x, y).reshape(b, hw, 4) | |||
return x | |||
class ZeroHead(nn.Module): | |||
"""Ref to Generalized Focal Loss V2: Learning Reliable Localization Quality | |||
Estimation for Dense Object Detection. | |||
""" | |||
def __init__( | |||
self, | |||
num_classes, | |||
in_channels, | |||
stacked_convs=4, | |||
feat_channels=256, | |||
reg_max=12, | |||
strides=[8, 16, 32], | |||
norm='gn', | |||
act='relu', | |||
nms_conf_thre=0.05, | |||
nms_iou_thre=0.7, | |||
nms=True, | |||
**kwargs): | |||
self.in_channels = in_channels | |||
self.num_classes = num_classes | |||
self.stacked_convs = stacked_convs | |||
self.act = act | |||
self.strides = strides | |||
if stacked_convs == 0: | |||
feat_channels = in_channels | |||
if isinstance(feat_channels, list): | |||
self.feat_channels = feat_channels | |||
else: | |||
self.feat_channels = [feat_channels] * len(self.strides) | |||
# add 1 to keep consistency with former models | |||
self.cls_out_channels = num_classes + 1 | |||
self.reg_max = reg_max | |||
self.nms = nms | |||
self.nms_conf_thre = nms_conf_thre | |||
self.nms_iou_thre = nms_iou_thre | |||
self.feat_size = [torch.zeros(4) for _ in strides] | |||
super(ZeroHead, self).__init__() | |||
self.integral = Integral(self.reg_max) | |||
self._init_layers() | |||
def _build_not_shared_convs(self, in_channel, feat_channels): | |||
cls_convs = nn.ModuleList() | |||
reg_convs = nn.ModuleList() | |||
for i in range(self.stacked_convs): | |||
chn = feat_channels if i > 0 else in_channel | |||
kernel_size = 3 if i > 0 else 1 | |||
cls_convs.append( | |||
ConvBNAct( | |||
chn, | |||
feat_channels, | |||
kernel_size, | |||
stride=1, | |||
groups=1, | |||
norm='bn', | |||
act=self.act)) | |||
reg_convs.append( | |||
ConvBNAct( | |||
chn, | |||
feat_channels, | |||
kernel_size, | |||
stride=1, | |||
groups=1, | |||
norm='bn', | |||
act=self.act)) | |||
return cls_convs, reg_convs | |||
def _init_layers(self): | |||
"""Initialize layers of the head.""" | |||
self.cls_convs = nn.ModuleList() | |||
self.reg_convs = nn.ModuleList() | |||
for i in range(len(self.strides)): | |||
cls_convs, reg_convs = self._build_not_shared_convs( | |||
self.in_channels[i], self.feat_channels[i]) | |||
self.cls_convs.append(cls_convs) | |||
self.reg_convs.append(reg_convs) | |||
self.gfl_cls = nn.ModuleList([ | |||
nn.Conv2d( | |||
self.feat_channels[i], self.cls_out_channels, 3, padding=1) | |||
for i in range(len(self.strides)) | |||
]) | |||
self.gfl_reg = nn.ModuleList([ | |||
nn.Conv2d( | |||
self.feat_channels[i], 4 * (self.reg_max + 1), 3, padding=1) | |||
for i in range(len(self.strides)) | |||
]) | |||
self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) | |||
def forward(self, xin, labels=None, imgs=None, aux_targets=None): | |||
if self.training: | |||
raise NotImplementedError | |||
else: | |||
return self.forward_eval(xin=xin, labels=labels, imgs=imgs) | |||
def forward_eval(self, xin, labels=None, imgs=None): | |||
# prepare priors for label assignment and bbox decode | |||
if self.feat_size[0] != xin[0].shape: | |||
mlvl_priors_list = [ | |||
self.get_single_level_center_priors( | |||
xin[i].shape[0], | |||
xin[i].shape[-2:], | |||
stride, | |||
dtype=torch.float32, | |||
device=xin[0].device) | |||
for i, stride in enumerate(self.strides) | |||
] | |||
self.mlvl_priors = torch.cat(mlvl_priors_list, dim=1) | |||
self.feat_size[0] = xin[0].shape | |||
# forward for bboxes and classification prediction | |||
cls_scores, bbox_preds = multi_apply( | |||
self.forward_single, | |||
xin, | |||
self.cls_convs, | |||
self.reg_convs, | |||
self.gfl_cls, | |||
self.gfl_reg, | |||
self.scales, | |||
) | |||
cls_scores = torch.cat(cls_scores, dim=1)[:, :, :self.num_classes] | |||
bbox_preds = torch.cat(bbox_preds, dim=1) | |||
# batch bbox decode | |||
bbox_preds = self.integral(bbox_preds) * self.mlvl_priors[..., 2, None] | |||
bbox_preds = distance2bbox(self.mlvl_priors[..., :2], bbox_preds) | |||
res = torch.cat([bbox_preds, cls_scores[..., 0:self.num_classes]], | |||
dim=-1) | |||
return res | |||
def forward_single(self, x, cls_convs, reg_convs, gfl_cls, gfl_reg, scale): | |||
"""Forward feature of a single scale level. | |||
""" | |||
cls_feat = x | |||
reg_feat = x | |||
for cls_conv, reg_conv in zip(cls_convs, reg_convs): | |||
cls_feat = cls_conv(cls_feat) | |||
reg_feat = reg_conv(reg_feat) | |||
bbox_pred = scale(gfl_reg(reg_feat)).float() | |||
N, C, H, W = bbox_pred.size() | |||
if self.training: | |||
bbox_before_softmax = bbox_pred.reshape(N, 4, self.reg_max + 1, H, | |||
W) | |||
bbox_before_softmax = bbox_before_softmax.flatten( | |||
start_dim=3).permute(0, 3, 1, 2) | |||
bbox_pred = F.softmax( | |||
bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2) | |||
cls_score = gfl_cls(cls_feat).sigmoid() | |||
cls_score = cls_score.flatten(start_dim=2).permute( | |||
0, 2, 1) # N, h*w, self.num_classes+1 | |||
bbox_pred = bbox_pred.flatten(start_dim=3).permute( | |||
0, 3, 1, 2) # N, h*w, 4, self.reg_max+1 | |||
if self.training: | |||
return cls_score, bbox_pred, bbox_before_softmax | |||
else: | |||
return cls_score, bbox_pred | |||
def get_single_level_center_priors(self, batch_size, featmap_size, stride, | |||
dtype, device): | |||
h, w = featmap_size | |||
x_range = (torch.arange(0, int(w), dtype=dtype, | |||
device=device)) * stride | |||
y_range = (torch.arange(0, int(h), dtype=dtype, | |||
device=device)) * stride | |||
x = x_range.repeat(h, 1) | |||
y = y_range.unsqueeze(-1).repeat(1, w) | |||
y = y.flatten() | |||
x = x.flatten() | |||
strides = x.new_full((x.shape[0], ), stride) | |||
priors = torch.stack([x, y, strides, strides], dim=-1) | |||
return priors.unsqueeze(0).repeat(batch_size, 1, 1) | |||
def sample(self, assign_result, gt_bboxes): | |||
pos_inds = torch.nonzero( | |||
assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() | |||
neg_inds = torch.nonzero( | |||
assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() | |||
pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 | |||
if gt_bboxes.numel() == 0: | |||
# hack for index error case | |||
assert pos_assigned_gt_inds.numel() == 0 | |||
pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4) | |||
else: | |||
if len(gt_bboxes.shape) < 2: | |||
gt_bboxes = gt_bboxes.view(-1, 4) | |||
pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] | |||
return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds |
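A small self-contained check of the decode helpers defined above (only `torch` is assumed): distances round-trip through `distance2bbox` and `bbox2distance`.

import torch

points = torch.tensor([[16.0, 16.0]])          # prior center (x, y)
dist = torch.tensor([[4.0, 4.0, 8.0, 8.0]])    # left, top, right, bottom
box = distance2bbox(points, dist)              # -> [[12., 12., 24., 24.]]
assert torch.allclose(bbox2distance(points, box), dist)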
@@ -1,10 +1,10 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import copy | |||
from .giraffe_fpn import GiraffeNeck | |||
from .giraffe_fpn_v2 import GiraffeNeckV2 | |||
from .giraffe_fpn_btn import GiraffeNeckV2 | |||
def build_neck(cfg): | |||
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import collections | |||
import itertools | |||
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import logging | |||
import math | |||
@@ -15,7 +15,8 @@ from timm import create_model | |||
from timm.models.layers import (Swish, create_conv2d, create_pool2d, | |||
get_act_layer) | |||
from ..core.base_ops import CSPLayer, ShuffleBlock, ShuffleCSPLayer | |||
from modelscope.models.cv.tinynas_detection.core.base_ops import ( | |||
CSPLayer, ShuffleBlock, ShuffleCSPLayer) | |||
from .giraffe_config import get_graph_config | |||
_ACT_LAYER = Swish | |||
@@ -0,0 +1,132 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
import torch | |||
import torch.nn as nn | |||
from modelscope.models.cv.tinynas_detection.core.ops import ConvBNAct, CSPStage | |||
class GiraffeNeckV2(nn.Module): | |||
def __init__( | |||
self, | |||
depth=1.0, | |||
hidden_ratio=1.0, | |||
in_features=[2, 3, 4], | |||
in_channels=[256, 512, 1024], | |||
out_channels=[256, 512, 1024], | |||
act='silu', | |||
spp=False, | |||
block_name='BasicBlock', | |||
): | |||
super().__init__() | |||
self.in_features = in_features | |||
self.in_channels = in_channels | |||
self.out_channels = out_channels | |||
Conv = ConvBNAct | |||
self.upsample = nn.Upsample(scale_factor=2, mode='nearest') | |||
# node x3: input x0, x1 | |||
self.bu_conv13 = Conv(in_channels[1], in_channels[1], 3, 2, act=act) | |||
self.merge_3 = CSPStage( | |||
block_name, | |||
in_channels[1] + in_channels[2], | |||
hidden_ratio, | |||
in_channels[2], | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
# node x4: input x1, x2, x3 | |||
self.bu_conv24 = Conv(in_channels[0], in_channels[0], 3, 2, act=act) | |||
self.merge_4 = CSPStage( | |||
block_name, | |||
in_channels[0] + in_channels[1] + in_channels[2], | |||
hidden_ratio, | |||
in_channels[1], | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
# node x5: input x2, x4 | |||
self.merge_5 = CSPStage( | |||
block_name, | |||
in_channels[1] + in_channels[0], | |||
hidden_ratio, | |||
out_channels[0], | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
# node x7: input x4, x5 | |||
self.bu_conv57 = Conv(out_channels[0], out_channels[0], 3, 2, act=act) | |||
self.merge_7 = CSPStage( | |||
block_name, | |||
out_channels[0] + in_channels[1], | |||
hidden_ratio, | |||
out_channels[1], | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
# node x6: input x3, x4, x7 | |||
self.bu_conv46 = Conv(in_channels[1], in_channels[1], 3, 2, act=act) | |||
self.bu_conv76 = Conv(out_channels[1], out_channels[1], 3, 2, act=act) | |||
self.merge_6 = CSPStage( | |||
block_name, | |||
in_channels[1] + out_channels[1] + in_channels[2], | |||
hidden_ratio, | |||
out_channels[2], | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
def init_weights(self): | |||
pass | |||
def forward(self, out_features): | |||
""" | |||
Args: | |||
out_features: multi-level backbone features. | |||
Returns: | |||
Tuple[Tensor]: FPN features. | |||
""" | |||
# backbone | |||
[x2, x1, x0] = out_features | |||
# node x3 | |||
x13 = self.bu_conv13(x1) | |||
x3 = torch.cat([x0, x13], 1) | |||
x3 = self.merge_3(x3) | |||
# node x4 | |||
x34 = self.upsample(x3) | |||
x24 = self.bu_conv24(x2) | |||
x4 = torch.cat([x1, x24, x34], 1) | |||
x4 = self.merge_4(x4) | |||
# node x5 | |||
x45 = self.upsample(x4) | |||
x5 = torch.cat([x2, x45], 1) | |||
x5 = self.merge_5(x5) | |||
# node x8 | |||
# x8 = x5 | |||
# node x7 | |||
x57 = self.bu_conv57(x5) | |||
x7 = torch.cat([x4, x57], 1) | |||
x7 = self.merge_7(x7) | |||
# node x6 | |||
x46 = self.bu_conv46(x4) | |||
x76 = self.bu_conv76(x7) | |||
x6 = torch.cat([x3, x46, x76], 1) | |||
x6 = self.merge_6(x6) | |||
outputs = (x5, x7, x6) | |||
return outputs |
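A hedged shape check for `GiraffeNeckV2` above (an installed `modelscope` providing `ConvBNAct`/`CSPStage` is assumed; sizes are illustrative). The three inputs mimic backbone outputs at strides 8/16/32:

import torch

neck = GiraffeNeckV2(in_channels=[128, 256, 512],
                     out_channels=[128, 256, 512])
feats = [torch.randn(1, 128, 80, 80),   # x2, stride 8
         torch.randn(1, 256, 40, 40),   # x1, stride 16
         torch.randn(1, 512, 20, 20)]   # x0, stride 32
x5, x7, x6 = neck(feats)
# x5: (1, 128, 80, 80), x7: (1, 256, 40, 40), x6: (1, 512, 20, 20)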
@@ -1,200 +0,0 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
import torch | |||
import torch.nn as nn | |||
from ..core.base_ops import BaseConv, CSPLayer, DWConv | |||
from ..core.neck_ops import CSPStage | |||
class GiraffeNeckV2(nn.Module): | |||
def __init__( | |||
self, | |||
depth=1.0, | |||
width=1.0, | |||
in_channels=[256, 512, 1024], | |||
out_channels=[256, 512, 1024], | |||
depthwise=False, | |||
act='silu', | |||
spp=True, | |||
reparam_mode=True, | |||
block_name='BasicBlock', | |||
): | |||
super().__init__() | |||
self.in_channels = in_channels | |||
Conv = DWConv if depthwise else BaseConv | |||
reparam_mode = reparam_mode | |||
self.upsample = nn.Upsample(scale_factor=2, mode='nearest') | |||
# node x3: input x0, x1 | |||
self.bu_conv13 = Conv( | |||
int(in_channels[1] * width), | |||
int(in_channels[1] * width), | |||
3, | |||
2, | |||
act=act) | |||
if reparam_mode: | |||
self.merge_3 = CSPStage( | |||
block_name, | |||
int((in_channels[1] + in_channels[2]) * width), | |||
int(in_channels[2] * width), | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
else: | |||
self.merge_3 = CSPLayer( | |||
int((in_channels[1] + in_channels[2]) * width), | |||
int(in_channels[2] * width), | |||
round(3 * depth), | |||
False, | |||
depthwise=depthwise, | |||
act=act) | |||
# node x4: input x1, x2, x3 | |||
self.bu_conv24 = Conv( | |||
int(in_channels[0] * width), | |||
int(in_channels[0] * width), | |||
3, | |||
2, | |||
act=act) | |||
if reparam_mode: | |||
self.merge_4 = CSPStage( | |||
block_name, | |||
int((in_channels[0] + in_channels[1] + in_channels[2]) | |||
* width), | |||
int(in_channels[1] * width), | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
else: | |||
self.merge_4 = CSPLayer( | |||
int((in_channels[0] + in_channels[1] + in_channels[2]) | |||
* width), | |||
int(in_channels[1] * width), | |||
round(3 * depth), | |||
False, | |||
depthwise=depthwise, | |||
act=act) | |||
# node x5: input x2, x4 | |||
if reparam_mode: | |||
self.merge_5 = CSPStage( | |||
block_name, | |||
int((in_channels[1] + in_channels[0]) * width), | |||
int(out_channels[0] * width), | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
else: | |||
self.merge_5 = CSPLayer( | |||
int((in_channels[1] + in_channels[0]) * width), | |||
int(out_channels[0] * width), | |||
round(3 * depth), | |||
False, | |||
depthwise=depthwise, | |||
act=act) | |||
# node x7: input x4, x5 | |||
self.bu_conv57 = Conv( | |||
int(out_channels[0] * width), | |||
int(out_channels[0] * width), | |||
3, | |||
2, | |||
act=act) | |||
if reparam_mode: | |||
self.merge_7 = CSPStage( | |||
block_name, | |||
int((out_channels[0] + in_channels[1]) * width), | |||
int(out_channels[1] * width), | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
else: | |||
self.merge_7 = CSPLayer( | |||
int((out_channels[0] + in_channels[1]) * width), | |||
int(out_channels[1] * width), | |||
round(3 * depth), | |||
False, | |||
depthwise=depthwise, | |||
act=act) | |||
# node x6: input x3, x4, x7 | |||
self.bu_conv46 = Conv( | |||
int(in_channels[1] * width), | |||
int(in_channels[1] * width), | |||
3, | |||
2, | |||
act=act) | |||
self.bu_conv76 = Conv( | |||
int(out_channels[1] * width), | |||
int(out_channels[1] * width), | |||
3, | |||
2, | |||
act=act) | |||
if reparam_mode: | |||
self.merge_6 = CSPStage( | |||
block_name, | |||
int((in_channels[1] + out_channels[1] + in_channels[2]) | |||
* width), | |||
int(out_channels[2] * width), | |||
round(3 * depth), | |||
act=act, | |||
spp=spp) | |||
else: | |||
self.merge_6 = CSPLayer( | |||
int((in_channels[1] + out_channels[1] + in_channels[2]) | |||
* width), | |||
int(out_channels[2] * width), | |||
round(3 * depth), | |||
False, | |||
depthwise=depthwise, | |||
act=act) | |||
def init_weights(self): | |||
pass | |||
def forward(self, out_features): | |||
""" | |||
Args: | |||
inputs: input images. | |||
Returns: | |||
Tuple[Tensor]: FPN feature. | |||
""" | |||
# backbone | |||
[x2, x1, x0] = out_features | |||
# node x3 | |||
x13 = self.bu_conv13(x1) | |||
x3 = torch.cat([x0, x13], 1) | |||
x3 = self.merge_3(x3) | |||
# node x4 | |||
x34 = self.upsample(x3) | |||
x24 = self.bu_conv24(x2) | |||
x4 = torch.cat([x1, x24, x34], 1) | |||
x4 = self.merge_4(x4) | |||
# node x5 | |||
x45 = self.upsample(x4) | |||
x5 = torch.cat([x2, x45], 1) | |||
x5 = self.merge_5(x5) | |||
# node x7 | |||
x57 = self.bu_conv57(x5) | |||
x7 = torch.cat([x4, x57], 1) | |||
x7 = self.merge_7(x7) | |||
# node x6 | |||
x46 = self.bu_conv46(x4) | |||
x76 = self.bu_conv76(x7) | |||
x6 = torch.cat([x3, x46, x76], 1) | |||
x6 = self.merge_6(x6) | |||
outputs = (x5, x7, x6) | |||
return outputs |
@@ -11,5 +11,5 @@ from .detector import SingleStageDetector | |||
class DamoYolo(SingleStageDetector): | |||
def __init__(self, model_dir, *args, **kwargs): | |||
self.config_name = 'damoyolo_s.py' | |||
self.config_name = 'damoyolo.py' | |||
super(DamoYolo, self).__init__(model_dir, *args, **kwargs) |
@@ -1,5 +1,5 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. | |||
from modelscope.metainfo import Models | |||
from modelscope.models.builder import MODELS | |||
@@ -1,30 +1,33 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. | |||
# The DAMO-YOLO implementation is also open-sourced by the authors, and available | |||
# at https://github.com/tinyvision/damo-yolo. | |||
import importlib | |||
import os | |||
import shutil | |||
import sys | |||
import tempfile | |||
from os.path import dirname, join | |||
from easydict import EasyDict | |||
def get_config_by_file(config_file): | |||
try: | |||
sys.path.append(os.path.dirname(config_file)) | |||
current_config = importlib.import_module( | |||
os.path.basename(config_file).split('.')[0]) | |||
exp = current_config.Config() | |||
except Exception: | |||
raise ImportError( | |||
"{} doesn't contain a class named 'Config'".format(config_file)) | |||
return exp | |||
def parse_config(filename): | |||
filename = str(filename) | |||
if filename.endswith('.py'): | |||
with tempfile.TemporaryDirectory() as temp_config_dir: | |||
shutil.copyfile(filename, join(temp_config_dir, '_tempconfig.py')) | |||
sys.path.insert(0, temp_config_dir) | |||
mod = importlib.import_module('_tempconfig') | |||
sys.path.pop(0) | |||
cfg_dict = EasyDict({ | |||
name: value | |||
for name, value in mod.__dict__.items() | |||
if not name.startswith('__') | |||
}) | |||
# delete imported module | |||
del sys.modules['_tempconfig'] | |||
else: | |||
raise IOError('Only .py config files are supported now!') | |||
def parse_config(config_file): | |||
""" | |||
Get a config object from a file. | |||
Args: | |||
config_file (str): file path of config. | |||
""" | |||
assert (config_file is not None), 'please provide a config file' | |||
if config_file is not None: | |||
return get_config_by_file(config_file) | |||
return cfg_dict |
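A minimal usage sketch for the new `parse_config` (assuming `easydict` is installed; the config field name is illustrative): top-level names in the `.py` file become keys of the returned `EasyDict`.

import os
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.py', delete=False) as f:
    f.write("miscs = dict(eval_interval_epochs=10)\n")
cfg = parse_config(f.name)
print(cfg.miscs.eval_interval_epochs)  # -> 10
os.remove(f.name)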
@@ -0,0 +1,21 @@ | |||
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
from typing import TYPE_CHECKING | |||
from modelscope.utils.import_utils import LazyImportModule | |||
if TYPE_CHECKING: | |||
from .model import VideoMattingNetwork | |||
from .model import preprocess | |||
else: | |||
_import_structure = {'model': ['VideoMattingNetwork', 'preprocess']} | |||
import sys | |||
sys.modules[__name__] = LazyImportModule( | |||
__name__, | |||
globals()['__file__'], | |||
_import_structure, | |||
module_spec=__spec__, | |||
extra_objects={}, | |||
) |
@@ -0,0 +1,38 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os.path as osp | |||
from typing import Optional | |||
import numpy as np | |||
import torch | |||
import torchvision | |||
from torch.nn import functional as F | |||
from modelscope.metainfo import Models | |||
from modelscope.models.base import Tensor, TorchModel | |||
from modelscope.models.builder import MODELS | |||
from modelscope.models.cv.video_human_matting.models import MattingNetwork | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from modelscope.utils.logger import get_logger | |||
@MODELS.register_module( | |||
Tasks.video_human_matting, module_name=Models.video_human_matting) | |||
class VideoMattingNetwork(TorchModel): | |||
def __init__(self, model_dir: str, *args, **kwargs): | |||
super().__init__(model_dir, *args, **kwargs) | |||
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) | |||
params = torch.load(model_path, map_location='cpu') | |||
self.model = MattingNetwork() | |||
if 'model_state_dict' in params.keys(): | |||
params = params['model_state_dict'] | |||
self.model.load_state_dict(params, strict=True) | |||
self.model.eval() | |||
def preprocess(image): | |||
frame_np = np.float32(image) / 255.0 | |||
frame_np = frame_np.transpose(2, 0, 1) | |||
frame_tensor = torch.from_numpy(frame_np) | |||
image_tensor = frame_tensor[None, :, :, :] | |||
return image_tensor |
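A quick sanity check of `preprocess` (a dummy HxWx3 uint8 RGB frame is assumed):

import numpy as np

frame = (np.random.rand(720, 1280, 3) * 255).astype(np.uint8)
tensor = preprocess(frame)
print(tensor.shape, tensor.dtype)  # torch.Size([1, 3, 720, 1280]) torch.float32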
@@ -0,0 +1 @@ | |||
from .matting import MattingNetwork |
@@ -0,0 +1,330 @@ | |||
""" | |||
Part of the implementation is borrowed from the RVM paper, | |||
publicly available at <https://arxiv.org/abs/2108.11515/> | |||
""" | |||
from typing import Optional | |||
import torch | |||
from torch import Tensor, nn | |||
class hswish(nn.Module): | |||
def forward(self, x): | |||
return torch.nn.functional.hardswish(x, inplace=True)  # same op without constructing a module per call | |||
class scSEblock(nn.Module): | |||
def __init__(self, out): | |||
super().__init__() | |||
self.conv1 = nn.Sequential( | |||
nn.Conv2d(out, int(out / 2), 3, 1, 1), | |||
nn.GroupNorm(out // 8, int(out / 2)), hswish()) | |||
self.conv2 = nn.Sequential( | |||
nn.Conv2d(int(out / 2), out, 1, 1, 0), | |||
nn.GroupNorm(out // 4, out), | |||
) | |||
self.avgpool = nn.AdaptiveAvgPool2d(1) | |||
def forward_single(self, x): | |||
b, c, _, _ = x.size() | |||
x2 = self.avgpool(x).view(b, c, 1, 1) | |||
x2 = self.conv1(x2) | |||
x2 = self.conv2(x2) | |||
x2 = torch.sigmoid(x2) | |||
out = x2 * x | |||
return out | |||
def forward_time(self, x): | |||
B, T, _, H, W = x.shape | |||
x = x.flatten(0, 1) | |||
out = self.forward_single(x) | |||
out = out.unflatten(0, (B, T)) | |||
return out | |||
def forward(self, x): | |||
if x.ndim == 5: | |||
return self.forward_time(x) | |||
else: | |||
return self.forward_single(x) | |||
class RecurrentDecoder(nn.Module): | |||
def __init__(self, feature_channels, decoder_channels): | |||
super().__init__() | |||
self.avgpool = AvgPool() | |||
self.decode4 = BottleneckBlock(feature_channels[3]) | |||
self.decode3 = UpsamplingBlock(feature_channels[3], | |||
feature_channels[2], 3, | |||
decoder_channels[0]) | |||
self.sc3 = scSEblock(decoder_channels[0]) | |||
self.decode2 = UpsamplingBlock(decoder_channels[0], | |||
feature_channels[1], 3, | |||
decoder_channels[1]) | |||
self.sc2 = scSEblock(decoder_channels[1]) | |||
self.decode1 = UpsamplingBlock(decoder_channels[1], | |||
feature_channels[0], 3, | |||
decoder_channels[2]) | |||
self.sc1 = scSEblock(decoder_channels[2]) | |||
self.out0 = OutputBlock(decoder_channels[2], 3, decoder_channels[3]) | |||
self.crosslevel1 = crossfeature(feature_channels[3], | |||
feature_channels[1]) | |||
self.crosslevel2 = crossfeature(feature_channels[2], | |||
feature_channels[0]) | |||
def forward(self, s0: Tensor, f1: Tensor, f2: Tensor, f3: Tensor, | |||
f4: Tensor, r1: Optional[Tensor], r2: Optional[Tensor], | |||
r3: Optional[Tensor], r4: Optional[Tensor]): | |||
s2, s3, s4 = self.avgpool(s0) | |||
x4, r4 = self.decode4(f4, r4) | |||
x3, r3 = self.decode3(x4, f3, s4, r3) | |||
x3 = self.sc3(x3) | |||
f2 = self.crosslevel1(f4, f2) | |||
x2, r2 = self.decode2(x3, f2, s3, r2) | |||
x2 = self.sc2(x2) | |||
f1 = self.crosslevel2(f3, f1) | |||
x1, r1 = self.decode1(x2, f1, s2, r1) | |||
x1 = self.sc1(x1) | |||
out = self.out0(x1, s0) | |||
return out, r1, r2, r3, r4 | |||
class AvgPool(nn.Module): | |||
def __init__(self): | |||
super().__init__() | |||
self.avgpool = nn.AvgPool2d( | |||
2, 2, count_include_pad=False, ceil_mode=True) | |||
def forward_single_frame(self, s0): | |||
s1 = self.avgpool(s0) | |||
s2 = self.avgpool(s1) | |||
s3 = self.avgpool(s2) | |||
return s1, s2, s3 | |||
def forward_time_series(self, s0): | |||
B, T = s0.shape[:2] | |||
s0 = s0.flatten(0, 1) | |||
s1, s2, s3 = self.forward_single_frame(s0) | |||
s1 = s1.unflatten(0, (B, T)) | |||
s2 = s2.unflatten(0, (B, T)) | |||
s3 = s3.unflatten(0, (B, T)) | |||
return s1, s2, s3 | |||
def forward(self, s0): | |||
if s0.ndim == 5: | |||
return self.forward_time_series(s0) | |||
else: | |||
return self.forward_single_frame(s0) | |||
class crossfeature(nn.Module): | |||
def __init__(self, in_channels, out_channels): | |||
super().__init__() | |||
self.avg = nn.AdaptiveAvgPool2d(1) | |||
self.conv = nn.Conv2d(in_channels, out_channels, 1, 1, 0, bias=False) | |||
def forward_single_frame(self, x1, x2): | |||
b, c, _, _ = x1.size() | |||
x1 = self.avg(x1).view(b, c, 1, 1) | |||
x1 = self.conv(x1) | |||
x1 = torch.sigmoid(x1) | |||
x2 = x1 * x2 | |||
return x2 | |||
def forward_time_series(self, x1, x2): | |||
b, t = x1.shape[:2] | |||
x1 = x1.flatten(0, 1) | |||
x2 = x2.flatten(0, 1) | |||
x2 = self.forward_single_frame(x1, x2) | |||
return x2.unflatten(0, (b, t)) | |||
def forward(self, x1, x2): | |||
if x1.ndim == 5: | |||
return self.forward_time_series(x1, x2) | |||
else: | |||
return self.forward_single_frame(x1, x2) | |||
class BottleneckBlock(nn.Module): | |||
def __init__(self, channels): | |||
super().__init__() | |||
self.channels = channels | |||
self.gru = GRU(channels // 2) | |||
def forward(self, x, r): | |||
a, b = x.split(self.channels // 2, dim=-3) | |||
b, r = self.gru(b, r) | |||
x = torch.cat([a, b], dim=-3) | |||
return x, r | |||
class UpsamplingBlock(nn.Module): | |||
def __init__(self, in_channels, skip_channels, src_channels, out_channels): | |||
super().__init__() | |||
self.out_channels = out_channels | |||
self.upsample = nn.Upsample( | |||
scale_factor=2, mode='bilinear', align_corners=False) | |||
self.shortcut = nn.Sequential( | |||
nn.Conv2d(skip_channels, in_channels, 3, 1, 1, bias=False), | |||
nn.GroupNorm(in_channels // 4, in_channels), hswish()) | |||
self.att_skip = nn.Sequential( | |||
nn.Conv2d(in_channels, in_channels, 1, 1, 0, bias=False), | |||
nn.Sigmoid()) | |||
self.conv = nn.Sequential( | |||
nn.Conv2d( | |||
in_channels + in_channels + src_channels, | |||
out_channels, | |||
3, | |||
1, | |||
1, | |||
bias=False), | |||
nn.GroupNorm(out_channels // 4, out_channels), | |||
hswish(), | |||
) | |||
self.gru = GRU(out_channels // 2) | |||
def forward_single_frame(self, x, f, s, r: Optional[Tensor]): | |||
x = self.upsample(x) | |||
x = x[:, :, :s.size(2), :s.size(3)] | |||
att = self.att_skip(x) | |||
f = self.shortcut(f) | |||
f = att * f | |||
x = torch.cat([x, f, s], dim=1) | |||
x = self.conv(x) | |||
a, b = x.split(self.out_channels // 2, dim=1) | |||
b, r = self.gru(b, r) | |||
x = torch.cat([a, b], dim=1) | |||
return x, r | |||
def forward_time_series(self, x, f, s, r: Optional[Tensor]): | |||
B, T, _, H, W = s.shape | |||
x = x.flatten(0, 1) | |||
f = f.flatten(0, 1) | |||
s = s.flatten(0, 1) | |||
x = self.upsample(x) | |||
x = x[:, :, :H, :W] | |||
f = self.shortcut(f) | |||
att = self.att_skip(x) | |||
f = att * f | |||
x = torch.cat([x, f, s], dim=1) | |||
x = self.conv(x) | |||
x = x.unflatten(0, (B, T)) | |||
a, b = x.split(self.out_channels // 2, dim=2) | |||
b, r = self.gru(b, r) | |||
x = torch.cat([a, b], dim=2) | |||
return x, r | |||
def forward(self, x, f, s, r: Optional[Tensor]): | |||
if x.ndim == 5: | |||
return self.forward_time_series(x, f, s, r) | |||
else: | |||
return self.forward_single_frame(x, f, s, r) | |||
class OutputBlock(nn.Module): | |||
def __init__(self, in_channels, src_channels, out_channels): | |||
super().__init__() | |||
self.upsample = nn.Upsample( | |||
scale_factor=2, mode='bilinear', align_corners=False) | |||
self.conv = nn.Sequential( | |||
nn.Conv2d( | |||
in_channels + src_channels, out_channels, 3, 1, 1, bias=False), | |||
nn.GroupNorm(out_channels // 2, out_channels), | |||
hswish(), | |||
nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False), | |||
nn.GroupNorm(out_channels // 2, out_channels), | |||
hswish(), | |||
) | |||
def forward_single_frame(self, x, s): | |||
x = self.upsample(x) | |||
x = x[:, :, :s.size(2), :s.size(3)] | |||
x = torch.cat([x, s], dim=1) | |||
x = self.conv(x) | |||
return x | |||
def forward_time_series(self, x, s): | |||
B, T, _, H, W = s.shape | |||
x = x.flatten(0, 1) | |||
s = s.flatten(0, 1) | |||
x = self.upsample(x) | |||
x = x[:, :, :H, :W] | |||
x = torch.cat([x, s], dim=1) | |||
x = self.conv(x) | |||
x = x.unflatten(0, (B, T)) | |||
return x | |||
def forward(self, x, s): | |||
if x.ndim == 5: | |||
return self.forward_time_series(x, s) | |||
else: | |||
return self.forward_single_frame(x, s) | |||
class Projection(nn.Module): | |||
def __init__(self, in_channels, out_channels): | |||
super().__init__() | |||
self.conv = nn.Conv2d(in_channels, out_channels, 1) | |||
def forward_single_frame(self, x): | |||
return self.conv(x) | |||
def forward_time_series(self, x): | |||
B, T = x.shape[:2] | |||
return self.conv(x.flatten(0, 1)).unflatten(0, (B, T)) | |||
def forward(self, x): | |||
if x.ndim == 5: | |||
return self.forward_time_series(x) | |||
else: | |||
return self.forward_single_frame(x) | |||
class GRU(nn.Module): | |||
def __init__(self, channels, kernel_size=3, padding=1): | |||
super().__init__() | |||
self.channels = channels | |||
self.ih = nn.Conv2d( | |||
channels * 2, channels * 2, kernel_size, padding=padding) | |||
self.act_ih = nn.Sigmoid() | |||
self.hh = nn.Conv2d( | |||
channels * 2, channels, kernel_size, padding=padding) | |||
self.act_hh = nn.Tanh() | |||
def forward_single_frame(self, x, pre_fea): | |||
fea_ih = self.ih(torch.cat([x, pre_fea], dim=1)) | |||
r, z = self.act_ih(fea_ih).split(self.channels, dim=1) | |||
fea_hh = self.hh(torch.cat([x, r * pre_fea], dim=1)) | |||
c = self.act_hh(fea_hh) | |||
fea_gru = (1 - z) * pre_fea + z * c | |||
return fea_gru, fea_gru | |||
def forward_time_series(self, x, pre_fea): | |||
o = [] | |||
for xt in x.unbind(dim=1): | |||
ot, pre_fea = self.forward_single_frame(xt, pre_fea) | |||
o.append(ot) | |||
o = torch.stack(o, dim=1) | |||
return o, pre_fea | |||
def forward(self, x, pre_fea): | |||
if pre_fea is None: | |||
pre_fea = torch.zeros( | |||
(x.size(0), x.size(-3), x.size(-2), x.size(-1)), | |||
device=x.device, | |||
dtype=x.dtype) | |||
if x.ndim == 5: | |||
return self.forward_time_series(x, pre_fea) | |||
else: | |||
return self.forward_single_frame(x, pre_fea) |
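A hedged sketch exercising the ConvGRU above on dummy tensors: passing `pre_fea=None` initializes the recurrent state with zeros, and 5-D inputs are treated as (B, T, C, H, W) clips.

import torch

gru = GRU(channels=8)
x = torch.randn(2, 8, 16, 16)            # single frame: (B, C, H, W)
out, state = gru(x, None)                # state == out, shape (2, 8, 16, 16)

clip = torch.randn(2, 5, 8, 16, 16)      # clip: (B, T, C, H, W)
out_seq, last = gru(clip, state)         # out_seq: (2, 5, 8, 16, 16)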
@@ -0,0 +1,64 @@ | |||
""" | |||
Part of the implementation is borrowed and modified from DeepGuidedFilter, | |||
publicly available at <https://github.com/wuhuikai/DeepGuidedFilter/> | |||
""" | |||
import torch | |||
from torch import nn | |||
from torch.nn import functional as F | |||
class DeepGuidedFilterRefiner(nn.Module): | |||
def __init__(self, hid_channels=16): | |||
super().__init__() | |||
self.box_filter = nn.Conv2d( | |||
4, 4, kernel_size=3, padding=1, bias=False, groups=4) | |||
self.box_filter.weight.data[...] = 1 / 9 | |||
self.conv = nn.Sequential( | |||
nn.Conv2d( | |||
4 * 2 + hid_channels, hid_channels, kernel_size=1, bias=False), | |||
nn.BatchNorm2d(hid_channels), nn.ReLU(True), | |||
nn.Conv2d(hid_channels, hid_channels, kernel_size=1, bias=False), | |||
nn.BatchNorm2d(hid_channels), nn.ReLU(True), | |||
nn.Conv2d(hid_channels, 4, kernel_size=1, bias=True)) | |||
def forward_single_frame(self, fine_src, base_src, base_fgr, base_pha, | |||
base_hid): | |||
fine_x = torch.cat([fine_src, fine_src.mean(1, keepdim=True)], dim=1) | |||
base_x = torch.cat([base_src, base_src.mean(1, keepdim=True)], dim=1) | |||
base_y = torch.cat([base_fgr, base_pha], dim=1) | |||
mean_x = self.box_filter(base_x) | |||
mean_y = self.box_filter(base_y) | |||
cov_xy = self.box_filter(base_x * base_y) - mean_x * mean_y | |||
var_x = self.box_filter(base_x * base_x) - mean_x * mean_x | |||
A = self.conv(torch.cat([cov_xy, var_x, base_hid], dim=1)) | |||
b = mean_y - A * mean_x | |||
H, W = fine_src.shape[2:] | |||
A = F.interpolate(A, (H, W), mode='bilinear', align_corners=False) | |||
b = F.interpolate(b, (H, W), mode='bilinear', align_corners=False) | |||
out = A * fine_x + b | |||
fgr, pha = out.split([3, 1], dim=1) | |||
return fgr, pha | |||
def forward_time_series(self, fine_src, base_src, base_fgr, base_pha, | |||
base_hid): | |||
B, T = fine_src.shape[:2] | |||
fgr, pha = self.forward_single_frame( | |||
fine_src.flatten(0, 1), base_src.flatten(0, 1), | |||
base_fgr.flatten(0, 1), base_pha.flatten(0, 1), | |||
base_hid.flatten(0, 1)) | |||
fgr = fgr.unflatten(0, (B, T)) | |||
pha = pha.unflatten(0, (B, T)) | |||
return fgr, pha | |||
def forward(self, fine_src, base_src, base_fgr, base_pha, base_hid): | |||
if fine_src.ndim == 5: | |||
return self.forward_time_series(fine_src, base_src, base_fgr, | |||
base_pha, base_hid) | |||
else: | |||
return self.forward_single_frame(fine_src, base_src, base_fgr, | |||
base_pha, base_hid) |
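A hedged shape check for `DeepGuidedFilterRefiner` (dummy tensors only): the coarse foreground and alpha are refined to the fine source resolution.

import torch

refiner = DeepGuidedFilterRefiner(hid_channels=16)
fine_src = torch.randn(1, 3, 256, 256)   # full-resolution frame
base_src = torch.randn(1, 3, 64, 64)     # downsampled frame
base_fgr = torch.randn(1, 3, 64, 64)     # coarse foreground
base_pha = torch.randn(1, 1, 64, 64)     # coarse alpha
base_hid = torch.randn(1, 16, 64, 64)    # decoder hidden features
fgr, pha = refiner(fine_src, base_src, base_fgr, base_pha, base_hid)
# fgr: (1, 3, 256, 256), pha: (1, 1, 256, 256)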
@@ -0,0 +1,177 @@ | |||
""" | |||
Part of the implementation is borrowed and modified from EfficientNetV2, | |||
publicly available at <https://arxiv.org/abs/2104.00298> | |||
""" | |||
import torch | |||
import torch.nn.functional | |||
class SiLU(torch.nn.Module): | |||
""" | |||
[https://arxiv.org/pdf/1710.05941.pdf] | |||
""" | |||
def __init__(self, inplace: bool = False): | |||
super().__init__() | |||
self.silu = torch.nn.SiLU(inplace=inplace) | |||
def forward(self, x): | |||
return self.silu(x) | |||
class Conv(torch.nn.Module): | |||
def __init__(self, in_ch, out_ch, activation, k=1, s=1, g=1): | |||
super().__init__() | |||
self.conv = torch.nn.Conv2d( | |||
in_ch, out_ch, k, s, k // 2, 1, g, bias=False) | |||
self.norm = torch.nn.BatchNorm2d(out_ch, 0.001, 0.01) | |||
self.silu = activation | |||
def forward(self, x): | |||
return self.silu(self.norm(self.conv(x))) | |||
class SE(torch.nn.Module): | |||
""" | |||
[https://arxiv.org/pdf/1709.01507.pdf] | |||
""" | |||
def __init__(self, ch, r): | |||
super().__init__() | |||
self.se = torch.nn.Sequential( | |||
torch.nn.Conv2d(ch, ch // (4 * r), 1), torch.nn.SiLU(), | |||
torch.nn.Conv2d(ch // (4 * r), ch, 1), torch.nn.Sigmoid()) | |||
def forward(self, x): | |||
return x * self.se(x.mean((2, 3), keepdim=True)) | |||
class Residual(torch.nn.Module): | |||
""" | |||
[https://arxiv.org/pdf/1801.04381.pdf] | |||
""" | |||
def __init__(self, in_ch, out_ch, s, r, fused=True): | |||
super().__init__() | |||
identity = torch.nn.Identity() | |||
if fused: | |||
if r == 1: | |||
features = [Conv(in_ch, r * in_ch, torch.nn.SiLU(), 3, s)] | |||
else: | |||
features = [ | |||
Conv(in_ch, r * in_ch, torch.nn.SiLU(), 3, s), | |||
Conv(r * in_ch, out_ch, identity) | |||
] | |||
else: | |||
if r == 1: | |||
features = [ | |||
Conv(r * in_ch, r * in_ch, torch.nn.SiLU(), 3, s, | |||
r * in_ch), | |||
SE(r * in_ch, r), | |||
Conv(r * in_ch, out_ch, identity) | |||
] | |||
else: | |||
features = [ | |||
Conv(in_ch, r * in_ch, torch.nn.SiLU()), | |||
Conv(r * in_ch, r * in_ch, torch.nn.SiLU(), 3, s, | |||
r * in_ch), | |||
SE(r * in_ch, r), | |||
Conv(r * in_ch, out_ch, identity) | |||
] | |||
self.add = s == 1 and in_ch == out_ch | |||
self.res = torch.nn.Sequential(*features) | |||
def forward(self, x): | |||
return x + self.res(x) if self.add else self.res(x) | |||
class EfficientNet(torch.nn.Module): | |||
def __init__(self, pretrained: bool = False): | |||
super().__init__() | |||
gate_fn = [True, False] | |||
filters = [24, 48, 64, 128, 160, 256] | |||
feature = [Conv(3, filters[0], torch.nn.SiLU(), 3, 2)] | |||
for i in range(2): | |||
if i == 0: | |||
feature.append( | |||
Residual(filters[0], filters[0], 1, 1, gate_fn[0])) | |||
else: | |||
feature.append( | |||
Residual(filters[0], filters[0], 1, 1, gate_fn[0])) | |||
for i in range(4): | |||
if i == 0: | |||
feature.append( | |||
Residual(filters[0], filters[1], 2, 4, gate_fn[0])) | |||
else: | |||
feature.append( | |||
Residual(filters[1], filters[1], 1, 4, gate_fn[0])) | |||
for i in range(4): | |||
if i == 0: | |||
feature.append( | |||
Residual(filters[1], filters[2], 2, 4, gate_fn[0])) | |||
else: | |||
feature.append( | |||
Residual(filters[2], filters[2], 1, 4, gate_fn[0])) | |||
for i in range(6): | |||
if i == 0: | |||
feature.append( | |||
Residual(filters[2], filters[3], 2, 4, gate_fn[1])) | |||
else: | |||
feature.append( | |||
Residual(filters[3], filters[3], 1, 4, gate_fn[1])) | |||
for i in range(9): | |||
if i == 0: | |||
feature.append( | |||
Residual(filters[3], filters[4], 1, 6, gate_fn[1])) | |||
else: | |||
feature.append( | |||
Residual(filters[4], filters[4], 1, 6, gate_fn[1])) | |||
self.feature = torch.nn.Sequential(*feature) | |||
def forward_single_frame(self, x): | |||
x = self.feature[0](x) | |||
x = self.feature[1](x) | |||
x = self.feature[2](x) | |||
f1 = x # 1/2 24 | |||
for i in range(4): | |||
x = self.feature[i + 3](x) | |||
f2 = x # 1/4 48 | |||
for i in range(4): | |||
x = self.feature[i + 7](x) | |||
f3 = x # 1/8 64 | |||
for i in range(6): | |||
x = self.feature[i + 11](x) | |||
for i in range(9): | |||
x = self.feature[i + 17](x) | |||
f5 = x # 1/16 160 | |||
return [f1, f2, f3, f5] | |||
def forward_time_series(self, x): | |||
B, T = x.shape[:2] | |||
features = self.forward_single_frame(x.flatten(0, 1)) | |||
features = [f.unflatten(0, (B, T)) for f in features] | |||
return features | |||
def forward(self, x): | |||
if x.ndim == 5: | |||
return self.forward_time_series(x) | |||
else: | |||
return self.forward_single_frame(x) | |||
def export(self): | |||
for m in self.modules(): | |||
if type(m) is Conv and hasattr(m, 'silu'): | |||
if isinstance(m.silu, torch.nn.SiLU): | |||
m.silu = SiLU() | |||
if type(m) is SE: | |||
if isinstance(m.se[1], torch.nn.SiLU): | |||
m.se[1] = SiLU() | |||
return self |
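A hedged shape check for the backbone above: it returns four pyramid levels at strides 2, 4, 8 and 16, with channel counts following `filters`.

import torch

backbone = EfficientNet().eval()
x = torch.randn(1, 3, 224, 224)
f1, f2, f3, f5 = backbone(x)
# f1: (1, 24, 112, 112), f2: (1, 48, 56, 56),
# f3: (1, 64, 28, 28),   f5: (1, 160, 14, 14)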
@@ -0,0 +1,94 @@ | |||
""" | |||
Part of the implementation is borrowed and modified from DeepLab v3, | |||
publicly available at <https://arxiv.org/abs/1706.05587v3> | |||
""" | |||
import torch | |||
from torch import nn | |||
class ASP_OC_Module(nn.Module): | |||
def __init__(self, features, out_features=96, dilations=(2, 4, 8)): | |||
super(ASP_OC_Module, self).__init__() | |||
self.conv2 = nn.Sequential( | |||
nn.Conv2d( | |||
features, | |||
out_features, | |||
kernel_size=1, | |||
padding=0, | |||
dilation=1, | |||
bias=False), nn.BatchNorm2d(out_features)) | |||
self.conv3 = nn.Sequential( | |||
nn.Conv2d( | |||
features, | |||
out_features, | |||
kernel_size=3, | |||
padding=dilations[0], | |||
dilation=dilations[0], | |||
bias=False), nn.BatchNorm2d(out_features)) | |||
self.conv4 = nn.Sequential( | |||
nn.Conv2d( | |||
features, | |||
out_features, | |||
kernel_size=3, | |||
padding=dilations[1], | |||
dilation=dilations[1], | |||
bias=False), nn.BatchNorm2d(out_features)) | |||
self.conv5 = nn.Sequential( | |||
nn.Conv2d( | |||
features, | |||
out_features, | |||
kernel_size=3, | |||
padding=dilations[2], | |||
dilation=dilations[2], | |||
bias=False), nn.BatchNorm2d(out_features)) | |||
self.conv_bn_dropout = nn.Sequential( | |||
nn.Conv2d( | |||
out_features * 4, | |||
out_features * 2, | |||
kernel_size=1, | |||
padding=0, | |||
dilation=1, | |||
bias=False), nn.InstanceNorm2d(out_features * 2), | |||
nn.Dropout2d(0.05)) | |||
def _cat_each(self, feat1, feat2, feat3, feat4, feat5): | |||
assert (len(feat1) == len(feat2)) | |||
z = [] | |||
for i in range(len(feat1)): | |||
z.append( | |||
torch.cat((feat1[i], feat2[i], feat3[i], feat4[i], feat5[i]), | |||
1)) | |||
return z | |||
def forward(self, x): | |||
_, _, h, w = x.size() | |||
feat2 = self.conv2(x) | |||
feat3 = self.conv3(x) | |||
feat4 = self.conv4(x) | |||
feat5 = self.conv5(x) | |||
out = torch.cat((feat2, feat3, feat4, feat5), 1) | |||
output = self.conv_bn_dropout(out) | |||
return output | |||
class LRASPP(nn.Module): | |||
def __init__(self, in_channels, out_channels): | |||
super().__init__() | |||
self.aspp = ASP_OC_Module(in_channels, out_channels) | |||
def forward_single_frame(self, x): | |||
return self.aspp(x) | |||
def forward_time_series(self, x): | |||
B, T = x.shape[:2] | |||
x = self.forward_single_frame(x.flatten(0, 1)).unflatten(0, (B, T)) | |||
return x | |||
def forward(self, x): | |||
if x.ndim == 5: | |||
return self.forward_time_series(x) | |||
else: | |||
return self.forward_single_frame(x) |
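A hedged shape check for `LRASPP` above. Note the naming quirk: `ASP_OC_Module` concatenates four dilated branches and fuses them down to `out_features * 2` channels, so `LRASPP(160, 64)` actually emits 128 channels (matching the 128-channel feature the matting decoder expects).

import torch

aspp = LRASPP(in_channels=160, out_channels=64)
x = torch.randn(1, 160, 14, 14)
print(aspp(x).shape)  # torch.Size([1, 128, 14, 14])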
@@ -0,0 +1,67 @@ | |||
from typing import Optional | |||
import torch | |||
from torch import Tensor | |||
from torch.nn import functional as F | |||
from .decoder import Projection, RecurrentDecoder | |||
from .deep_guided_filter import DeepGuidedFilterRefiner | |||
from .effv2 import EfficientNet | |||
from .lraspp import LRASPP | |||
class MattingNetwork(torch.nn.Module): | |||
def __init__(self, pretrained_backbone: bool = False): | |||
super().__init__() | |||
self.backbone = EfficientNet(pretrained_backbone) | |||
self.aspp = LRASPP(160, 64) | |||
self.decoder = RecurrentDecoder([24, 48, 64, 128], [64, 32, 24, 16]) | |||
self.project_mat = Projection(16, 4) | |||
self.project_seg = Projection(16, 1) | |||
self.refiner = DeepGuidedFilterRefiner() | |||
def forward(self, | |||
src: Tensor, | |||
r0: Optional[Tensor] = None, | |||
r1: Optional[Tensor] = None, | |||
r2: Optional[Tensor] = None, | |||
r3: Optional[Tensor] = None, | |||
downsample_ratio: float = 1, | |||
segmentation_pass: bool = False): | |||
if downsample_ratio != 1: | |||
src_sm = self._interpolate(src, scale_factor=downsample_ratio) | |||
else: | |||
src_sm = src | |||
f1, f2, f3, f4 = self.backbone(src_sm) | |||
f4 = self.aspp(f4) | |||
hid, *rec = self.decoder(src_sm, f1, f2, f3, f4, r0, r1, r2, r3) | |||
if not segmentation_pass: | |||
fgr_residual, pha = self.project_mat(hid).split([3, 1], dim=-3) | |||
if downsample_ratio != 1: | |||
_, pha = self.refiner(src, src_sm, fgr_residual, pha, hid) | |||
pha = pha.clamp(0., 1.) | |||
return [pha, *rec] | |||
else: | |||
seg = self.project_seg(hid) | |||
return [seg, *rec] | |||
def _interpolate(self, x: Tensor, scale_factor: float): | |||
if x.ndim == 5: | |||
B, T = x.shape[:2] | |||
x = F.interpolate( | |||
x.flatten(0, 1), | |||
scale_factor=scale_factor, | |||
mode='bilinear', | |||
align_corners=False) | |||
x = x.unflatten(0, (B, T)) | |||
else: | |||
x = F.interpolate( | |||
x, | |||
scale_factor=scale_factor, | |||
mode='bilinear', | |||
align_corners=False) | |||
return x |
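A hedged end-to-end sketch (dummy frames only): stream frames through `MattingNetwork` while carrying the four recurrent states between calls, as the RVM-style interface above expects.

import torch

model = MattingNetwork().eval()
rec = [None] * 4                      # r0..r3 recurrent states
with torch.no_grad():
    for _ in range(3):                # pretend consecutive video frames
        src = torch.rand(1, 3, 288, 512)
        pha, *rec = model(src, *rec, downsample_ratio=0.5)
# pha: (1, 1, 288, 512) alpha matte clamped to [0, 1]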
@@ -509,8 +509,8 @@ def convert_weights(model: nn.Module): | |||
@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip) | |||
class CLIPForMultiModalEmbedding(TorchModel): | |||
def __init__(self, model_dir, device_id=-1): | |||
super().__init__(model_dir=model_dir, device_id=device_id) | |||
def __init__(self, model_dir, *args, **kwargs): | |||
super().__init__(model_dir=model_dir, *args, **kwargs) | |||
# Initialize the model. | |||
vision_model_config_file = '{}/vision_model_config.json'.format( | |||