
space intent and modeling (generation) are ready

master · ly119399 · 3 years ago · parent commit d0b33eade8
74 changed files with 3689 additions and 267 deletions
  1. .gitattributes (+3, -0)
  2. .gitignore (+0, -1)
  3. Makefile.docker (+67, -0)
  4. configs/examples/configuration.json (+0, -0)
  5. configs/examples/configuration.py (+0, -0)
  6. configs/examples/configuration.yaml (+0, -0)
  7. data/test/images/image1.jpg (+3, -0)
  8. data/test/images/image_matting.png (+3, -0)
  9. docker/.dockerignore (+4, -0)
  10. docker/pytorch.dockerfile (+53, -0)
  11. docker/rcfiles/pip.conf.tsinghua (+2, -0)
  12. docker/rcfiles/sources.list.aliyun (+25, -0)
  13. docker/rcfiles/user.vimrc (+10, -0)
  14. docker/scripts/install_libs.sh (+12, -0)
  15. docs/source/conf.py (+1, -1)
  16. docs/source/develop.md (+120, -3)
  17. modelscope/models/__init__.py (+1, -1)
  18. modelscope/models/audio/__init__.py (+0, -0)
  19. modelscope/models/audio/layers/__init__.py (+0, -0)
  20. modelscope/models/audio/layers/activations.py (+60, -0)
  21. modelscope/models/audio/layers/affine_transform.py (+78, -0)
  22. modelscope/models/audio/layers/deep_fsmn.py (+178, -0)
  23. modelscope/models/audio/layers/layer_base.py (+50, -0)
  24. modelscope/models/audio/layers/uni_deep_fsmn.py (+482, -0)
  25. modelscope/models/audio/network/__init__.py (+0, -0)
  26. modelscope/models/audio/network/loss.py (+394, -0)
  27. modelscope/models/audio/network/modulation_loss.py (+248, -0)
  28. modelscope/models/audio/network/se_net.py (+483, -0)
  29. modelscope/models/base.py (+17, -9)
  30. modelscope/models/nlp/__init__.py (+4, -2)
  31. modelscope/models/nlp/bert_for_sequence_classification.py (+17, -0)
  32. modelscope/models/nlp/palm_for_text_generation.py (+43, -0)
  33. modelscope/models/nlp/sbert_for_sentence_similarity.py (+88, -0)
  34. modelscope/models/nlp/sbert_for_token_classification.py (+56, -0)
  35. modelscope/models/nlp/text_generation_model.py (+0, -52)
  36. modelscope/pipelines/__init__.py (+1, -1)
  37. modelscope/pipelines/audio/__init__.py (+1, -0)
  38. modelscope/pipelines/audio/linear_aec_pipeline.py (+160, -0)
  39. modelscope/pipelines/base.py (+29, -1)
  40. modelscope/pipelines/builder.py (+10, -7)
  41. modelscope/pipelines/cv/image_matting_pipeline.py (+3, -3)
  42. modelscope/pipelines/multi_modal/__init__.py (+1, -1)
  43. modelscope/pipelines/multi_modal/image_caption_pipeline.py (+5, -2)
  44. modelscope/pipelines/nlp/__init__.py (+2, -0)
  45. modelscope/pipelines/nlp/sentence_similarity_pipeline.py (+62, -0)
  46. modelscope/pipelines/nlp/sequence_classification_pipeline.py (+16, -40)
  47. modelscope/pipelines/nlp/text_generation_pipeline.py (+23, -20)
  48. modelscope/pipelines/nlp/word_segmentation_pipeline.py (+69, -0)
  49. modelscope/pipelines/outputs.py (+117, -0)
  50. modelscope/pipelines/util.py (+21, -17)
  51. modelscope/preprocessors/__init__.py (+1, -1)
  52. modelscope/preprocessors/audio.py (+230, -0)
  53. modelscope/preprocessors/image.py (+1, -1)
  54. modelscope/preprocessors/nlp.py (+82, -16)
  55. modelscope/utils/config.py (+6, -6)
  56. modelscope/utils/constant.py (+15, -13)
  57. modelscope/utils/registry.py (+1, -1)
  58. modelscope/utils/test_utils.py (+20, -0)
  59. requirements/docs.txt (+1, -0)
  60. requirements/nlp.txt (+1, -1)
  61. requirements/runtime.txt (+3, -2)
  62. setup.cfg (+2, -1)
  63. tests/pipelines/test_base.py (+8, -11)
  64. tests/pipelines/test_image_captioning.py (+3, -4)
  65. tests/pipelines/test_image_matting.py (+11, -15)
  66. tests/pipelines/test_person_image_cartoon.py (+3, -0)
  67. tests/pipelines/test_sentence_similarity.py (+67, -0)
  68. tests/pipelines/test_speech_signal_process.py (+56, -0)
  69. tests/pipelines/test_text_classification.py (+6, -0)
  70. tests/pipelines/test_text_generation.py (+54, -26)
  71. tests/pipelines/test_word_segmentation.py (+62, -0)
  72. tests/preprocessors/test_image.py (+20, -0)
  73. tests/run.py (+9, -0)
  74. tests/utils/test_config.py (+5, -8)

.gitattributes (+3, -0)

@@ -0,0 +1,3 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text

.gitignore (+0, -1)

@@ -104,7 +104,6 @@ venv.bak/
# mypy
.mypy_cache/

data
.vscode
.idea



Makefile.docker (+67, -0)

@@ -0,0 +1,67 @@
DOCKER_REGISTRY = registry.cn-shanghai.aliyuncs.com
DOCKER_ORG = modelscope
DOCKER_IMAGE = modelscope
DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE)

# CUDA_VERSION = 11.3
# CUDNN_VERSION = 8
BASE_RUNTIME = reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
# BASE_DEVEL = reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
BASE_DEVEL = pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel


MODELSCOPE_VERSION = $(shell git describe --tags --always)

# Can be either official / dev
BUILD_TYPE = dev
BUILD_PROGRESS = auto
BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE)

EXTRA_DOCKER_BUILD_FLAGS ?= --network=host
# DOCKER_BUILD = DOCKER_BUILDKIT=1 \
# docker build \
# --progress=$(BUILD_PROGRESS) \
# $(EXTRA_DOCKER_BUILD_FLAGS) \
# --target $(BUILD_TYPE) \
# -t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
# $(BUILD_ARGS) \
# -f docker/pytorch.dockerfile .
DOCKER_BUILD = DOCKER_BUILDKIT=1 \
docker build \
$(EXTRA_DOCKER_BUILD_FLAGS) \
-t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
$(BUILD_ARGS) \
-f docker/pytorch.dockerfile .
DOCKER_PUSH = docker push $(DOCKER_FULL_NAME):$(DOCKER_TAG)

.PHONY: all
all: devel-image

.PHONY: devel-image
devel-image: BASE_IMAGE := $(BASE_DEVEL)
devel-image: DOCKER_TAG := $(MODELSCOPE_VERSION)-devel
devel-image:
$(DOCKER_BUILD)

.PHONY: devel-push
devel-push: BASE_IMAGE := $(BASE_DEVEL)
devel-push: DOCKER_TAG := $(MODELSCOPE_VERSION)-devel
devel-push:
$(DOCKER_PUSH)

.PHONY: runtime-image
runtime-image: BASE_IMAGE := $(BASE_RUNTIME)
runtime-image: DOCKER_TAG := $(MODELSCOPE_VERSION)-runtime
runtime-image:
$(DOCKER_BUILD)
docker tag $(DOCKER_FULL_NAME):$(DOCKER_TAG) $(DOCKER_FULL_NAME):latest

.PHONY: runtime-push
runtime-push: BASE_IMAGE := $(BASE_RUNTIME)
runtime-push: DOCKER_TAG := $(MODELSCOPE_VERSION)-runtime
runtime-push:
$(DOCKER_PUSH)

.PHONY: clean
clean:
-docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME))

configs/examples/config.json → configs/examples/configuration.json


configs/examples/config.py → configs/examples/configuration.py


configs/examples/config.yaml → configs/examples/configuration.yaml


data/test/images/image1.jpg (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:78094cc48fbcfd9b6d321fe13619ecc72b65e006fc1b4c4458409ade9979486d
size 129862

data/test/images/image_matting.png (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
size 603621

docker/.dockerignore (+4, -0)

@@ -0,0 +1,4 @@
*.sh
*.md
*.dockerfile
*.zip

docker/pytorch.dockerfile (+53, -0)

@@ -0,0 +1,53 @@
# syntax = docker/dockerfile:experimental
#
# NOTE: To build this you will need a docker version > 18.06 with
# experimental enabled and DOCKER_BUILDKIT=1
#
# If you do not use buildkit you are not going to have a good time
#
# For reference:
# https://docs.docker.com/develop/develop-images/build_enhancements/

# ARG BASE_IMAGE=reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
# FROM ${BASE_IMAGE} as dev-base

# FROM reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04 as dev-base
FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel
# FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
# config pip source
RUN mkdir /root/.pip
COPY docker/rcfiles/pip.conf.tsinghua /root/.pip/pip.conf
COPY docker/rcfiles/sources.list.aliyun /etc/apt/sources.list

# Install essential Ubuntu packages
RUN apt-get update &&\
apt-get install -y software-properties-common \
build-essential \
git \
wget \
vim \
curl \
zip \
zlib1g-dev \
unzip \
pkg-config

# install modelscope and its python env
WORKDIR /opt/modelscope
COPY . .
RUN pip install -r requirements.txt
# RUN --mount=type=cache,target=/opt/ccache \
# python setup.py install

# opencv-python-headless conflicts with the already-installed opencv-python
RUN python setup.py install \
&& pip uninstall -y opencv-python-headless

# prepare modelscope libs
COPY docker/scripts/install_libs.sh /tmp/
RUN bash /tmp/install_libs.sh && \
rm -rf /tmp/install_libs.sh

ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/modelscope/lib64

WORKDIR /workspace

docker/rcfiles/pip.conf.tsinghua (+2, -0)

@@ -0,0 +1,2 @@
[global]
index-url=https://pypi.tuna.tsinghua.edu.cn/simple

docker/rcfiles/sources.list.aliyun (+25, -0)

@@ -0,0 +1,25 @@
deb http://mirrors.aliyun.com/ubuntu/ bionic main restricted
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic main restricted

deb http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted

deb http://mirrors.aliyun.com/ubuntu/ bionic universe
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic universe
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates universe
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates universe

deb http://mirrors.aliyun.com/ubuntu/ bionic multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates multiverse

deb http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse

deb http://mirrors.aliyun.com/ubuntu bionic-security main restricted
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security main restricted
deb http://mirrors.aliyun.com/ubuntu bionic-security universe
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security universe
deb http://mirrors.aliyun.com/ubuntu bionic-security multiverse
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security multiverse

docker/rcfiles/user.vimrc (+10, -0)

@@ -0,0 +1,10 @@
set nocompatible
set encoding=utf-8
set hlsearch
set smartindent
set ruler
set number
set ts=2
set sw=2
set expandtab
autocmd FileType make setlocal noexpandtab

docker/scripts/install_libs.sh (+12, -0)

@@ -0,0 +1,12 @@
#!/bin/bash

set -eo pipefail

ModelScopeLib=/usr/local/modelscope/lib64

if [ ! -d /usr/local/modelscope ]; then
mkdir -p $ModelScopeLib
fi

# audio libs
wget "http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/libs/audio/libmitaec_pyio.so" -O ${ModelScopeLib}/libmitaec_pyio.so

docs/source/conf.py (+1, -1)

@@ -76,7 +76,7 @@ exclude_patterns = ['build', 'Thumbs.db', '.DS_Store']
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme = 'sphinx_book_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
html_theme_options = {}



docs/source/develop.md (+120, -3)

@@ -34,13 +34,111 @@ make linter
```

## 2. Test
### 2.1 Unit test

### 2.1 Test level

There are mainly three test levels:

* level 0: tests for the basic interfaces and functions of the framework, such as `tests/trainers/test_trainer_base.py`
* level 1: important functional tests which cover end-to-end workflows, such as `tests/pipelines/test_image_matting.py`
* level 2: scenario tests for all the implemented modules, such as models and pipelines in different algorithm fields.

The default test level is 0, which only runs level-0 cases. You can set the test level
via the environment variable `TEST_LEVEL`. For more details, refer to [test-doc](https://alidocs.dingtalk.com/i/nodes/mdvQnONayjBJKLXy1Bp38PY2MeXzp5o0?dontjump=true&nav=spaces&navQuery=spaceId%3Dnb9XJNlZxbgrOXyA)


```bash
# run all tests
TEST_LEVEL=2 make test

# run important functional tests
TEST_LEVEL=1 make test

# run core UT and basic functional tests
make test
```

### 2.2 Test data
TODO
When writing test cases, you should assign a test level to your test case using the
following code. If left at the default, the test level will be 0 and the case will run
in every test stage.

File: `test_module.py`
```python
from modelscope.utils.test_utils import test_level

class ImageCartoonTest(unittest.TestCase):
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_by_direct_model_download(self):
pass
```

### 2.2 Run tests

1. Run your own single test case to test your self-implemented function. You can run your
test file directly; if it fails to run, please check whether the variable `TEST_LEVEL`
exists in the environment and unset it.
```bash
python tests/path/to/your_test.py
```

2. Remember to run the core tests in your local environment before starting a code review;
by default this only runs test cases with level 0.
```bash
make test
```

3. After you start a code review, CI tests will be triggered, which run test cases with level 1.

4. Daily regression tests will run all cases at 0 AM each day using the master branch.

### 2.3 Test data storage

As we need a lot of data for testing, including images, videos, and models, we use git-lfs
to store those large files.

1. Install git-lfs
For macOS:
```bash
brew install git-lfs
git lfs install
```

For CentOS, please download the rpm from the git-lfs GitHub release [website](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0):
```bash
wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm
sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm
git lfs install
```

For Ubuntu:
```bash
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt-get install git-lfs
git lfs install
```

2. Track your data types using git-lfs; for example, to track png files:
```bash
git lfs track "*.png"
```

3. Add your test files to the `data/test/` folder; you can create directories if you need to.
```bash
git add data/test/test.png
```

4. Commit your test data to the remote branch:
```bash
git commit -m "xxx"
```

To pull data from the remote repo, do it the same way you pull git files:
```bash
git pull origin branch_name
```




## Code Review

@@ -93,3 +191,22 @@ TODO
```bash
make whl
```

## Build docker

Build the develop image:
```bash
sudo make -f Makefile.docker devel-image
```

Push the develop image (for the password, please ask wenmeng.zwm):
```bash
sudo docker login --username=mass_test@test.aliyunid.com registry.cn-shanghai.aliyuncs.com
Password:
sudo make -f Makefile.docker devel-push
```

To build the runtime image, just replace `devel` with `runtime` in the commands above:
```bash
sudo make -f Makefile.docker runtime-image runtime-push
```

modelscope/models/__init__.py (+1, -1)

@@ -2,4 +2,4 @@

from .base import Model
from .builder import MODELS, build_model
from .nlp import BertForSequenceClassification
from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity

modelscope/models/audio/__init__.py (+0, -0)


modelscope/models/audio/layers/__init__.py (+0, -0)


modelscope/models/audio/layers/activations.py (+60, -0)

@@ -0,0 +1,60 @@
import torch.nn as nn

from .layer_base import LayerBase


class RectifiedLinear(LayerBase):

def __init__(self, input_dim, output_dim):
super(RectifiedLinear, self).__init__()
self.dim = input_dim
self.relu = nn.ReLU()

def forward(self, input):
return self.relu(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<RectifiedLinear> %d %d\n' % (self.dim, self.dim)
return re_str

def load_kaldi_nnet(self, instr):
return instr


class LogSoftmax(LayerBase):

def __init__(self, input_dim, output_dim):
super(LogSoftmax, self).__init__()
self.dim = input_dim
self.ls = nn.LogSoftmax()

def forward(self, input):
return self.ls(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Softmax> %d %d\n' % (self.dim, self.dim)
return re_str

def load_kaldi_nnet(self, instr):
return instr


class Sigmoid(LayerBase):

def __init__(self, input_dim, output_dim):
super(Sigmoid, self).__init__()
self.dim = input_dim
self.sig = nn.Sigmoid()

def forward(self, input):
return self.sig(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Sigmoid> %d %d\n' % (self.dim, self.dim)
return re_str

def load_kaldi_nnet(self, instr):
return instr
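
A minimal usage sketch for these wrappers (assuming this commit's module path `modelscope.models.audio.layers.activations`): each layer keeps only a dimension for Kaldi serialization and delegates the forward pass to the underlying torch activation.

```python
# Sketch, not part of the commit: exercise RectifiedLinear and its Kaldi export.
import torch
from modelscope.models.audio.layers.activations import RectifiedLinear

relu = RectifiedLinear(input_dim=4, output_dim=4)
print(relu(torch.tensor([-1.0, 0.5, 2.0, -3.0])))  # tensor([0.0000, 0.5000, 2.0000, 0.0000])
print(relu.to_kaldi_nnet())                        # "<RectifiedLinear> 4 4"
```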

modelscope/models/audio/layers/affine_transform.py (+78, -0)

@@ -0,0 +1,78 @@
import numpy as np
import torch as th
import torch.nn as nn

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
to_kaldi_matrix)


class AffineTransform(LayerBase):

def __init__(self, input_dim, output_dim):
super(AffineTransform, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.linear = nn.Linear(input_dim, output_dim)

def forward(self, input):
return self.linear(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<AffineTransform> %d %d\n' % (self.output_dim,
self.input_dim)
re_str += '<LearnRateCoef> 1 <BiasLearnRateCoef> 1 <MaxNorm> 0\n'
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
return re_str

def to_raw_nnet(self, fid):
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
x.tofile(fid)

linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
x.tofile(fid)

def load_kaldi_nnet(self, instr):
output = expect_token_number(
instr,
'<LearnRateCoef>',
)
if output is None:
raise Exception('AffineTransform format error for <LearnRateCoef>')
instr, lr = output

output = expect_token_number(instr, '<BiasLearnRateCoef>')
if output is None:
raise Exception(
'AffineTransform format error for <BiasLearnRateCoef>')
instr, lr = output

output = expect_token_number(instr, '<MaxNorm>')
if output is None:
raise Exception('AffineTransform format error for <MaxNorm>')
instr, lr = output

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('AffineTransform format error for parsing matrix')
instr, mat = output

print(mat.shape)
self.linear.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('AffineTransform format error for parsing matrix')
instr, mat = output
mat = np.squeeze(mat)
self.linear.bias = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))
return instr
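
A hedged round-trip sketch for `AffineTransform` (import path assumed from this commit): `to_kaldi_nnet()` emits a header line that `load_kaldi_nnet()` does not consume, so strip it before parsing.

```python
# Sketch: Kaldi-text round trip for AffineTransform.
import torch as th
from modelscope.models.audio.layers.affine_transform import AffineTransform

src = AffineTransform(input_dim=3, output_dim=2)
text = src.to_kaldi_nnet()                    # "<AffineTransform> 2 3" header, then weights
dst = AffineTransform(input_dim=3, output_dim=2)
dst.load_kaldi_nnet(text.split('\n', 1)[1])   # parse everything after the header line
assert th.allclose(src.linear.weight, dst.linear.weight, atol=1e-4)
```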

modelscope/models/audio/layers/deep_fsmn.py (+178, -0)

@@ -0,0 +1,178 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
to_kaldi_matrix)


class DeepFsmn(LayerBase):

def __init__(self,
input_dim,
output_dim,
lorder=None,
rorder=None,
hidden_size=None,
layer_norm=False,
dropout=0):
super(DeepFsmn, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim

if lorder is None:
return

self.lorder = lorder
self.rorder = rorder
self.hidden_size = hidden_size
self.layer_norm = layer_norm

self.linear = nn.Linear(input_dim, hidden_size)
self.norm = nn.LayerNorm(hidden_size)
self.drop1 = nn.Dropout(p=dropout)
self.drop2 = nn.Dropout(p=dropout)
self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.conv1 = nn.Conv2d(
output_dim,
output_dim, [lorder, 1], [1, 1],
groups=output_dim,
bias=False)
self.conv2 = nn.Conv2d(
output_dim,
output_dim, [rorder, 1], [1, 1],
groups=output_dim,
bias=False)

def forward(self, input):

f1 = F.relu(self.linear(input))

f1 = self.drop1(f1)
if self.layer_norm:
f1 = self.norm(f1)

p1 = self.project(f1)

x = th.unsqueeze(p1, 1)

x_per = x.permute(0, 3, 2, 1)

y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
yr = F.pad(x_per, [0, 0, 0, self.rorder])
yr = yr[:, :, 1:, :]

out = x_per + self.conv1(y) + self.conv2(yr)
out = self.drop2(out)

out1 = out.permute(0, 3, 2, 1)

return input + out1.squeeze()

def to_kaldi_nnet(self):
re_str = ''
re_str += '<UniDeepFsmn> %d %d\n'\
% (self.output_dim, self.input_dim)
re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n'\
% (1, self.hidden_size, self.lorder, 1)
lfiters = self.state_dict()['conv1.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
re_str += to_kaldi_matrix(x)
proj_weights = self.state_dict()['project.weight']
x = proj_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
return re_str

def load_kaldi_nnet(self, instr):
output = expect_token_number(
instr,
'<LearnRateCoef>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
instr, lr = output

output = expect_token_number(
instr,
'<HidSize>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <HidSize>')
instr, hiddensize = output
self.hidden_size = int(hiddensize)

output = expect_token_number(
instr,
'<LOrder>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LOrder>')
instr, lorder = output
self.lorder = int(lorder)

output = expect_token_number(
instr,
'<LStride>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LStride>')
instr, lstride = output
self.lstride = lstride

output = expect_token_number(
instr,
'<MaxNorm>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <MaxNorm>')

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
mat1 = np.fliplr(mat.T).copy()
self.conv1 = nn.Conv2d(
self.output_dim,
self.output_dim, [self.lorder, 1], [1, 1],
groups=self.output_dim,
bias=False)
mat_th = th.from_numpy(mat1).type(th.FloatTensor)
mat_th = mat_th.unsqueeze(1)
mat_th = mat_th.unsqueeze(3)
self.conv1.weight = th.nn.Parameter(mat_th)

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output

self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
self.linear = nn.Linear(self.input_dim, self.hidden_size)

self.project.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
self.linear.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
self.linear.bias = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

return instr
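
A quick shape check for `DeepFsmn` (import path assumed from this commit): the memory block is a grouped temporal convolution that looks `lorder` frames into the past and `rorder` frames into the future, and the residual connection keeps input and output shapes identical.

```python
# Sketch: run a DeepFsmn block over a [batch, frames, features] tensor.
import torch as th
from modelscope.models.audio.layers.deep_fsmn import DeepFsmn

layer = DeepFsmn(input_dim=64, output_dim=64, lorder=20, rorder=5, hidden_size=128)
x = th.randn(2, 100, 64)
y = layer(x)
assert y.shape == x.shape  # residual: input + memory-block output
```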

modelscope/models/audio/layers/layer_base.py (+50, -0)

@@ -0,0 +1,50 @@
import abc
import re

import numpy as np
import torch.nn as nn


def expect_token_number(instr, token):
first_token = re.match(r'^\s*' + token, instr)
if first_token is None:
return None
instr = instr[first_token.end():]
lr = re.match(r'^\s*(-?\d+\.?\d*e?-?\d*?)', instr)
if lr is None:
return None
return instr[lr.end():], lr.groups()[0]


def expect_kaldi_matrix(instr):
pos2 = instr.find('[', 0)
pos3 = instr.find(']', pos2)
mat = []
for stt in instr[pos2 + 1:pos3].split('\n'):
tmp_mat = np.fromstring(stt, dtype=np.float32, sep=' ')
if tmp_mat.size > 0:
mat.append(tmp_mat)
return instr[pos3 + 1:], np.array(mat)


def to_kaldi_matrix(np_mat):
"""
function that transform as str numpy mat to standard kaldi str matrix
:param np_mat: numpy mat
:return: str
"""
np.set_printoptions(threshold=np.inf, linewidth=np.nan, suppress=True)
out_str = str(np_mat)
out_str = out_str.replace('[', '')
out_str = out_str.replace(']', '')
return '[ %s ]\n' % out_str


class LayerBase(nn.Module, metaclass=abc.ABCMeta):

def __init__(self):
super(LayerBase, self).__init__()

@abc.abstractmethod
def to_kaldi_nnet(self):
pass
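
The helpers above thread a string through the parser, each returning the unconsumed remainder. A small sketch (import path assumed from this commit):

```python
# Sketch: parse a token and a matrix back out of Kaldi nnet1 text.
import numpy as np
from modelscope.models.audio.layers.layer_base import (
    expect_kaldi_matrix, expect_token_number, to_kaldi_matrix)

text = '<LearnRateCoef> 1 ' + to_kaldi_matrix(np.eye(2, dtype=np.float32))
rest, coef = expect_token_number(text, '<LearnRateCoef>')
rest, mat = expect_kaldi_matrix(rest)
print(coef, mat.shape)  # '1' (2, 2)
```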

modelscope/models/audio/layers/uni_deep_fsmn.py (+482, -0)

@@ -0,0 +1,482 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
to_kaldi_matrix)


class SepConv(nn.Module):

def __init__(self,
in_channels,
filters,
out_channels,
kernel_size=(5, 2),
dilation=(1, 1)):
""" :param kernel_size (time, frequency)

"""
super(SepConv, self).__init__()
# depthwise + pointwise
self.dconv = nn.Conv2d(
in_channels,
in_channels * filters,
kernel_size,
dilation=dilation,
groups=in_channels)
self.pconv = nn.Conv2d(
in_channels * filters, out_channels, kernel_size=1)
self.padding = dilation[0] * (kernel_size[0] - 1)

def forward(self, input):
''' input: [B, C, T, F]
'''
x = F.pad(input, [0, 0, self.padding, 0])
x = self.dconv(x)
x = self.pconv(x)
return x


class Conv2d(nn.Module):

def __init__(self,
input_dim,
output_dim,
lorder=20,
rorder=0,
groups=1,
bias=False,
skip_connect=True):
super(Conv2d, self).__init__()
self.lorder = lorder
self.conv = nn.Conv2d(
input_dim, output_dim, [lorder, 1], groups=groups, bias=bias)
self.rorder = rorder
if self.rorder:
self.conv2 = nn.Conv2d(
input_dim, output_dim, [rorder, 1], groups=groups, bias=bias)
self.skip_connect = skip_connect

def forward(self, input):
# [B, 1, T, F]
x = th.unsqueeze(input, 1)
# [B, F, T, 1]
x_per = x.permute(0, 3, 2, 1)
y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
out = self.conv(y)
if self.rorder:
yr = F.pad(x_per, [0, 0, 0, self.rorder])
yr = yr[:, :, 1:, :]
out += self.conv2(yr)
out = out.permute(0, 3, 2, 1).squeeze(1)
if self.skip_connect:
out = out + input
return out


class SelfAttLayer(nn.Module):

def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None):
super(SelfAttLayer, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim

if lorder is None:
return

self.lorder = lorder
self.hidden_size = hidden_size

self.linear = nn.Linear(input_dim, hidden_size)

self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.att = nn.Linear(input_dim, lorder, bias=False)

def forward(self, input):

f1 = F.relu(self.linear(input))

p1 = self.project(f1)

x = th.unsqueeze(p1, 1)

x_per = x.permute(0, 3, 2, 1)

y = F.pad(x_per, [0, 0, self.lorder - 1, 0])

# z [B, F, T, lorder]
z = x_per
for i in range(1, self.lorder):
z = th.cat([z, y[:, :, self.lorder - 1 - i:-i, :]], axis=-1)

# [B, T, lorder]
att = F.softmax(self.att(input), dim=-1)
att = th.unsqueeze(att, 1)
z = th.sum(z * att, axis=-1)

out1 = z.permute(0, 2, 1)

return input + out1


class TFFsmn(nn.Module):

def __init__(self,
input_dim,
output_dim,
lorder=None,
hidden_size=None,
dilation=1,
layer_norm=False,
dropout=0,
skip_connect=True):
super(TFFsmn, self).__init__()

self.skip_connect = skip_connect

self.linear = nn.Linear(input_dim, hidden_size)
self.norm = nn.Identity()
if layer_norm:
self.norm = nn.LayerNorm(input_dim)
self.act = nn.ReLU()
self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.conv1 = nn.Conv2d(
output_dim,
output_dim, [lorder, 1],
dilation=[dilation, 1],
groups=output_dim,
bias=False)
self.padding_left = dilation * (lorder - 1)
dorder = 5
self.conv2 = nn.Conv2d(1, 1, [dorder, 1], bias=False)
self.padding_freq = dorder - 1

def forward(self, input):
return self.compute1(input)

def compute1(self, input):
''' linear-dconv-relu(norm)-linear-dconv
'''
x = self.linear(input)
# [B, 1, F, T]
x = th.unsqueeze(x, 1).permute(0, 1, 3, 2)
z = F.pad(x, [0, 0, self.padding_freq, 0])
z = self.conv2(z) + x
x = z.permute(0, 3, 2, 1).squeeze(-1)
x = self.act(x)
x = self.norm(x)
x = self.project(x)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
# [B, F, T+lorder-1, 1]
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()

return input + out


class CNNFsmn(nn.Module):
''' use cnn to reduce parameters
'''

def __init__(self,
input_dim,
output_dim,
lorder=None,
hidden_size=None,
dilation=1,
layer_norm=False,
dropout=0,
skip_connect=True):
super(CNNFsmn, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim
self.skip_connect = skip_connect

if lorder is None:
return

self.lorder = lorder
self.hidden_size = hidden_size

self.linear = nn.Linear(input_dim, hidden_size)
self.act = nn.ReLU()
kernel_size = (3, 8)
stride = (1, 4)
self.conv = nn.Sequential(
nn.ConstantPad2d((stride[1], 0, kernel_size[0] - 1, 0), 0),
nn.Conv2d(1, stride[1], kernel_size=kernel_size, stride=stride))

self.dconv = nn.Conv2d(
output_dim,
output_dim, [lorder, 1],
dilation=[dilation, 1],
groups=output_dim,
bias=False)
self.padding_left = dilation * (lorder - 1)

def forward(self, input):
return self.compute2(input)

def compute1(self, input):
''' linear-relu(norm)-conv2d-relu?-dconv
'''
# [B, T, F]
x = self.linear(input)
x = self.act(x)
x = th.unsqueeze(x, 1)
x = self.conv(x)
# [B, C, T, F] -> [B, 1, T, F]
b, c, t, f = x.shape
x = x.view([b, 1, t, -1])
x = x.permute(0, 3, 2, 1)
# [B, F, T+lorder-1, 1]
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.dconv(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
return input + out

def compute2(self, input):
''' conv2d-relu-linear-relu?-dconv
'''
x = th.unsqueeze(input, 1)
x = self.conv(x)
x = self.act(x)
# [B, C, T, F] -> [B, T, F]
b, c, t, f = x.shape
x = x.view([b, t, -1])
x = self.linear(x)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.dconv(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
return input + out


class UniDeepFsmn(LayerBase):

def __init__(self,
input_dim,
output_dim,
lorder=None,
hidden_size=None,
dilation=1,
layer_norm=False,
dropout=0,
skip_connect=True):
super(UniDeepFsmn, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim
self.skip_connect = skip_connect

if lorder is None:
return

self.lorder = lorder
self.hidden_size = hidden_size

self.linear = nn.Linear(input_dim, hidden_size)
self.norm = nn.Identity()
if layer_norm:
self.norm = nn.LayerNorm(input_dim)
self.act = nn.ReLU()
self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.conv1 = nn.Conv2d(
output_dim,
output_dim, [lorder, 1],
dilation=[dilation, 1],
groups=output_dim,
bias=False)
self.padding_left = dilation * (lorder - 1)

def forward(self, input):
return self.compute1(input)

def compute1(self, input):
''' linear-relu(norm)-linear-dconv
'''
# [B, T, F]
x = self.linear(input)
x = self.act(x)
x = self.norm(x)
x = self.project(x)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
# [B, F, T+lorder-1, 1]
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()

return input + out

def compute2(self, input):
''' linear-dconv-linear-relu(norm)
'''
x = self.project(input)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
x = self.linear(out)
x = self.act(x)
x = self.norm(x)

return input + x

def compute3(self, input):
''' dconv-linear-relu(norm)-linear
'''
x = th.unsqueeze(input, 1).permute(0, 3, 2, 1)
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
x = self.linear(out)
x = self.act(x)
x = self.norm(x)
x = self.project(x)

return input + x

def to_kaldi_nnet(self):
re_str = ''
re_str += '<UniDeepFsmn> %d %d\n' \
% (self.output_dim, self.input_dim)
re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n' \
% (1, self.hidden_size, self.lorder, 1)
lfiters = self.state_dict()['conv1.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
re_str += to_kaldi_matrix(x)
proj_weights = self.state_dict()['project.weight']
x = proj_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
return re_str

def to_raw_nnet(self, fid):
lfiters = self.state_dict()['conv1.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
x.tofile(fid)

proj_weights = self.state_dict()['project.weight']
x = proj_weights.squeeze().numpy()
x.tofile(fid)

linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
x.tofile(fid)

linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
x.tofile(fid)

def load_kaldi_nnet(self, instr):
output = expect_token_number(
instr,
'<LearnRateCoef>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
instr, lr = output

output = expect_token_number(
instr,
'<HidSize>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <HidSize>')
instr, hiddensize = output
self.hidden_size = int(hiddensize)

output = expect_token_number(
instr,
'<LOrder>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LOrder>')
instr, lorder = output
self.lorder = int(lorder)

output = expect_token_number(
instr,
'<LStride>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LStride>')
instr, lstride = output
self.lstride = lstride

output = expect_token_number(
instr,
'<MaxNorm>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <MaxNorm>')

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
mat1 = np.fliplr(mat.T).copy()

self.conv1 = nn.Conv2d(
self.output_dim,
self.output_dim, [self.lorder, 1], [1, 1],
groups=self.output_dim,
bias=False)

mat_th = th.from_numpy(mat1).type(th.FloatTensor)
mat_th = mat_th.unsqueeze(1)
mat_th = mat_th.unsqueeze(3)
self.conv1.weight = th.nn.Parameter(mat_th)

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output

self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
self.linear = nn.Linear(self.input_dim, self.hidden_size)

self.project.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
self.linear.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
mat = np.squeeze(mat)
self.linear.bias = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

return instr
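
`UniDeepFsmn` is the causal (uni-directional) variant: `compute1` pads `dilation * (lorder - 1)` frames on the left only, so no future context is used. A shape sketch under the same import-path assumption:

```python
# Sketch: a causal UniDeepFsmn block preserves [batch, frames, features].
import torch as th
from modelscope.models.audio.layers.uni_deep_fsmn import UniDeepFsmn

layer = UniDeepFsmn(input_dim=64, output_dim=64, lorder=20, hidden_size=128)
x = th.randn(2, 100, 64)
y = layer(x)
assert y.shape == x.shape
```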

modelscope/models/audio/network/__init__.py (+0, -0)


modelscope/models/audio/network/loss.py (+394, -0)

@@ -0,0 +1,394 @@
import torch
import torch.nn.functional as F

from .modulation_loss import (GaborSTRFConv, MelScale,
ModulationDomainLossModule)

EPS = 1e-8


def compute_mask(mixed_spec, clean_spec, mask_type='psmiam', clip=1):
'''Compute a time-frequency mask from mixed and clean STFTs.

stft inputs: (batch, ..., 2) real/imag pairs or complex (batch, ...)
signal model: y = x + n (mixed = clean + noise)
'''
if torch.is_complex(mixed_spec):
yr, yi = mixed_spec.real, mixed_spec.imag
else:
yr, yi = mixed_spec[..., 0], mixed_spec[..., 1]
if torch.is_complex(clean_spec):
xr, xi = clean_spec.real, clean_spec.imag
else:
xr, xi = clean_spec[..., 0], clean_spec[..., 1]

if mask_type == 'iam':
ymag = torch.sqrt(yr**2 + yi**2)
xmag = torch.sqrt(xr**2 + xi**2)
iam = xmag / (ymag + EPS)
return torch.clamp(iam, 0, 1)

elif mask_type == 'psm':
ypow = yr**2 + yi**2
psm = (xr * yr + xi * yi) / (ypow + EPS)
return torch.clamp(psm, 0, 1)

elif mask_type == 'psmiam':
ypow = yr**2 + yi**2
psm = (xr * yr + xi * yi) / (ypow + EPS)
ymag = torch.sqrt(yr**2 + yi**2)
xmag = torch.sqrt(xr**2 + xi**2)
iam = xmag / (ymag + EPS)
psmiam = psm * iam
return torch.clamp(psmiam, 0, 1)

elif mask_type == 'crm':
ypow = yr**2 + yi**2
mr = (xr * yr + xi * yi) / (ypow + EPS)
mi = (xi * yr - xr * yi) / (ypow + EPS)
mr = torch.clamp(mr, -clip, clip)
mi = torch.clamp(mi, -clip, clip)
return mr, mi


def energy_vad(spec,
thdhigh=320 * 600 * 600 * 2,
thdlow=320 * 300 * 300 * 2,
int16=True):
'''
energy based vad should be accurate enough
spec: (batch, bins, frames, 2)
returns (batch, frames)
'''
energy = torch.sum(spec[..., 0]**2 + spec[..., 1]**2, dim=1)
vad = (energy > thdhigh).float()  # cast to float so the 0.5 soft label below is preserved
idx = torch.logical_and(vad == 0, energy > thdlow)
vad[idx] = 0.5
return vad


def modulation_loss_init(n_fft):
gabor_strf_parameters = torch.load(
'./network/gabor_strf_parameters.pt')['state_dict']
gabor_modulation_kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
gabor_modulation_kernels.load_state_dict(gabor_strf_parameters)

modulation_loss_module = ModulationDomainLossModule(
gabor_modulation_kernels.eval())
for param in modulation_loss_module.parameters():
param.requires_grad = False

stft2mel = MelScale(
n_mels=80, sample_rate=16000, n_stft=n_fft // 2 + 1).cuda()

return modulation_loss_module, stft2mel


def mask_loss_function(
loss_func='psm_loss',
loss_type='mse', # ['mse', 'mae', 'comb']
mask_type='psmiam',
use_mod_loss=False,
use_wav2vec_loss=False,
n_fft=640,
hop_length=320,
EPS=1e-8,
weight=None):
if weight is not None:
print(f'Use loss weight: {weight}')
winlen = n_fft
window = torch.hamming_window(winlen, periodic=False)

def stft(x, return_complex=False):
# returns [batch, bins, frames, 2]
return torch.stft(
x,
n_fft,
hop_length,
winlen,
window=window.to(x.device),
center=False,
return_complex=return_complex)

def istft(x, slen):
return torch.istft(
x,
n_fft,
hop_length,
winlen,
window=window.to(x.device),
center=False,
length=slen)

def mask_loss(targets, masks, nframes):
''' [Batch, Time, Frequency]
'''
with torch.no_grad():
mask_for_loss = torch.ones_like(targets)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks = masks * mask_for_loss
targets = targets * mask_for_loss

if weight is None:
alpha = 1
else: # for aec ST
alpha = weight - targets

if loss_type == 'mse':
loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2))
elif loss_type == 'mae':
loss = torch.sum(alpha * torch.abs(targets - masks))
else: # mse(mask), mae(mask) approx 1:2
loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2)
+ 0.1 * alpha * torch.abs(targets - masks))
loss /= torch.sum(nframes)
return loss

def spectrum_loss(targets, spec, nframes):
''' [Batch, Time, Frequency, 2]
'''
with torch.no_grad():
mask_for_loss = torch.ones_like(targets[..., 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
xr = spec[..., 0] * mask_for_loss
xi = spec[..., 1] * mask_for_loss
yr = targets[..., 0] * mask_for_loss
yi = targets[..., 1] * mask_for_loss
xmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2) * mask_for_loss
ymag = torch.sqrt(targets[..., 0]**2
+ targets[..., 1]**2) * mask_for_loss

loss1 = torch.sum(torch.pow(xr - yr, 2) + torch.pow(xi - yi, 2))
loss2 = torch.sum(torch.pow(xmag - ymag, 2))

loss = (loss1 + loss2) / torch.sum(nframes)
return loss

def sa_loss_dlen(mixed, clean, masks, nframes):
yspec = stft(mixed).permute([0, 2, 1, 3]) / 32768
xspec = stft(clean).permute([0, 2, 1, 3]) / 32768
with torch.no_grad():
mask_for_loss = torch.ones_like(xspec[..., 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
emag = ((yspec[..., 0]**2 + yspec[..., 1]**2)**0.15) * (masks**0.3)
xmag = (xspec[..., 0]**2 + xspec[..., 1]**2)**0.15
emag = emag * mask_for_loss
xmag = xmag * mask_for_loss

loss = torch.sum(torch.pow(emag - xmag, 2)) / torch.sum(nframes)
return loss

def psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed)
clean_spec = stft(clean)
targets = compute_mask(mixed_spec, clean_spec, mask_type)
# [B, T, F]
targets = targets.permute(0, 2, 1)

loss = mask_loss(targets, masks, nframes)

if subtask is not None:
vadtargets = energy_vad(clean_spec)
with torch.no_grad():
mask_for_loss = torch.ones_like(targets[:, :, 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:] = 0
subtask = subtask[:, :, 0] * mask_for_loss
vadtargets = vadtargets * mask_for_loss

loss_vad = F.binary_cross_entropy(subtask, vadtargets)
return loss + loss_vad
return loss

def modulation_loss(mixed, clean, masks, nframes, subtask=None):
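# NOTE: uses modulation_loss_module and stft2mel from modulation_loss_init();
# they are not created inside this factory.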
mixed_spec = stft(mixed, True)
clean_spec = stft(clean, True)
enhanced_mag = torch.abs(mixed_spec)
clean_mag = torch.abs(clean_spec)
with torch.no_grad():
mask_for_loss = torch.ones_like(clean_mag)
for idx, num in enumerate(nframes):
mask_for_loss[idx, :, num:] = 0
clean_mag = clean_mag * mask_for_loss
enhanced_mag = enhanced_mag * mask_for_loss * masks.permute([0, 2, 1])

# Covert to log-mel representation
# (B,T,#mel_channels)
clean_log_mel = torch.log(
torch.transpose(stft2mel(clean_mag**2), 2, 1) + 1e-8)
enhanced_log_mel = torch.log(
torch.transpose(stft2mel(enhanced_mag**2), 2, 1) + 1e-8)

alpha = compute_mask(mixed_spec, clean_spec, mask_type)
alpha = alpha.permute(0, 2, 1)
loss = 0.05 * modulation_loss_module(enhanced_log_mel, clean_log_mel,
alpha)
loss2 = psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask)
# print(loss.item(), loss2.item()) #approx 1:4
loss = loss + loss2
return loss

def wav2vec_loss(mixed, clean, masks, nframes, subtask=None):
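# NOTE: wav2vec_loss_module is assumed to be provided globally;
# it is not defined in this file.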
mixed /= 32768
clean /= 32768
mixed_spec = stft(mixed)
with torch.no_grad():
mask_for_loss = torch.ones_like(masks)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks_est = masks * mask_for_loss

estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
est_clean = istft(estimate, clean.shape[1])
loss = wav2vec_loss_module(est_clean, clean)
return loss

def sisdr_loss_dlen(mixed,
clean,
masks,
nframes,
subtask=None,
zero_mean=True):
mixed_spec = stft(mixed)
with torch.no_grad():
mask_for_loss = torch.ones_like(masks)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks_est = masks * mask_for_loss

estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
est_clean = istft(estimate, clean.shape[1])
flen = min(clean.shape[1], est_clean.shape[1])
clean = clean[:, :flen]
est_clean = est_clean[:, :flen]

# follow asteroid/losses/sdr.py
if zero_mean:
clean = clean - torch.mean(clean, dim=1, keepdim=True)
est_clean = est_clean - torch.mean(est_clean, dim=1, keepdim=True)

dot = torch.sum(est_clean * clean, dim=1, keepdim=True)
s_clean_energy = torch.sum(clean**2, dim=1, keepdim=True) + EPS
scaled_clean = dot * clean / s_clean_energy
e_noise = est_clean - scaled_clean

# [batch]
sisdr = torch.sum(
scaled_clean**2, dim=1) / (
torch.sum(e_noise**2, dim=1) + EPS)
sisdr = -10 * torch.log10(sisdr + EPS)
loss = sisdr.mean()
return loss

def sisdr_freq_loss_dlen(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed)
clean_spec = stft(clean)
with torch.no_grad():
mask_for_loss = torch.ones_like(masks)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks_est = masks * mask_for_loss

estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)

dot_real = estimate[..., 0] * clean_spec[..., 0] + \
estimate[..., 1] * clean_spec[..., 1]
dot_imag = estimate[..., 0] * clean_spec[..., 1] - \
estimate[..., 1] * clean_spec[..., 0]
dot = torch.cat([dot_real.unsqueeze(3), dot_imag.unsqueeze(3)], dim=-1)
s_clean_energy = clean_spec[..., 0] ** 2 + \
clean_spec[..., 1] ** 2 + EPS
scaled_clean = dot * clean_spec / s_clean_energy.unsqueeze(3)
e_noise = estimate - scaled_clean

# [batch]
scaled_clean_energy = torch.sum(
scaled_clean[..., 0]**2 + scaled_clean[..., 1]**2, dim=1)
e_noise_energy = torch.sum(
e_noise[..., 0]**2 + e_noise[..., 1]**2, dim=1)
sisdr = torch.sum(
scaled_clean_energy, dim=1) / (
torch.sum(e_noise_energy, dim=1) + EPS)
sisdr = -10 * torch.log10(sisdr + EPS)
loss = sisdr.mean()
return loss

def crm_loss_dlen(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed).permute([0, 2, 1, 3])
clean_spec = stft(clean).permute([0, 2, 1, 3])
mixed_spec = mixed_spec / 32768
clean_spec = clean_spec / 32768
tgt_mr, tgt_mi = compute_mask(mixed_spec, clean_spec, mask_type='crm')

D = int(masks.shape[2] / 2)
with torch.no_grad():
mask_for_loss = torch.ones_like(clean_spec[..., 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
mr = masks[..., :D] * mask_for_loss
mi = masks[..., D:] * mask_for_loss
tgt_mr = tgt_mr * mask_for_loss
tgt_mi = tgt_mi * mask_for_loss

if weight is None:
alpha = 1
else:
alpha = weight - tgt_mr
# signal approximation
yr = mixed_spec[..., 0]
yi = mixed_spec[..., 1]
loss1 = torch.sum(alpha * torch.pow((mr * yr - mi * yi) - clean_spec[..., 0], 2)) \
+ torch.sum(alpha * torch.pow((mr * yi + mi * yr) - clean_spec[..., 1], 2))
# mask approximation
loss2 = torch.sum(alpha * torch.pow(mr - tgt_mr, 2)) \
+ torch.sum(alpha * torch.pow(mi - tgt_mi, 2))
loss = 0.5 * (loss1 + loss2) / torch.sum(nframes)
return loss

def crm_miso_loss_dlen(mixed, clean, masks, nframes):
return crm_loss_dlen(mixed[..., 0], clean[..., 0], masks, nframes)

def mimo_loss_dlen(mixed, clean, masks, nframes):
chs = mixed.shape[-1]
D = masks.shape[2] // chs
loss = psm_vad_loss_dlen(mixed[..., 0], clean[..., 0], masks[..., :D],
nframes)
for ch in range(1, chs):
loss1 = psm_vad_loss_dlen(mixed[..., ch], clean[..., ch],
masks[..., ch * D:ch * D + D], nframes)
loss = loss + loss1
return loss / chs

def spec_loss_dlen(mixed, clean, spec, nframes):
clean_spec = stft(clean).permute([0, 2, 1, 3])
clean_spec = clean_spec / 32768

D = spec.shape[2] // 2
spec_est = torch.cat([spec[..., :D, None], spec[..., D:, None]],
dim=-1)
loss = spectrum_loss(clean_spec, spec_est, nframes)
return loss

if loss_func == 'psm_vad_loss_dlen':
return psm_vad_loss_dlen
elif loss_func == 'sisdr_loss_dlen':
return sisdr_loss_dlen
elif loss_func == 'sisdr_freq_loss_dlen':
return sisdr_freq_loss_dlen
elif loss_func == 'crm_loss_dlen':
return crm_loss_dlen
elif loss_func == 'modulation_loss':
return modulation_loss
elif loss_func == 'wav2vec_loss':
return wav2vec_loss
elif loss_func == 'mimo_loss_dlen':
return mimo_loss_dlen
elif loss_func == 'spec_loss_dlen':
return spec_loss_dlen
elif loss_func == 'sa_loss_dlen':
return sa_loss_dlen
else:
print('unknown loss function: %s' % loss_func)
return None
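
`mask_loss_function` is a factory: it closes over the STFT settings and returns the selected loss callable. A hedged sketch of driving the PSM-based mask loss on int16-scale waveforms (as the `/ 32768` normalizations above suggest), under this commit's module layout and PyTorch 1.10-era `torch.stft` behavior:

```python
# Sketch: build and evaluate the PSM mask loss on random int16-scale audio.
import torch
from modelscope.models.audio.network.loss import mask_loss_function

loss_fn = mask_loss_function(loss_func='psm_vad_loss_dlen',
                             mask_type='psmiam', n_fft=640, hop_length=320)
mixed = 1000.0 * torch.randn(2, 16000)   # [batch, samples]
clean = 0.5 * mixed
masks = torch.rand(2, 49, 321)           # [batch, frames, bins]; 49 = (16000 - 640) // 320 + 1
nframes = torch.tensor([49, 40])         # valid frames per utterance
print(loss_fn(mixed, clean, masks, nframes))
```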

modelscope/models/audio/network/modulation_loss.py (+248, -0)

@@ -0,0 +1,248 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.transforms import MelScale


class ModulationDomainLossModule(torch.nn.Module):
"""Modulation-domain loss function developed in [1] for supervised speech enhancement

In our paper, we used the gabor-based STRF kernels as the modulation kernels and used the log-mel spectrogram
as the input spectrogram representation.
Specific parameter details are in the paper and in the example below

Parameters
----------
modulation_kernels: nn.Module
Differentiable module that transforms a spectrogram representation to the modulation domain

modulation_domain = modulation_kernels(input_tf_representation)
Input Spectrogram representation (B, T, F) ---> |(M) modulation_kernels|--->Modulation Domain(B, M, T', F')

norm: boolean
Normalizes the modulation domain representation to be 0 mean across time

[1] T. Vuong, Y. Xia, and R. M. Stern, “A modulation-domain loss for neural-network-based real-time
speech enhancement”
Accepted at ICASSP 2021, https://arxiv.org/abs/2102.07330


"""

def __init__(self, modulation_kernels, norm=True):
super(ModulationDomainLossModule, self).__init__()

self.modulation_kernels = modulation_kernels
self.mse = nn.MSELoss(reduction='none')  # elementwise loss
self.norm = norm

def forward(self, enhanced_spect, clean_spect, weight=None):
"""Calculate modulation-domain loss
Args:
enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels).
clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels).
Returns:
Tensor: Modulation-domain loss value.
"""

clean_mod = self.modulation_kernels(clean_spect)
enhanced_mod = self.modulation_kernels(enhanced_spect)

if self.norm:
mean_clean_mod = torch.mean(clean_mod, dim=2)
mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

clean_mod = clean_mod - mean_clean_mod.unsqueeze(2)
enhanced_mod = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

if weight is None:
alpha = 1
else: # TF-mask weight
alpha = 1 + torch.sum(weight, dim=-1, keepdim=True).unsqueeze(1)
mod_mse_loss = self.mse(enhanced_mod, clean_mod) * alpha
mod_mse_loss = torch.mean(
torch.sum(mod_mse_loss, dim=(1, 2, 3))
/ torch.sum(clean_mod**2, dim=(1, 2, 3)))

return mod_mse_loss


class ModulationDomainNCCLossModule(torch.nn.Module):
"""Modulation-domain loss function developed in [1] for supervised speech enhancement

# Speech Intelligibility Prediction Using Spectro-Temporal Modulation Analysis - based on this

In our paper, we used the gabor-based STRF kernels as the modulation kernels and used the log-mel spectrogram
as the input spectrogram representation.
Specific parameter details are in the paper and in the example below

Parameters
----------
modulation_kernels: nn.Module
Differentiable module that transforms a spectrogram representation to the modulation domain

modulation_domain = modulation_kernels(input_tf_representation)
Input Spectrogram representation(B, T, F) --- (M) modulation_kernels---> Modulation Domain(B, M, T', F')

[1]

"""

def __init__(self, modulation_kernels):
super(ModulationDomainNCCLossModule, self).__init__()

self.modulation_kernels = modulation_kernels
self.mse = nn.MSELoss(reduction='none')  # elementwise loss

def forward(self, enhanced_spect, clean_spect):
"""Calculate modulation-domain loss
Args:
enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels).
clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels).
Returns:
Tensor: Modulation-domain loss value.
"""

clean_mod = self.modulation_kernels(clean_spect)
enhanced_mod = self.modulation_kernels(enhanced_spect)
mean_clean_mod = torch.mean(clean_mod, dim=2)
mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

normalized_clean = clean_mod - mean_clean_mod.unsqueeze(2)
normalized_enhanced = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

inner_product = torch.sum(
normalized_clean * normalized_enhanced, dim=2)
normalized_denom = (torch.sum(
normalized_clean * normalized_clean, dim=2))**.5 * (torch.sum(
normalized_enhanced * normalized_enhanced, dim=2))**.5

ncc = inner_product / normalized_denom
mod_mse_loss = torch.mean((ncc - 1.0)**2)

return mod_mse_loss


class GaborSTRFConv(nn.Module):
"""Gabor-STRF-based cross-correlation kernel."""

def __init__(self,
supn,
supk,
nkern,
rates=None,
scales=None,
norm_strf=True,
real_only=False):
"""Instantiate a Gabor-based STRF convolution layer.
Parameters
----------
supn: int
Time support in number of frames. Also the window length.
supk: int
Frequency support in number of channels. Also the window length.
nkern: int
Number of kernels, each with a learnable rate and scale.
rates: list of float, None
Initial values for temporal modulation.
scales: list of float, None
Initial values for spectral modulation.
norm_strf: Boolean
Normalize STRF kernels to be unit length
real_only: Boolean
If True, nkern REAL gabor-STRF kernels
If False, nkern//2 REAL and nkern//2 IMAGINARY gabor-STRF kernels
"""
super(GaborSTRFConv, self).__init__()
self.numN = supn
self.numK = supk
self.numKern = nkern
self.real_only = real_only
self.norm_strf = norm_strf

if not real_only:
nkern = nkern // 2

if supk % 2 == 0: # force odd number
supk += 1
self.supk = torch.arange(supk, dtype=torch.float32)
if supn % 2 == 0: # force odd number
supn += 1
self.supn = torch.arange(supn, dtype=self.supk.dtype)
self.padding = (supn // 2, supk // 2)
# Set up learnable parameters
# for param in (rates, scales):
# assert (not param) or len(param) == nkern
if not rates:
rates = torch.rand(nkern) * math.pi / 2.0
if not scales:
scales = (torch.rand(nkern) * 2.0 - 1.0) * math.pi / 2.0

self.rates_ = nn.Parameter(torch.Tensor(rates))
self.scales_ = nn.Parameter(torch.Tensor(scales))

def strfs(self):
"""Make STRFs using the current parameters."""

if self.supn.device != self.rates_.device: # for first run
self.supn = self.supn.to(self.rates_.device)
self.supk = self.supk.to(self.rates_.device)
n0, k0 = self.padding

nwind = .5 - .5 * \
torch.cos(2 * math.pi * (self.supn + 1) / (len(self.supn) + 1))
kwind = .5 - .5 * \
torch.cos(2 * math.pi * (self.supk + 1) / (len(self.supk) + 1))

new_wind = torch.matmul((nwind).unsqueeze(-1), (kwind).unsqueeze(0))

n_n_0 = self.supn - n0
k_k_0 = self.supk - k0
n_mult = torch.matmul(
n_n_0.unsqueeze(1),
torch.ones((1, len(self.supk))).type(torch.FloatTensor).to(
self.rates_.device))
k_mult = torch.matmul(
torch.ones((len(self.supn),
1)).type(torch.FloatTensor).to(self.rates_.device),
k_k_0.unsqueeze(0))

inside = self.rates_.unsqueeze(1).unsqueeze(
1) * n_mult + self.scales_.unsqueeze(1).unsqueeze(1) * k_mult
real_strf = torch.cos(inside) * new_wind.unsqueeze(0)

if self.real_only:
final_strf = real_strf

else:
imag_strf = torch.sin(inside) * new_wind.unsqueeze(0)
final_strf = torch.cat([real_strf, imag_strf], dim=0)

if self.norm_strf:
final_strf = final_strf / (torch.sum(
final_strf**2, dim=(1, 2)).unsqueeze(1).unsqueeze(2))**.5

return final_strf

def forward(self, sigspec):
"""Forward pass a batch of (real) spectra [Batch x Time x Frequency]."""
if len(sigspec.shape) == 2: # expand batch dimension if single eg
sigspec = sigspec.unsqueeze(0)
strfs = self.strfs().unsqueeze(1).type_as(sigspec)
out = F.conv2d(sigspec.unsqueeze(1), strfs, padding=self.padding)
return out

def __repr__(self):
"""Gabor filter"""
report = """
+++++ Gabor Filter Kernels [{}], supn[{}], supk[{}] real only [{}] norm strf [{}] +++++

""".format(self.numKern, self.numN, self.numK, self.real_only,
self.norm_strf)

return report
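
A sketch of the modulation-domain loss on log-mel-like inputs, following the docstring's (B, #frames, #freq_channels) convention; the kernel sizes mirror `modulation_loss_init` in loss.py, and the import path is assumed from this commit:

```python
# Sketch: Gabor-STRF modulation-domain loss between clean and enhanced spectrograms.
import torch
from modelscope.models.audio.network.modulation_loss import (
    GaborSTRFConv, ModulationDomainLossModule)

kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
loss_module = ModulationDomainLossModule(kernels.eval())
clean = torch.randn(2, 100, 80)                  # (B, T, #mel_channels)
enhanced = clean + 0.1 * torch.randn_like(clean)
print(loss_module(enhanced, clean))              # scalar tensor
```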

modelscope/models/audio/network/se_net.py (+483, -0)

@@ -0,0 +1,483 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..layers.activations import RectifiedLinear, Sigmoid
from ..layers.affine_transform import AffineTransform
from ..layers.deep_fsmn import DeepFsmn
from ..layers.uni_deep_fsmn import Conv2d, UniDeepFsmn


class MaskNet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
hidden_dim=128,
hidden_dim2=None,
lorder=20,
rorder=0,
dilation=1,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(MaskNet, self).__init__()

self.linear1 = AffineTransform(indim, hidden_dim)
self.relu = RectifiedLinear(hidden_dim, hidden_dim)
if hidden_dim2 is None:
hidden_dim2 = hidden_dim

if rorder == 0:
repeats = [
UniDeepFsmn(
hidden_dim,
hidden_dim,
lorder,
hidden_dim2,
dilation=dilation,
layer_norm=layer_norm,
dropout=dropout) for i in range(layers)
]
else:
repeats = [
DeepFsmn(
hidden_dim,
hidden_dim,
lorder,
rorder,
hidden_dim2,
layer_norm=layer_norm,
dropout=dropout) for i in range(layers)
]
self.deepfsmn = nn.Sequential(*repeats)

self.linear2 = AffineTransform(hidden_dim, outdim)

self.crm = crm
if self.crm:
self.sig = nn.Tanh()
else:
self.sig = Sigmoid(outdim, outdim)

self.vad = vad
if self.vad:
self.linear3 = AffineTransform(hidden_dim, 1)

self.layers = layers
self.linearout = linearout
if self.linearout and self.vad:
print('Warning: linearout together with vad is not supported')

def forward(self, feat, ctl=None):
x1 = self.linear1(feat)
x2 = self.relu(x1)
if ctl is not None:
ctl = min(ctl, self.layers - 1)
for i in range(ctl):
x2 = self.deepfsmn[i](x2)
mask = self.sig(self.linear2(x2))
if self.vad:
vad = torch.sigmoid(self.linear3(x2))
return mask, vad
else:
return mask
x3 = self.deepfsmn(x2)
if self.linearout:
return self.linear2(x3)
mask = self.sig(self.linear2(x3))
if self.vad:
vad = torch.sigmoid(self.linear3(x3))
return mask, vad
else:
return mask

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Nnet>\n'
re_str += self.linear1.to_kaldi_nnet()
re_str += self.relu.to_kaldi_nnet()
for dfsmn in self.deepfsmn:
re_str += dfsmn.to_kaldi_nnet()
re_str += self.linear2.to_kaldi_nnet()
re_str += self.sig.to_kaldi_nnet()
re_str += '</Nnet>\n'

return re_str

def to_raw_nnet(self, fid):
self.linear1.to_raw_nnet(fid)
for dfsmn in self.deepfsmn:
dfsmn.to_raw_nnet(fid)
self.linear2.to_raw_nnet(fid)


class StageNet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
layers2=6,
hidden_dim=128,
lorder=20,
rorder=0,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(StageNet, self).__init__()

self.stage1 = nn.ModuleList()
self.stage2 = nn.ModuleList()
layer = nn.Sequential(nn.Linear(indim, hidden_dim), nn.ReLU())
self.stage1.append(layer)
for i in range(layers):
layer = UniDeepFsmn(
hidden_dim,
hidden_dim,
lorder,
hidden_dim,
layer_norm=layer_norm,
dropout=dropout)
self.stage1.append(layer)
layer = nn.Sequential(nn.Linear(hidden_dim, 321), nn.Sigmoid())
self.stage1.append(layer)
# stage2
layer = nn.Sequential(nn.Linear(321 + indim, hidden_dim), nn.ReLU())
self.stage2.append(layer)
for i in range(layers2):
layer = UniDeepFsmn(
hidden_dim,
hidden_dim,
lorder,
hidden_dim,
layer_norm=layer_norm,
dropout=dropout)
self.stage2.append(layer)
layer = nn.Sequential(
nn.Linear(hidden_dim, outdim),
nn.Sigmoid() if not crm else nn.Tanh())
self.stage2.append(layer)
self.crm = crm
self.vad = vad
self.linearout = linearout
# NOTE: the window is created on GPU, so this module assumes CUDA is available
self.window = torch.hamming_window(640, periodic=False).cuda()
self.frozen = False

def freeze(self):
if not self.frozen:
for param in self.stage1.parameters():
param.requires_grad = False
self.frozen = True
print('froze stage1')

def forward(self, feat, mixture, ctl=None):
if ctl == 'off':
x = feat
for i in range(len(self.stage1)):
x = self.stage1[i](x)
return x
else:
self.freeze()
x = feat
for i in range(len(self.stage1)):
x = self.stage1[i](x)

spec = torch.stft(
mixture / 32768,
640,
320,
640,
self.window,
center=False,
return_complex=True)
spec = torch.view_as_real(spec).permute([0, 2, 1, 3])
specmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2)
est = x * specmag
y = torch.cat([est, feat], dim=-1)
for i in range(len(self.stage2)):
y = self.stage2[i](y)
return y


class Unet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
dims=[256] * 4,
lorder=20,
rorder=0,
dilation=1,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(Unet, self).__init__()

self.linear1 = AffineTransform(indim, dims[0])
self.relu = RectifiedLinear(dims[0], dims[0])

self.encoder = nn.ModuleList()
self.decoder = nn.ModuleList()
for i in range(len(dims) - 1):
layer = nn.Sequential(
nn.Linear(dims[i], dims[i + 1]), nn.ReLU(),
nn.Linear(dims[i + 1], dims[i + 1], bias=False),
Conv2d(
dims[i + 1],
dims[i + 1],
lorder,
groups=dims[i + 1],
skip_connect=True))
self.encoder.append(layer)
for i in range(len(dims) - 1, 0, -1):
layer = nn.Sequential(
nn.Linear(dims[i] * 2, dims[i - 1]), nn.ReLU(),
nn.Linear(dims[i - 1], dims[i - 1], bias=False),
Conv2d(
dims[i - 1],
dims[i - 1],
lorder,
groups=dims[i - 1],
skip_connect=True))
self.decoder.append(layer)
self.tf = nn.ModuleList()
for i in range(layers - 2 * (len(dims) - 1)):
layer = nn.Sequential(
nn.Linear(dims[-1], dims[-1]), nn.ReLU(),
nn.Linear(dims[-1], dims[-1], bias=False),
Conv2d(
dims[-1],
dims[-1],
lorder,
groups=dims[-1],
skip_connect=True))
self.tf.append(layer)

self.linear2 = AffineTransform(dims[0], outdim)
self.crm = crm
self.act = nn.Tanh() if self.crm else nn.Sigmoid()
self.vad = False  # the vad argument is ignored; Unet has no vad head
self.layers = layers
self.linearout = linearout

def forward(self, x, ctl=None):
x = self.linear1(x)
x = self.relu(x)

encoder_out = []
for i in range(len(self.encoder)):
x = self.encoder[i](x)
encoder_out.append(x)
for i in range(len(self.tf)):
x = self.tf[i](x)
for i in range(len(self.decoder)):
x = torch.cat([x, encoder_out[-1 - i]], dim=-1)
x = self.decoder[i](x)

x = self.linear2(x)
if self.linearout:
return x
return self.act(x)


class BranchNet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
hidden_dim=256,
lorder=20,
rorder=0,
dilation=1,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(BranchNet, self).__init__()

self.linear1 = AffineTransform(indim, hidden_dim)
self.relu = RectifiedLinear(hidden_dim, hidden_dim)

self.convs = nn.ModuleList()
self.deepfsmn = nn.ModuleList()
self.FREQ = nn.ModuleList()
self.TIME = nn.ModuleList()
self.br1 = nn.ModuleList()
self.br2 = nn.ModuleList()
for i in range(layers):
'''
layer = nn.Sequential(
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim, bias=False),
Conv2d(hidden_dim, hidden_dim, lorder,
groups=hidden_dim, skip_connect=True)
)
self.deepfsmn.append(layer)
'''
layer = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU())
self.FREQ.append(layer)
'''
layer = nn.GRU(hidden_dim, hidden_dim,
batch_first=True,
bidirectional=False)
self.TIME.append(layer)

layer = nn.Sequential(
nn.Linear(hidden_dim, hidden_dim//2, bias=False),
Conv2d(hidden_dim//2, hidden_dim//2, lorder,
groups=hidden_dim//2, skip_connect=True)
)
self.br1.append(layer)
layer = nn.GRU(hidden_dim, hidden_dim//2,
batch_first=True,
bidirectional=False)
self.br2.append(layer)
'''

self.linear2 = AffineTransform(hidden_dim, outdim)
self.crm = crm
self.act = nn.Tanh() if self.crm else nn.Sigmoid()
self.vad = False  # the vad argument is ignored; BranchNet has no vad head
self.layers = layers
self.linearout = linearout

def forward(self, x, ctl=None):
return self.forward_branch(x)

def forward_sepconv(self, x):
x = torch.unsqueeze(x, 1)
for i in range(len(self.convs)):
x = self.convs[i](x)
x = F.relu(x)
B, C, H, W = x.shape
x = x.permute(0, 2, 1, 3)
x = torch.reshape(x, [B, H, C * W])
x = self.linear1(x)
x = self.relu(x)
for i in range(self.layers):
x = self.deepfsmn[i](x) + x
x = self.linear2(x)
return self.act(x)

def forward_branch(self, x):
x = self.linear1(x)
x = self.relu(x)
for i in range(self.layers):
z = self.FREQ[i](x)
x = z + x
x = self.linear2(x)
if self.linearout:
return x
return self.act(x)


class TACNet(nn.Module):
'''Transform-average-concatenate (TAC) for ad hoc dr
'''

def __init__(self,
indim,
outdim,
layers=9,
hidden_dim=128,
lorder=20,
rorder=0,
crm=False,
vad=False,
linearout=False):
super(TACNet, self).__init__()

self.linear1 = AffineTransform(indim, hidden_dim)
self.relu = RectifiedLinear(hidden_dim, hidden_dim)

if rorder == 0:
repeats = [
UniDeepFsmn(hidden_dim, hidden_dim, lorder, hidden_dim)
for i in range(layers)
]
else:
repeats = [
DeepFsmn(hidden_dim, hidden_dim, lorder, rorder, hidden_dim)
for i in range(layers)
]
self.deepfsmn = nn.Sequential(*repeats)

self.ch_transform = nn.ModuleList([])
self.ch_average = nn.ModuleList([])
self.ch_concat = nn.ModuleList([])
for i in range(layers):
self.ch_transform.append(
nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
self.ch_average.append(
nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
self.ch_concat.append(
nn.Sequential(
nn.Linear(hidden_dim * 2, hidden_dim), nn.PReLU()))

self.linear2 = AffineTransform(hidden_dim, outdim)

self.crm = crm
if self.crm:
self.sig = nn.Tanh()
else:
self.sig = Sigmoid(outdim, outdim)

self.vad = vad
if self.vad:
self.linear3 = AffineTransform(hidden_dim, 1)

self.layers = layers
self.linearout = linearout
if self.linearout and self.vad:
print('Warning: linearout together with vad is not a supported nnet configuration')

def forward(self, feat, ctl=None):
B, T, F = feat.shape
# assume 4ch
ch = 4
zlist = []
for c in range(ch):
z = self.linear1(feat[..., c * (F // 4):(c + 1) * (F // 4)])
z = self.relu(z)
zlist.append(z)
for i in range(self.layers):
# forward
for c in range(ch):
zlist[c] = self.deepfsmn[i](zlist[c])

# transform
olist = []
for c in range(ch):
z = self.ch_transform[i](zlist[c])
olist.append(z)
# average
avg = 0
for c in range(ch):
avg = avg + olist[c]
avg = avg / ch
avg = self.ch_average[i](avg)
# concatenate
for c in range(ch):
tac = torch.cat([olist[c], avg], dim=-1)
tac = self.ch_concat[i](tac)
zlist[c] = zlist[c] + tac

for c in range(ch):
zlist[c] = self.sig(self.linear2(zlist[c]))
mask = torch.cat(zlist, dim=-1)
return mask

def to_kaldi_nnet(self):
pass

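For orientation, here is a minimal sketch of driving MaskNet on random fbank-like features. It assumes, as the code above suggests, that AffineTransform and the FSMN layers operate on [batch, frames, dim] tensors; the 120/321 dimensions are illustrative, not values fixed by this file.

import torch

from modelscope.models.audio.network.se_net import MaskNet

# illustrative dimensions: 120-dim input feature, 321-dim mask output
net = MaskNet(indim=120, outdim=321, layers=9, hidden_dim=128, lorder=20)
feat = torch.randn(4, 100, 120)  # [batch, frames, feature]
mask = net(feat)  # sigmoid mask, expected shape [4, 100, 321]
print(mask.shape)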
+ 17
- 9
modelscope/models/base.py View File

@@ -2,14 +2,13 @@

import os.path as osp
from abc import ABC, abstractmethod
from typing import Dict, List, Tuple, Union
from typing import Dict, Union

from maas_hub.file_download import model_file_download
from maas_hub.snapshot_download import snapshot_download

from modelscope.models.builder import build_model
from modelscope.utils.config import Config
from modelscope.utils.constant import CONFIGFILE
from modelscope.utils.constant import ModelFile
from modelscope.utils.hub import get_model_cache_dir

Tensor = Union['torch.Tensor', 'tf.Tensor']
@@ -21,16 +20,24 @@ class Model(ABC):
self.model_dir = model_dir

def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
return self.post_process(self.forward(input))
return self.postprocess(self.forward(input))

@abstractmethod
def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
pass

def post_process(self, input: Dict[str, Tensor],
**kwargs) -> Dict[str, Tensor]:
# model specific postprocess, implementation is optional
# will be called in Pipeline and evaluation loop(in the future)
def postprocess(self, input: Dict[str, Tensor],
**kwargs) -> Dict[str, Tensor]:
""" Model specific postprocess and convert model output to
standard model outputs.

Args:
input: the input data

Return:
dict of results: a dict containing outputs of model, each
output should have the standard output name.
"""
return input

@classmethod
@@ -47,7 +54,8 @@ class Model(ABC):
# raise ValueError(
# 'Remote model repo {model_name_or_path} does not exists')

cfg = Config.from_file(osp.join(local_model_dir, CONFIGFILE))
cfg = Config.from_file(
osp.join(local_model_dir, ModelFile.CONFIGURATION))
task_name = cfg.task
model_cfg = cfg.model
# TODO @wenmeng.zwm: we may need to manually initialize the model after building it


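To illustrate the renamed hook, a minimal sketch of the Model contract: forward returns raw tensors and the optional postprocess maps them to standard output names. The class and keys below are hypothetical.

from typing import Dict

from modelscope.models.base import Model, Tensor


class DummyModel(Model):
    """Hypothetical model illustrating the forward/postprocess split."""

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        # raw model outputs
        return {'logits': input['input_ids']}

    def postprocess(self, input: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]:
        # map raw outputs to the standard output names
        return {'predictions': input['logits']}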
+ 4
- 2
modelscope/models/nlp/__init__.py View File

@@ -1,4 +1,6 @@
from .sequence_classification_model import * # noqa F403
from .bert_for_sequence_classification import * # noqa F403
from .palm_for_text_generation import * # noqa F403
from .sbert_for_sentence_similarity import * # noqa F403
from .sbert_for_token_classification import * # noqa F403
from .space.dialog_intent_prediction_model import * # noqa F403
from .space.dialog_modeling_model import * # noqa F403
from .text_generation_model import * # noqa F403

modelscope/models/nlp/sequence_classification_model.py → modelscope/models/nlp/bert_for_sequence_classification.py View File

@@ -1,5 +1,7 @@
import os
from typing import Any, Dict

import json
import numpy as np

from modelscope.utils.constant import Tasks
@@ -34,6 +36,11 @@ class BertForSequenceClassification(Model):
('token_type_ids', torch.LongTensor)],
output_keys=['predictions', 'probabilities', 'logits'])

self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
with open(self.label_path) as f:
self.label_mapping = json.load(f)
self.id2label = {idx: name for name, idx in self.label_mapping.items()}

def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
"""return the result by the model

@@ -50,3 +57,13 @@ class BertForSequenceClassification(Model):
}
"""
return self.model.predict(input)

def postprocess(self, inputs: Dict[str, np.ndarray],
**kwargs) -> Dict[str, np.ndarray]:
# N x num_classes
probs = inputs['probabilities']
result = {
'probs': probs,
}

return result

+ 43
- 0
modelscope/models/nlp/palm_for_text_generation.py View File

@@ -0,0 +1,43 @@
from typing import Dict

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['PalmForTextGeneration']


@MODELS.register_module(Tasks.text_generation, module_name=r'palm2.0')
class PalmForTextGeneration(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the text generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir

from sofa.models.palm_v2 import PalmForConditionalGeneration, Translator
model = PalmForConditionalGeneration.from_pretrained(model_dir)
self.tokenizer = model.tokenizer
self.generator = Translator(model)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""return the result by the model

Args:
input (Dict[str, Tensor]): the preprocessed data

Returns:
Dict[str, Tensor]: results
Example:
{
'predictions': Tensor([[1377, 4959, 2785, 6392...]]), # token ids, to be decoded by the tokenizer
}
"""

return self.generator(**input)

+ 88
- 0
modelscope/models/nlp/sbert_for_sentence_similarity.py View File

@@ -0,0 +1,88 @@
import os
from typing import Any, Dict

import json
import numpy as np
import torch
from sofa import SbertModel
from sofa.models.sbert.modeling_sbert import SbertPreTrainedModel
from torch import nn

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['SbertForSentenceSimilarity']


class SbertTextClassifier(SbertPreTrainedModel):

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.encoder = SbertModel(config, add_pooling_layer=True)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

def forward(self, input_ids=None, token_type_ids=None):
outputs = self.encoder(
input_ids,
token_type_ids=token_type_ids,
return_dict=None,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
return logits


@MODELS.register_module(
Tasks.sentence_similarity,
module_name=r'sbert-base-chinese-sentence-similarity')
class SbertForSentenceSimilarity(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the sentence similarity model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir

self.model = SbertTextClassifier.from_pretrained(
model_dir, num_labels=2)
self.model.eval()
self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
with open(self.label_path) as f:
self.label_mapping = json.load(f)
self.id2label = {idx: name for name, idx in self.label_mapping.items()}

def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, np.ndarray]: results
Example:
{
'predictions': array([1]), # label 0-negative 1-positive
'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # raw logits
}
"""
input_ids = torch.tensor(input['input_ids'], dtype=torch.long)
token_type_ids = torch.tensor(
input['token_type_ids'], dtype=torch.long)
with torch.no_grad():
logits = self.model(input_ids, token_type_ids)
probs = logits.softmax(-1).numpy()
pred = logits.argmax(-1).numpy()
logits = logits.numpy()
res = {'predictions': pred, 'probabilities': probs, 'logits': logits}
return res

+ 56
- 0
modelscope/models/nlp/sbert_for_token_classification.py View File

@@ -0,0 +1,56 @@
from typing import Any, Dict, Union

import numpy as np
import torch
from sofa import SbertConfig, SbertForTokenClassification

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['StructBertForTokenClassification']


@MODELS.register_module(
Tasks.word_segmentation,
module_name=r'structbert-chinese-word-segmentation')
class StructBertForTokenClassification(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the word segmentation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir
self.model = SbertForTokenClassification.from_pretrained(
self.model_dir)
self.config = SbertConfig.from_pretrained(self.model_dir)

def forward(self, input: Dict[str,
Any]) -> Dict[str, Union[str, np.ndarray]]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, Union[str,np.ndarray]]: results
Example:
{
'predictions': array([1, 4]), # predicted label id per token
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32), # raw logits
'text': '今天',
}
"""
input_ids = torch.tensor(input['input_ids']).unsqueeze(0)
output = self.model(input_ids)
logits = output.logits
pred = torch.argmax(logits[0], dim=-1)
pred = pred.numpy()

rst = {'predictions': pred, 'logits': logits, 'text': input['text']}
return rst

+ 0
- 52
modelscope/models/nlp/text_generation_model.py View File

@@ -1,52 +0,0 @@
from typing import Any, Dict

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['PalmForTextGenerationModel']


@MODELS.register_module(Tasks.text_generation, module_name=r'palm')
class PalmForTextGenerationModel(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the text generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
from sofa import PalmTokenizer

super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir

from sofa.models.palm import PalmForConditionalGeneration, TextGenerator
tokenizer = kwargs.pop('tokenizer',
PalmTokenizer.from_pretrained(model_dir))
model = PalmForConditionalGeneration.from_pretrained(model_dir)
self.generator = TextGenerator(model, tokenizer)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, np.ndarray]: results
Example:
{
'predictions': array([1]), # lable 0-negative 1-positive
'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value
}
"""

encoder_inputs = [
input['input_ids'], input['token_type_ids'],
input['attention_mask']
]
return self.generator(encoder_inputs)

+ 1
- 1
modelscope/pipelines/__init__.py View File

@@ -1,4 +1,4 @@
from .audio import * # noqa F403
from .audio import LinearAECPipeline
from .base import Pipeline
from .builder import pipeline
from .cv import * # noqa F403


+ 1
- 0
modelscope/pipelines/audio/__init__.py View File

@@ -0,0 +1 @@
from .linear_aec_pipeline import LinearAECPipeline

+ 160
- 0
modelscope/pipelines/audio/linear_aec_pipeline.py View File

@@ -0,0 +1,160 @@
import importlib
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
import yaml

from modelscope.preprocessors.audio import LinearAECAndFbank
from modelscope.utils.constant import ModelFile, Tasks
from ..base import Pipeline
from ..builder import PIPELINES

FEATURE_MVN = 'feature.DEY.mvn.txt'

CONFIG_YAML = 'dey_mini.yaml'


def initialize_config(module_cfg):
r"""According to config items, load specific module dynamically with params.
1. Load the module corresponding to the "module" param.
2. Call the function (or instantiate the class) corresponding to the "main" param.
3. Pass the params in "args" to the function (or class) when calling (or instantiating) it.

Args:
module_cfg (dict): config items, e.g.:
{
"module": "models.model",
"main": "Model",
"args": {...}
}

Returns:
the module loaded.
"""
module = importlib.import_module(module_cfg['module'])
return getattr(module, module_cfg['main'])(**module_cfg['args'])


@PIPELINES.register_module(
Tasks.speech_signal_process, module_name=r'speech_dfsmn_aec_psm_16k')
class LinearAECPipeline(Pipeline):
r"""AEC Inference Pipeline only support 16000 sample rate.

When invoke the class with pipeline.__call__(), you should provide two params:
Dict[str, Any]
the path of wav files,eg:{
"nearend_mic": "/your/data/near_end_mic_audio.wav",
"farend_speech": "/your/data/far_end_speech_audio.wav"}
output_path (str, optional): "/your/output/audio_after_aec.wav"
the file path to write generate audio.
"""

def __init__(self, model):
r"""
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model)
self.use_cuda = torch.cuda.is_available()
with open(
os.path.join(self.model, CONFIG_YAML), encoding='utf-8') as f:
self.config = yaml.full_load(f.read())
self.config['io']['mvn'] = os.path.join(self.model, FEATURE_MVN)
self._init_model()
self.preprocessor = LinearAECAndFbank(self.config['io'])

n_fft = self.config['loss']['args']['n_fft']
hop_length = self.config['loss']['args']['hop_length']
winlen = n_fft
window = torch.hamming_window(winlen, periodic=False)

def stft(x):
return torch.stft(
x,
n_fft,
hop_length,
winlen,
center=False,
window=window.to(x.device),
return_complex=False)

def istft(x, slen):
return torch.istft(
x,
n_fft,
hop_length,
winlen,
window=window.to(x.device),
center=False,
length=slen)

self.stft = stft
self.istft = istft

def _init_model(self):
checkpoint = torch.load(
os.path.join(self.model, ModelFile.TORCH_MODEL_BIN_FILE),
map_location='cpu')
self.model = initialize_config(self.config['nnet'])
if self.use_cuda:
self.model = self.model.cuda()
self.model.load_state_dict(checkpoint)

def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
r"""The AEC process.

Args:
inputs: dict={'feature': Tensor, 'base': Tensor}
'feature' feature of input audio.
'base' the base audio to mask.

Returns:
dict:
{
'output_pcm': generated audio array
}
"""
output_data = self._process(inputs['feature'], inputs['base'])
return {'output_pcm': output_data}

def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
r"""The post process. Will save audio to file, if the output_path is given.

Args:
inputs: dict:
{
'output_pcm': generated audio array
}
kwargs: accepts 'output_path', the file path to write the generated audio to

Returns:
dict:
{
'output_pcm': generated audio array
}
"""
if 'output_path' in kwargs.keys():
wav.write(kwargs['output_path'], self.preprocessor.SAMPLE_RATE,
inputs['output_pcm'].astype(np.int16))
inputs['output_pcm'] = inputs['output_pcm'] / 32768.0
return inputs

def _process(self, fbanks, mixture):
if self.use_cuda:
fbanks = fbanks.cuda()
mixture = mixture.cuda()
if self.model.vad:
with torch.no_grad():
masks, vad = self.model(fbanks.unsqueeze(0))
masks = masks.permute([2, 1, 0])
else:
with torch.no_grad():
masks = self.model(fbanks.unsqueeze(0))
masks = masks.permute([2, 1, 0])
spectrum = self.stft(mixture)
masked_spec = spectrum * masks
masked_sig = self.istft(masked_spec, len(mixture)).cpu().numpy()
return masked_sig

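A hedged usage sketch of this pipeline; the model id is an assumption based on the registered module name, and the wav paths are the placeholders from the docstring above.

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# hypothetical hub id matching the registered module name
aec = pipeline(Tasks.speech_signal_process, model='damo/speech_dfsmn_aec_psm_16k')
result = aec(
    {
        'nearend_mic': '/your/data/near_end_mic_audio.wav',
        'farend_speech': '/your/data/far_end_speech_audio.wav',
    },
    output_path='/your/output/audio_after_aec.wav')
print(result['output_pcm'].shape)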
+ 29
- 1
modelscope/pipelines/base.py View File

@@ -12,10 +12,11 @@ from modelscope.pydatasets import PyDataset
from modelscope.utils.config import Config
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.logger import get_logger
from .outputs import TASK_OUTPUTS
from .util import is_model_name

Tensor = Union['torch.Tensor', 'tf.Tensor']
Input = Union[str, PyDataset, Dict, 'PIL.Image.Image', 'numpy.ndarray']
Input = Union[str, tuple, dict, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
InputModel = Union[str, Model]

output_keys = [
@@ -106,8 +107,25 @@ class Pipeline(ABC):
out = self.preprocess(input, **post_kwargs)
out = self.forward(out)
out = self.postprocess(out, **post_kwargs)
self._check_output(out)
return out

def _check_output(self, input):
# this attribute is dynamically attached by registry
# when cls is registered in registry using task name
task_name = self.group_key
if task_name not in TASK_OUTPUTS:
logger.warning(f'task {task_name} output keys are missing')
return
output_keys = TASK_OUTPUTS[task_name]
missing_keys = []
for k in output_keys:
if k not in input:
missing_keys.append(k)
if len(missing_keys) > 0:
raise ValueError(f'expected output keys are {output_keys}, '
f'those {missing_keys} are missing')

def preprocess(self, inputs: Input) -> Dict[str, Any]:
""" Provide default implementation based on preprocess_cfg and user can reimplement it
"""
@@ -125,4 +143,14 @@ class Pipeline(ABC):

@abstractmethod
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
""" If current pipeline support model reuse, common postprocess
code should be write here.

Args:
inputs: input data

Return:
dict of results: a dict containing outputs of model, each
output should have the standard output name.
"""
raise NotImplementedError('postprocess')

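The contract enforced by _check_output, restated as a small runnable sketch against the TASK_OUTPUTS table added in this commit:

from modelscope.pipelines.outputs import TASK_OUTPUTS
from modelscope.utils.constant import Tasks

required = TASK_OUTPUTS[Tasks.word_segmentation]  # ['output']
out = {'output': '今天 天气 不错'}
missing = [k for k in required if k not in out]
assert not missing, f'expected output keys are {required}, those {missing} are missing'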
+ 10
- 7
modelscope/pipelines/builder.py View File

@@ -3,24 +3,27 @@
import os.path as osp
from typing import List, Union

import json
from maas_hub.file_download import model_file_download

from modelscope.models.base import Model
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.constant import CONFIGFILE, Tasks
from modelscope.utils.constant import Tasks
from modelscope.utils.registry import Registry, build_from_cfg
from .base import Pipeline
from .util import is_model_name

PIPELINES = Registry('pipelines')

DEFAULT_MODEL_FOR_PIPELINE = {
# TaskName: (pipeline_module_name, model_repo)
Tasks.image_matting: ('image-matting', 'damo/image-matting-person'),
Tasks.word_segmentation:
('structbert-chinese-word-segmentation',
'damo/nlp_structbert_word-segmentation_chinese-base'),
Tasks.sentence_similarity:
('sbert-base-chinese-sentence-similarity',
'damo/nlp_structbert_sentence-similarity_chinese-base'),
Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting'),
Tasks.text_classification:
('bert-sentiment-analysis', 'damo/bert-base-sst2'),
Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'),
Tasks.text_generation: ('palm2.0',
'damo/nlp_palm2.0_text-generation_chinese-base'),
Tasks.image_captioning: ('ofa', None),
Tasks.image_generation:
('person-image-cartoon',


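With this table, a task name alone is enough to build a pipeline; a minimal sketch:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# resolves via DEFAULT_MODEL_FOR_PIPELINE to
# ('palm2.0', 'damo/nlp_palm2.0_text-generation_chinese-base')
pipe = pipeline(Tasks.text_generation)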
+ 3
- 3
modelscope/pipelines/cv/image_matting_pipeline.py View File

@@ -1,5 +1,5 @@
import os.path as osp
from typing import Any, Dict, List, Tuple, Union
from typing import Any, Dict

import cv2
import numpy as np
@@ -7,7 +7,7 @@ import PIL

from modelscope.pipelines.base import Input
from modelscope.preprocessors import load_image
from modelscope.utils.constant import TF_GRAPH_FILE, Tasks
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from ..base import Pipeline
from ..builder import PIPELINES
@@ -24,7 +24,7 @@ class ImageMattingPipeline(Pipeline):
import tensorflow as tf
if tf.__version__ >= '2.0':
tf = tf.compat.v1
model_path = osp.join(self.model, TF_GRAPH_FILE)
model_path = osp.join(self.model, ModelFile.TF_GRAPH_FILE)

config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True


+ 1
- 1
modelscope/pipelines/multi_modal/__init__.py View File

@@ -1 +1 @@
from .image_captioning import ImageCaptionPipeline
from .image_caption_pipeline import ImageCaptionPipeline

modelscope/pipelines/multi_modal/image_captioning.py → modelscope/pipelines/multi_modal/image_caption_pipeline.py View File

@@ -84,8 +84,11 @@ class ImageCaptionPipeline(Pipeline):
s = torch.cat([s, self.eos_item])
return s

patch_image = self.patch_resize_transform(
load_image(input)).unsqueeze(0)
if isinstance(input, Image.Image):
patch_image = self.patch_resize_transform(input).unsqueeze(0)
else:
patch_image = self.patch_resize_transform(
load_image(input)).unsqueeze(0)
patch_mask = torch.tensor([True])
text = 'what does the image describe?'
src_text = encode_text(

+ 2
- 0
modelscope/pipelines/nlp/__init__.py View File

@@ -1,4 +1,6 @@
from .sentence_similarity_pipeline import * # noqa F403
from .sequence_classification_pipeline import * # noqa F403
from .space.dialog_intent_prediction_pipeline import * # noqa F403
from .space.dialog_modeling_pipeline import * # noqa F403
from .text_generation_pipeline import * # noqa F403
from .word_segmentation_pipeline import * # noqa F403

+ 62
- 0
modelscope/pipelines/nlp/sentence_similarity_pipeline.py View File

@@ -0,0 +1,62 @@
from typing import Any, Dict, Union

import numpy as np

from modelscope.models.nlp import SbertForSentenceSimilarity
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks
from ...models import Model
from ..base import Input, Pipeline
from ..builder import PIPELINES

__all__ = ['SentenceSimilarityPipeline']


@PIPELINES.register_module(
Tasks.sentence_similarity,
module_name=r'sbert-base-chinese-sentence-similarity')
class SentenceSimilarityPipeline(Pipeline):

def __init__(self,
model: Union[SbertForSentenceSimilarity, str],
preprocessor: SequenceClassificationPreprocessor = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction

Args:
model (SbertForSentenceSimilarity): a model instance
preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
"""
assert isinstance(model, str) or isinstance(model, SbertForSentenceSimilarity), \
'model must be a single str or SbertForSentenceSimilarity'
sc_model = model if isinstance(
model,
SbertForSentenceSimilarity) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = SequenceClassificationPreprocessor(
sc_model.model_dir,
first_sequence='first_sequence',
second_sequence='second_sequence')
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

assert hasattr(self.model, 'id2label'), \
'id2label map should be initialized in the init function.'

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): the model output, containing 'probabilities'

Returns:
Dict[str, str]: the prediction results
"""

probs = inputs['probabilities'][0]
num_classes = probs.shape[0]
top_indices = np.argpartition(probs, -num_classes)[-num_classes:]
cls_ids = top_indices[np.argsort(-probs[top_indices], axis=-1)]
probs = probs[cls_ids].tolist()
cls_names = [self.model.id2label[cid] for cid in cls_ids]
b = 0
return {'scores': probs[b], 'labels': cls_names[b]}

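A usage sketch for this pipeline; the model id comes from the defaults table above, and the sentence-pair input relies on the tuple support added to the preprocessor in this commit.

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

similarity = pipeline(
    Tasks.sentence_similarity,
    model='damo/nlp_structbert_sentence-similarity_chinese-base')
result = similarity(('今天天气不错', '今天天气很好'))  # (sentence1, sentence2)
print(result)  # e.g. {'scores': 0.9, 'labels': '1'}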
+ 16
- 40
modelscope/pipelines/nlp/sequence_classification_pipeline.py View File

@@ -1,8 +1,5 @@
import os
import uuid
from typing import Any, Dict, Union

import json
import numpy as np

from modelscope.models.nlp import BertForSequenceClassification
@@ -41,50 +38,29 @@ class SequenceClassificationPipeline(Pipeline):
second_sequence=None)
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

from easynlp.utils import io
self.label_path = os.path.join(sc_model.model_dir,
'label_mapping.json')
with io.open(self.label_path) as f:
self.label_mapping = json.load(f)
self.label_id_to_name = {
idx: name
for name, idx in self.label_mapping.items()
}
assert hasattr(self.model, 'id2label'), \
'id2label map should be initialized in the init function.'

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
def postprocess(self,
inputs: Dict[str, Any],
topk: int = 5) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): _description_
inputs (Dict[str, Any]): input data dict
topk (int): return topk classification result.

Returns:
Dict[str, str]: the prediction results
"""
# NxC np.ndarray
probs = inputs['probs'][0]
num_classes = probs.shape[0]
topk = min(topk, num_classes)
top_indices = np.argpartition(probs, -topk)[-topk:]
# order the top-k classes by descending probability
cls_ids = top_indices[np.argsort(-probs[top_indices])]
probs = probs[cls_ids].tolist()

probs = inputs['probabilities']
logits = inputs['logits']
predictions = np.argsort(-probs, axis=-1)
preds = predictions[0]
b = 0
new_result = list()
for pred in preds:
new_result.append({
'pred': self.label_id_to_name[pred],
'prob': float(probs[b][pred]),
'logit': float(logits[b][pred])
})
new_results = list()
new_results.append({
'id':
inputs['id'][b] if 'id' in inputs else str(uuid.uuid4()),
'output':
new_result,
'predictions':
new_result[0]['pred'],
'probabilities':
','.join([str(t) for t in inputs['probabilities'][b]]),
'logits':
','.join([str(t) for t in inputs['logits'][b]])
})
cls_names = [self.model.id2label[cid] for cid in cls_ids]

return new_results[0]
return {'scores': probs, 'labels': cls_names}

+ 23
- 20
modelscope/pipelines/nlp/text_generation_pipeline.py View File

@@ -1,7 +1,7 @@
from typing import Dict, Optional, Union

from modelscope.models import Model
from modelscope.models.nlp import PalmForTextGenerationModel
from modelscope.models.nlp import PalmForTextGeneration
from modelscope.preprocessors import TextGenerationPreprocessor
from modelscope.utils.constant import Tasks
from ..base import Pipeline, Tensor
@@ -10,11 +10,11 @@ from ..builder import PIPELINES
__all__ = ['TextGenerationPipeline']


@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm')
@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm2.0')
class TextGenerationPipeline(Pipeline):

def __init__(self,
model: Union[PalmForTextGenerationModel, str],
model: Union[PalmForTextGeneration, str],
preprocessor: Optional[TextGenerationPreprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
@@ -23,16 +23,16 @@ class TextGenerationPipeline(Pipeline):
model (PalmForTextGeneration): a model instance
preprocessor (TextGenerationPreprocessor): a preprocessor instance
"""
sc_model = model if isinstance(
model,
PalmForTextGenerationModel) else Model.from_pretrained(model)
model = model if isinstance(
model, PalmForTextGeneration) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = TextGenerationPreprocessor(
sc_model.model_dir,
model.model_dir,
model.tokenizer,
first_sequence='sentence',
second_sequence=None)
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
self.tokenizer = preprocessor.tokenizer
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = model.tokenizer

def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
"""process the prediction results
@@ -43,17 +43,20 @@ class TextGenerationPipeline(Pipeline):
Returns:
Dict[str, str]: the prediction results
"""
replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
replace_tokens_roberta = ((r' +', ' '), ('<mask>', '<q>'), ('<pad>',
''),
('<s>', ''), ('</s>', ''), ('<unk>', ' '))

vocab_size = len(self.tokenizer.vocab)
pred_list = inputs['predictions']
pred_ids = pred_list[0][0].cpu().numpy().tolist()
for j in range(len(pred_ids)):
if pred_ids[j] >= vocab_size:
pred_ids[j] = 100
pred = self.tokenizer.convert_ids_to_tokens(pred_ids)
pred_string = ''.join(pred).replace(
'##',
'').split('[SEP]')[0].replace('[CLS]',
'').replace('[SEP]',
'').replace('[UNK]', '')
return {'pred_string': pred_string}
pred_string = self.tokenizer.decode(pred_ids)
# NOTE: str.replace treats the r' +' entries literally; collapsing
# repeated spaces would require re.sub instead
for _old, _new in replace_tokens_bert:
pred_string = pred_string.replace(_old, _new)
pred_string = pred_string.strip()
for _old, _new in replace_tokens_roberta:
pred_string = pred_string.replace(_old, _new)
pred_string = pred_string.strip()
return {'text': pred_string}

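And a matching usage sketch for the renamed palm2.0 pipeline (the prompt is arbitrary):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

generator = pipeline(
    Tasks.text_generation,
    model='damo/nlp_palm2.0_text-generation_chinese-base')
result = generator('今天天气不错')
print(result['text'])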
+ 69
- 0
modelscope/pipelines/nlp/word_segmentation_pipeline.py View File

@@ -0,0 +1,69 @@
from typing import Any, Dict, Optional, Union

from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

__all__ = ['WordSegmentationPipeline']


@PIPELINES.register_module(
Tasks.word_segmentation,
module_name=r'structbert-chinese-word-segmentation')
class WordSegmentationPipeline(Pipeline):

def __init__(self,
model: Union[StructBertForTokenClassification, str],
preprocessor: Optional[TokenClassifcationPreprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction

Args:
model (StructBertForTokenClassification): a model instance
preprocessor (TokenClassifcationPreprocessor): a preprocessor instance
"""
model = model if isinstance(
model,
StructBertForTokenClassification) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = TokenClassifcationPreprocessor(model.model_dir)
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = preprocessor.tokenizer
self.config = model.config
self.id2label = self.config.id2label

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): the model output, containing 'predictions' and 'text'

Returns:
Dict[str, str]: the prediction results
"""

pred_list = inputs['predictions']
labels = []
for pre in pred_list:
labels.append(self.id2label[pre])
labels = labels[1:-1]  # drop the [CLS]/[SEP] positions
chunks = []
chunk = ''
assert len(inputs['text']) == len(labels)
for token, label in zip(inputs['text'], labels):
if label[0] == 'B' or label[0] == 'I':
chunk += token
else:
chunk += token
chunks.append(chunk)
chunk = ''
if chunk:
chunks.append(chunk)
seg_result = ' '.join(chunks)
rst = {
'output': seg_result,
}
return rst

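A usage sketch; the model id is taken from the defaults table, and the segmentation shown is illustrative:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

seg = pipeline(
    Tasks.word_segmentation,
    model='damo/nlp_structbert_word-segmentation_chinese-base')
print(seg('今天天气不错适合出去游玩'))  # e.g. {'output': '今天 天气 不错 适合 出去 游玩'}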
+ 117
- 0
modelscope/pipelines/outputs.py View File

@@ -0,0 +1,117 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from modelscope.utils.constant import Tasks

TASK_OUTPUTS = {

# ============ vision tasks ===================

# image classification result for single sample
# {
# "labels": ["dog", "horse", "cow", "cat"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.image_classification: ['scores', 'labels'],
Tasks.image_tagging: ['scores', 'labels'],

# object detection result for single sample
# {
# "boxes": [
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# ],
# "labels": ["dog", "horse", "cow", "cat"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.object_detection: ['scores', 'labels', 'boxes'],

# instance segmentation result for single sample
# {
# "masks": [
# np.array in bgr channel order
# ],
# "labels": ["dog", "horse", "cow", "cat"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.image_segmentation: ['scores', 'labels', 'masks'],

# image generation/editing/matting result for single sample
# {
# "output_png": np.array with shape(h, w, 4)
# for matting or (h, w, 3) for general purpose
# }
Tasks.image_editing: ['output_png'],
Tasks.image_matting: ['output_png'],
Tasks.image_generation: ['output_png'],

# pose estimation result for single sample
# {
# "poses": np.array with shape [num_pose, num_keypoint, 3],
# each keypoint is a array [x, y, score]
# "boxes": np.array with shape [num_pose, 4], each box is
# [x1, y1, x2, y2]
# }
Tasks.pose_estimation: ['poses', 'boxes'],

# ============ nlp tasks ===================

# text classification result for single sample
# {
# "labels": ["happy", "sad", "calm", "angry"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.text_classification: ['scores', 'labels'],

# text generation result for single sample
# {
# "text": "this is text generated by a model."
# }
Tasks.text_generation: ['text'],

# word segmentation result for single sample
# {
# "output": "今天 天气 不错 , 适合 出去 游玩"
# }
Tasks.word_segmentation: ['output'],

# sentence similarity result for single sample
# {
# "labels": "1",
# "scores": 0.9
# }
Tasks.sentence_similarity: ['scores', 'labels'],

# ============ audio tasks ===================

# audio processed for single file in PCM format
# {
# "output_pcm": np.array with shape(samples,) and dtype float32
# }
Tasks.speech_signal_process: ['output_pcm'],

# ============ multi-modal tasks ===================

# image caption result for single sample
# {
# "caption": "this is an image caption text."
# }
Tasks.image_captioning: ['caption'],

# visual grounding result for single sample
# {
# "boxes": [
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# ],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.visual_grounding: ['boxes', 'scores'],

# text_to_image result for a single sample
# {
# "image": np.ndarray with shape [height, width, 3]
# }
Tasks.text_to_image_synthesis: ['image']
}

+ 21
- 17
modelscope/pipelines/util.py View File

@@ -1,12 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import os.path as osp
from typing import List, Union

import json
from maas_hub.file_download import model_file_download

from modelscope.utils.constant import CONFIGFILE
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger

logger = get_logger()


def is_config_has_model(cfg_file):
try:
cfg = Config.from_file(cfg_file)
return hasattr(cfg, 'model')
except Exception as e:
logger.error(f'parsing config file {cfg_file} failed: {e}')
return False


def is_model_name(model: Union[str, List]):
@@ -15,24 +26,17 @@ def is_model_name(model: Union[str, List]):

def is_model_name_impl(model):
if osp.exists(model):
if osp.exists(osp.join(model, CONFIGFILE)):
return True
cfg_file = osp.join(model, ModelFile.CONFIGURATION)
if osp.exists(cfg_file):
return is_config_has_model(cfg_file)
else:
return False
else:
# try:
# cfg_file = model_file_download(model, CONFIGFILE)
# except Exception:
# cfg_file = None
# TODO @wenmeng.zwm use exception instead of
# following tricky logic
cfg_file = model_file_download(model, CONFIGFILE)
with open(cfg_file, 'r') as infile:
cfg = json.load(infile)
if 'Code' in cfg:
try:
cfg_file = model_file_download(model, ModelFile.CONFIGURATION)
return is_config_has_model(cfg_file)
except Exception:
return False
else:
return True

if isinstance(model, str):
return is_model_name_impl(model)

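Sketch of the new behavior: a path (or hub id) now counts as a model only if its configuration.json parses and declares a model section; both inputs below are hypothetical.

from modelscope.pipelines.util import is_model_name

# local directory whose configuration.json contains a `model` section
print(is_model_name('/path/to/local_model'))  # True
# a bare pipeline name has no configuration file on the hub
print(is_model_name('image-matting'))  # False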

+ 1
- 1
modelscope/preprocessors/__init__.py View File

@@ -1,10 +1,10 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .audio import LinearAECAndFbank
from .base import Preprocessor
from .builder import PREPROCESSORS, build_preprocessor
from .common import Compose
from .image import LoadImage, load_image
from .nlp import * # noqa F403
from .nlp import TextGenerationPreprocessor
from .space.dialog_intent_prediction_preprocessor import * # noqa F403
from .space.dialog_modeling_preprocessor import * # noqa F403

+ 230
- 0
modelscope/preprocessors/audio.py View File

@@ -0,0 +1,230 @@
import ctypes
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
import torchaudio.compliance.kaldi as kaldi
from numpy.ctypeslib import ndpointer

from modelscope.utils.constant import Fields
from .builder import PREPROCESSORS


def load_wav(path):
samp_rate, data = wav.read(path)
return np.float32(data), samp_rate


def load_library(libaec):
libaec_in_cwd = os.path.join('.', libaec)
if os.path.exists(libaec_in_cwd):
libaec = libaec_in_cwd
mitaec = ctypes.cdll.LoadLibrary(libaec)
fe_process = mitaec.fe_process_inst
fe_process.argtypes = [
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), ctypes.c_int,
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS')
]
return fe_process


def do_linear_aec(fe_process, mic, ref, int16range=True):
mic = np.float32(mic)
ref = np.float32(ref)
if len(mic) > len(ref):
mic = mic[:len(ref)]
out_mic = np.zeros_like(mic)
out_linear = np.zeros_like(mic)
out_echo = np.zeros_like(mic)
out_ref = np.zeros_like(mic)
if int16range:
mic /= 32768
ref /= 32768
fe_process(mic, ref, len(mic), out_mic, out_linear, out_echo)
# out_ref not in use here
if int16range:
out_mic *= 32768
out_linear *= 32768
out_echo *= 32768
return out_mic, out_ref, out_linear, out_echo


def load_kaldi_feature_transform(filename):
with open(filename, 'r') as fp:
all_str = fp.read()
pos1 = all_str.find('AddShift')
pos2 = all_str.find('[', pos1)
pos3 = all_str.find(']', pos2)
mean = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
pos1 = all_str.find('Rescale')
pos2 = all_str.find('[', pos1)
pos3 = all_str.find(']', pos2)
scale = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
return mean, scale


class Feature:
r"""Extract feat from one utterance.
"""

def __init__(self,
fbank_config,
feat_type='spec',
mvn_file=None,
cuda=False):
r"""

Args:
fbank_config (dict):
feat_type (str):
raw: do nothing
fbank: use kaldi.fbank
spec: Real/Imag
logpow: log(1+|x|^2)
mvn_file (str): the path of the data file for mean variance normalization
cuda (bool): if True, move the window and mvn buffers to GPU
"""
self.fbank_config = fbank_config
self.feat_type = feat_type
self.n_fft = fbank_config['frame_length'] * fbank_config[
'sample_frequency'] // 1000
self.hop_length = fbank_config['frame_shift'] * fbank_config[
'sample_frequency'] // 1000
self.window = torch.hamming_window(self.n_fft, periodic=False)

self.mvn = False
if mvn_file is not None and os.path.exists(mvn_file):
print(f'loading mvn file: {mvn_file}')
shift, scale = load_kaldi_feature_transform(mvn_file)
self.shift = torch.from_numpy(shift)
self.scale = torch.from_numpy(scale)
self.mvn = True
if cuda:
self.window = self.window.cuda()
if self.mvn:
self.shift = self.shift.cuda()
self.scale = self.scale.cuda()

def compute(self, utt):
r"""

Args:
utt: in [-32768, 32767] range

Returns:
[..., T, F]
"""
if self.feat_type == 'raw':
return utt
elif self.feat_type == 'fbank':
if len(utt.shape) == 1:
utt = utt.unsqueeze(0)
feat = kaldi.fbank(utt, **self.fbank_config)
elif self.feat_type == 'spec':
spec = torch.stft(
utt / 32768,
self.n_fft,
self.hop_length,
self.n_fft,
self.window,
center=False,
return_complex=True)
feat = torch.cat([spec.real, spec.imag], dim=-2).permute(-1, -2)
elif self.feat_type == 'logpow':
spec = torch.stft(
utt,
self.n_fft,
self.hop_length,
self.n_fft,
self.window,
center=False,
return_complex=True)
abspow = torch.abs(spec)**2
feat = torch.log(1 + abspow).permute(-1, -2)
else:
raise ValueError(f'unknown feat_type: {self.feat_type}')
return feat

def normalize(self, feat):
if self.mvn:
feat = feat + self.shift
feat = feat * self.scale
return feat


@PREPROCESSORS.register_module(Fields.audio)
class LinearAECAndFbank:
SAMPLE_RATE = 16000

def __init__(self, io_config):
self.trunc_length = 7200 * self.SAMPLE_RATE
self.linear_aec_delay = io_config['linear_aec_delay']
self.feature = Feature(io_config['fbank_config'],
io_config['feat_type'], io_config['mvn'])
self.mitaec = load_library(io_config['mitaec_library'])
self.mask_on_mic = io_config['mask_on'] == 'nearend_mic'

def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
""" linear filtering the near end mic and far end audio, then extract the feature
:param data: dict with two keys and correspond audios: "nearend_mic" and "farend_speech"
:return: dict with two keys and Tensor values: "base" linear filtered audio,and "feature"
"""
# read files
nearend_mic, fs = load_wav(data['nearend_mic'])
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
farend_speech, fs = load_wav(data['farend_speech'])
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
if 'nearend_speech' in data:
nearend_speech, fs = load_wav(data['nearend_speech'])
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
else:
nearend_speech = np.zeros_like(nearend_mic)

out_mic, out_ref, out_linear, out_echo = do_linear_aec(
self.mitaec, nearend_mic, farend_speech)
# fix 20ms linear aec delay by delaying the target speech
extra_zeros = np.zeros([int(self.linear_aec_delay * fs)])
nearend_speech = np.concatenate([extra_zeros, nearend_speech])
# truncate files to the same length
flen = min(
len(out_mic), len(out_ref), len(out_linear), len(out_echo),
len(nearend_speech))
fstart = 0
flen = min(flen, self.trunc_length)
# note: from here on, nearend_mic holds the AEC-processed mic signal (out_mic)
nearend_mic, out_ref, out_linear, out_echo, nearend_speech = (
out_mic[fstart:flen], out_ref[fstart:flen],
out_linear[fstart:flen], out_echo[fstart:flen],
nearend_speech[fstart:flen])

# extract features (frames, [mic, linear, ref, aes?])
feat = torch.FloatTensor()

nearend_mic = torch.from_numpy(np.float32(nearend_mic))
fbank_nearend_mic = self.feature.compute(nearend_mic)
feat = torch.cat([feat, fbank_nearend_mic], dim=1)

out_linear = torch.from_numpy(np.float32(out_linear))
fbank_out_linear = self.feature.compute(out_linear)
feat = torch.cat([feat, fbank_out_linear], dim=1)

out_echo = torch.from_numpy(np.float32(out_echo))
fbank_out_echo = self.feature.compute(out_echo)
feat = torch.cat([feat, fbank_out_echo], dim=1)

# feature transform
feat = self.feature.normalize(feat)

# prepare target
if nearend_speech is not None:
nearend_speech = torch.from_numpy(np.float32(nearend_speech))

if self.mask_on_mic:
base = nearend_mic
else:
base = out_linear
out_data = {'base': base, 'target': nearend_speech, 'feature': feat}
return out_data

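A small sketch of the Feature helper in 'spec' mode; the fbank_config values are illustrative but reproduce the 640/320 STFT used elsewhere in this commit (frame_length and frame_shift are in ms):

import torch

from modelscope.preprocessors.audio import Feature

# 40 ms / 20 ms at 16 kHz give n_fft=640 and hop_length=320
feature = Feature(
    fbank_config={
        'frame_length': 40,
        'frame_shift': 20,
        'sample_frequency': 16000,
    },
    feat_type='spec')
utt = torch.randn(16000) * 32768  # one second of int16-range audio
spec = feature.compute(utt)  # [frames, 2 * (n_fft // 2 + 1)] = [49, 642]
print(spec.shape)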
+ 1
- 1
modelscope/preprocessors/image.py View File

@@ -9,7 +9,7 @@ from modelscope.utils.constant import Fields
from .builder import PREPROCESSORS


@PREPROCESSORS.register_module(Fields.image)
@PREPROCESSORS.register_module(Fields.cv)
class LoadImage:
"""Load an image from file or url.
Added or updated keys are "filename", "img", "img_shape",


+ 82
- 16
modelscope/preprocessors/nlp.py View File

@@ -11,8 +11,8 @@ from .base import Preprocessor
from .builder import PREPROCESSORS

__all__ = [
'Tokenize',
'SequenceClassificationPreprocessor',
'Tokenize', 'SequenceClassificationPreprocessor',
'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor'
]


@@ -31,7 +31,7 @@ class Tokenize(Preprocessor):


@PREPROCESSORS.register_module(
Fields.nlp, module_name=r'bert-sentiment-analysis')
Fields.nlp, module_name=r'bert-sequence-classification')
class SequenceClassificationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
@@ -51,21 +51,42 @@ class SequenceClassificationPreprocessor(Preprocessor):
self.sequence_length = kwargs.pop('sequence_length', 128)

self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
print(f'this is the tokenzier {self.tokenizer}')

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
@type_assert(object, (str, tuple))
def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]:
"""process the raw input data

Args:
data (str): a sentence
Example:
'you are so handsome.'
data (str or tuple):
sentence1 (str): a sentence
Example:
'you are so handsome.'
or
(sentence1, sentence2)
sentence1 (str): a sentence
Example:
'you are so handsome.'
sentence2 (str): a sentence
Example:
'you are so beautiful.'

Returns:
Dict[str, Any]: the preprocessed data
"""

new_data = {self.first_sequence: data}
if not isinstance(data, tuple):
data = (
data,
None,
)

sentence1, sentence2 = data
new_data = {
self.first_sequence: sentence1,
self.second_sequence: sentence2
}

# preprocess the data for the model input

rst = {
@@ -94,17 +115,15 @@ class SequenceClassificationPreprocessor(Preprocessor):
return rst


@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm')
@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm2.0')
class TextGenerationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
"""preprocess the data using the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
tokenizer: the tokenizer instance shared with the model
"""
from sofa import PalmTokenizer

super().__init__(*args, **kwargs)

self.model_dir: str = model_dir
@@ -113,7 +132,7 @@ class TextGenerationPreprocessor(Preprocessor):
self.second_sequence: str = kwargs.pop('second_sequence',
'second_sequence')
self.sequence_length: int = kwargs.pop('sequence_length', 128)
self.tokenizer = PalmTokenizer.from_pretrained(model_dir)
self.tokenizer = tokenizer

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
@@ -132,7 +151,7 @@ class TextGenerationPreprocessor(Preprocessor):
new_data = {self.first_sequence: data}
# preprocess the data for the model input

rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
rst = {'input_ids': [], 'attention_mask': []}

max_seq_length = self.sequence_length

@@ -147,6 +166,53 @@ class TextGenerationPreprocessor(Preprocessor):

rst['input_ids'].append(feature['input_ids'])
rst['attention_mask'].append(feature['attention_mask'])
rst['token_type_ids'].append(feature['token_type_ids'])

return {k: torch.tensor(v) for k, v in rst.items()}


@PREPROCESSORS.register_module(
Fields.nlp, module_name=r'bert-token-classification')
class TokenClassifcationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""

super().__init__(*args, **kwargs)

from sofa import SbertTokenizer
self.model_dir: str = model_dir
self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
"""process the raw input data

Args:
data (str): a sentence
Example:
'you are so handsome.'

Returns:
Dict[str, Any]: the preprocessed data
"""
# preprocess the data for the model input

text = data.replace(' ', '').strip()
tokens = []
for token in text:
token = self.tokenizer.tokenize(token)
tokens.extend(token)
input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids)
attention_mask = [1] * len(input_ids)
token_type_ids = [0] * len(input_ids)
return {
'text': text,
'input_ids': input_ids,
'attention_mask': attention_mask,
'token_type_ids': token_type_ids
}

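The updated preprocessor accepts either a single sentence or a sentence pair; a quick sketch (the model_dir is a placeholder for a local directory with tokenizer files):

from modelscope.preprocessors.nlp import SequenceClassificationPreprocessor

prep = SequenceClassificationPreprocessor(
    '/path/to/model_dir',
    first_sequence='first_sequence',
    second_sequence='second_sequence')
single = prep('you are so handsome.')
pair = prep(('you are so handsome.', 'you are so beautiful.'))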
+ 6
- 6
modelscope/utils/config.py View File

@@ -74,17 +74,17 @@ class Config:
{'c': [1, 2, 3], 'd': 'dd'}
>>> cfg.b.d
'dd'
>>> cfg = Config.from_file('configs/examples/config.json')
>>> cfg = Config.from_file('configs/examples/configuration.json')
>>> cfg.filename
'configs/examples/config.json'
'configs/examples/configuration.json'
>>> cfg.b
{'c': [1, 2, 3], 'd': 'dd'}
>>> cfg = Config.from_file('configs/examples/config.py')
>>> cfg = Config.from_file('configs/examples/configuration.py')
>>> cfg.filename
"configs/examples/config.py"
>>> cfg = Config.from_file('configs/examples/config.yaml')
"configs/examples/configuration.py"
>>> cfg = Config.from_file('configs/examples/configuration.yaml')
>>> cfg.filename
"configs/examples/config.yaml"
"configs/examples/configuration.yaml"
"""

@staticmethod


+ 15
- 13
modelscope/utils/constant.py View File

@@ -4,8 +4,8 @@
class Fields(object):
""" Names for different application fields
"""
image = 'image'
video = 'video'
# image = 'image'
# video = 'video'
cv = 'cv'
nlp = 'nlp'
audio = 'audio'
@@ -30,7 +30,9 @@ class Tasks(object):
image_matting = 'image-matting'

# nlp tasks
word_segmentation = 'word-segmentation'
sentiment_analysis = 'sentiment-analysis'
sentence_similarity = 'sentence-similarity'
text_classification = 'text-classification'
relation_extraction = 'relation-extraction'
zero_shot = 'zero-shot'
@@ -52,7 +54,7 @@ class Tasks(object):
text_to_speech = 'text-to-speech'
speech_signal_process = 'speech-signal-process'

# multi-media
# multi-modal tasks
image_captioning = 'image-captioning'
visual_grounding = 'visual-grounding'
text_to_image_synthesis = 'text-to-image-synthesis'
@@ -73,16 +75,16 @@ class Hubs(object):
huggingface = 'huggingface'


# configuration filename
# in order to avoid conflict with huggingface
# config file we use maas_config instead
CONFIGFILE = 'maas_config.json'
class ModelFile(object):
CONFIGURATION = 'configuration.json'
README = 'README.md'
TF_SAVED_MODEL_FILE = 'saved_model.pb'
TF_GRAPH_FILE = 'tf_graph.pb'
TF_CHECKPOINT_FOLDER = 'tf_ckpts'
TF_CKPT_PREFIX = 'ckpt-'
TORCH_MODEL_FILE = 'pytorch_model.pt'
TORCH_MODEL_BIN_FILE = 'pytorch_model.bin'


README_FILE = 'README.md'
TF_SAVED_MODEL_FILE = 'saved_model.pb'
TF_GRAPH_FILE = 'tf_graph.pb'
TF_CHECKPOINT_FOLDER = 'tf_ckpts'
TF_CHECKPOINT_FILE = 'checkpoint'
TORCH_MODEL_FILE = 'pytorch_model.bin'
TENSORFLOW = 'tensorflow'
PYTORCH = 'pytorch'

+ 1
- 1
modelscope/utils/registry.py View File

@@ -1,7 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import inspect
from email.policy import default

from modelscope.utils.logger import get_logger

@@ -70,6 +69,7 @@ class Registry(object):
f'{self._name}[{group_key}]')

self._modules[group_key][module_name] = module_cls
module_cls.group_key = group_key

if module_name in self._modules[default_group]:
if id(self._modules[default_group][module_name]) == id(module_cls):


+ 20
- 0
modelscope/utils/test_utils.py View File

@@ -0,0 +1,20 @@
#!/usr/bin/env python
# Copyright (c) Alibaba, Inc. and its affiliates.

import os

TEST_LEVEL = 2
TEST_LEVEL_STR = 'TEST_LEVEL'


def test_level():
global TEST_LEVEL
if TEST_LEVEL_STR in os.environ:
TEST_LEVEL = int(os.environ[TEST_LEVEL_STR])

return TEST_LEVEL


def set_test_level(level: int):
global TEST_LEVEL
TEST_LEVEL = level

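A typical (assumed) use of this helper is gating slow tests on the TEST_LEVEL environment variable:

import unittest

from modelscope.utils.test_utils import test_level


class SlowTest(unittest.TestCase):

    @unittest.skipUnless(test_level() >= 2, 'skipped at current test level')
    def test_expensive_model(self):
        pass  # hypothetical slow test body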
+ 1
- 0
requirements/docs.txt View File

@@ -1,6 +1,7 @@
docutils==0.16.0
recommonmark
sphinx==4.0.2
sphinx-book-theme
sphinx-copybutton
sphinx_markdown_tables
sphinx_rtd_theme==0.5.2

+ 1
- 1
requirements/nlp.txt View File

@@ -1 +1 @@
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.1.3-py3-none-any.whl
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.2-py3-none-any.whl

+3 -2  requirements/runtime.txt

@@ -1,12 +1,13 @@
 addict
 datasets
 easydict
-https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
+https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.4.dev0-py3-none-any.whl
 numpy
 opencv-python-headless
-Pillow
+Pillow>=6.2.0
+pyyaml
 requests
 scipy
 tokenizers<=0.10.3
 transformers<=4.16.2
 yapf

+2 -1  setup.cfg

@@ -11,6 +11,7 @@ default_section = THIRDPARTY
 BASED_ON_STYLE = pep8
 BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
 SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
+SPLIT_BEFORE_ARITHMETIC_OPERATOR = true

 [codespell]
 skip = *.ipynb
@@ -20,5 +21,5 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
 [flake8]
 select = B,C,E,F,P,T4,W,B9
 max-line-length = 120
-ignore = F401,F821
+ignore = F401,F821,W503
 exclude = docs/src,*.pyi,.git
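The two changes are linked: with SPLIT_BEFORE_ARITHMETIC_OPERATOR enabled, yapf breaks long expressions before the operator, which is exactly what flake8's W503 (line break before binary operator) flags, so W503 joins the ignore list to keep the two tools from fighting. A hedged illustration of the style yapf then produces (variable names invented for the example):

    # with SPLIT_BEFORE_ARITHMETIC_OPERATOR = true, yapf prefers
    total = (first_term
             + second_term
             + third_term)  # would trip W503 if it were not ignored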

+8 -11  tests/pipelines/test_base.py

@@ -35,9 +35,10 @@ class CustomPipelineTest(unittest.TestCase):
             CustomPipeline1()

     def test_custom(self):
+        dummy_task = 'dummy-task'

         @PIPELINES.register_module(
-            group_key=Tasks.image_tagging, module_name='custom-image')
+            group_key=dummy_task, module_name='custom-image')
         class CustomImagePipeline(Pipeline):

             def __init__(self,
@@ -67,32 +68,28 @@ class CustomPipelineTest(unittest.TestCase):
                 outputs['filename'] = inputs['url']
                 img = inputs['img']
                 new_image = img.resize((img.width // 2, img.height // 2))
-                outputs['resize_image'] = np.array(new_image)
-                outputs['dummy_result'] = 'dummy_result'
+                outputs['output_png'] = np.array(new_image)
                 return outputs

             def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
                 return inputs

         self.assertTrue('custom-image' in PIPELINES.modules[default_group])
-        add_default_pipeline_info(Tasks.image_tagging, 'custom-image')
+        add_default_pipeline_info(dummy_task, 'custom-image', overwrite=True)
         pipe = pipeline(pipeline_name='custom-image')
-        pipe2 = pipeline(Tasks.image_tagging)
+        pipe2 = pipeline(dummy_task)
         self.assertTrue(type(pipe) is type(pipe2))

-        img_url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.' \
-            'aliyuncs.com/data/test/images/image1.jpg'
+        img_url = 'data/test/images/image1.jpg'
         output = pipe(img_url)
         self.assertEqual(output['filename'], img_url)
-        self.assertEqual(output['resize_image'].shape, (318, 512, 3))
-        self.assertEqual(output['dummy_result'], 'dummy_result')
+        self.assertEqual(output['output_png'].shape, (318, 512, 3))

         outputs = pipe([img_url for i in range(4)])
         self.assertEqual(len(outputs), 4)
         for out in outputs:
             self.assertEqual(out['filename'], img_url)
-            self.assertEqual(out['resize_image'].shape, (318, 512, 3))
-            self.assertEqual(out['dummy_result'], 'dummy_result')
+            self.assertEqual(out['output_png'].shape, (318, 512, 3))


 if __name__ == '__main__':
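Outside the test harness, the same mechanics let user code register a pipeline under an ad-hoc task name and then look it up by module name or by task. A minimal sketch with hypothetical names throughout (the import path for PIPELINES and add_default_pipeline_info is assumed to be the builder module the test imports from):

    from modelscope.pipelines import pipeline
    from modelscope.pipelines.builder import PIPELINES, add_default_pipeline_info


    @PIPELINES.register_module(group_key='my-task', module_name='my-pipeline')
    class MyPipeline:  # a real implementation would subclass Pipeline as above
        pass


    add_default_pipeline_info('my-task', 'my-pipeline', overwrite=True)
    pipe = pipeline('my-task')  # resolved through the default task mapping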


+3 -4  tests/pipelines/test_image_captioning.py

@@ -7,11 +7,12 @@ import unittest
 from modelscope.fileio import File
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level


 class ImageCaptionTest(unittest.TestCase):

-    @unittest.skip('skip long test')
+    @unittest.skip('skip before model is restored in model hub')
     def test_run(self):
         model = 'https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/caption_large_best_clean.pt'

@@ -26,9 +27,7 @@ class ImageCaptionTest(unittest.TestCase):
         img_captioning = pipeline(
             Tasks.image_captioning, model=ofile.name, bpe_dir=bpe_dir)

-        result = img_captioning(
-            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-        )
+        result = img_captioning('data/test/images/image_matting.png')
         print(result['caption'])




+11 -15  tests/pipelines/test_image_matting.py

@@ -9,14 +9,15 @@ import cv2
 from modelscope.fileio import File
 from modelscope.pipelines import pipeline
 from modelscope.pydatasets import PyDataset
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.hub import get_model_cache_dir
+from modelscope.utils.test_utils import test_level


 class ImageMattingTest(unittest.TestCase):

     def setUp(self) -> None:
-        self.model_id = 'damo/cv_unet_image-matting_damo'
+        self.model_id = 'damo/cv_unet_image-matting'
         # switch to False if downloading everytime is not desired
         purge_cache = True
         if purge_cache:
@@ -28,20 +29,17 @@ class ImageMattingTest(unittest.TestCase):
         model_path = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs' \
                      '.com/data/test/maas/image_matting/matting_person.pb'
         with tempfile.TemporaryDirectory() as tmp_dir:
-            model_file = osp.join(tmp_dir, 'matting_person.pb')
+            model_file = osp.join(tmp_dir, ModelFile.TF_GRAPH_FILE)
             with open(model_file, 'wb') as ofile:
                 ofile.write(File.read(model_path))
             img_matting = pipeline(Tasks.image_matting, model=tmp_dir)

-            result = img_matting(
-                'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-            )
+            result = img_matting('data/test/images/image_matting.png')
             cv2.imwrite('result.png', result['output_png'])

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_dataset(self):
-        input_location = [
-            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-        ]
+        input_location = ['data/test/images/image_matting.png']
         # alternatively:
         # input_location = '/dir/to/images'

@@ -52,21 +50,19 @@ class ImageMattingTest(unittest.TestCase):
             cv2.imwrite('result.png', next(result)['output_png'])
             print(f'Output written to {osp.abspath("result.png")}')

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)

-        result = img_matting(
-            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-        )
+        result = img_matting('data/test/images/image_matting.png')
         cv2.imwrite('result.png', result['output_png'])
         print(f'Output written to {osp.abspath("result.png")}')

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         img_matting = pipeline(Tasks.image_matting)

-        result = img_matting(
-            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-        )
+        result = img_matting('data/test/images/image_matting.png')
         cv2.imwrite('result.png', result['output_png'])
         print(f'Output written to {osp.abspath("result.png")}')



+3 -0  tests/pipelines/test_person_image_cartoon.py

@@ -8,6 +8,7 @@ import cv2
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level


 class ImageCartoonTest(unittest.TestCase):
@@ -36,10 +37,12 @@ class ImageCartoonTest(unittest.TestCase):
         img_cartoon = pipeline(Tasks.image_generation, model=model_dir)
         self.pipeline_inference(img_cartoon, self.test_image)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_modelhub(self):
         img_cartoon = pipeline(Tasks.image_generation, model=self.model_id)
         self.pipeline_inference(img_cartoon, self.test_image)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         img_cartoon = pipeline(Tasks.image_generation)
         self.pipeline_inference(img_cartoon, self.test_image)


+67 -0  tests/pipelines/test_sentence_similarity.py

@@ -0,0 +1,67 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import shutil
+import unittest
+
+from maas_hub.snapshot_download import snapshot_download
+
+from modelscope.models import Model
+from modelscope.models.nlp import SbertForSentenceSimilarity
+from modelscope.pipelines import SentenceSimilarityPipeline, pipeline
+from modelscope.preprocessors import SequenceClassificationPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import get_model_cache_dir
+from modelscope.utils.test_utils import test_level
+
+
+class SentenceSimilarityTest(unittest.TestCase):
+    model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+    sentence1 = '今天气温比昨天高么?'
+    sentence2 = '今天湿度比昨天高么?'
+
+    def setUp(self) -> None:
+        # switch to False if downloading everytime is not desired
+        purge_cache = True
+        if purge_cache:
+            shutil.rmtree(
+                get_model_cache_dir(self.model_id), ignore_errors=True)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = SequenceClassificationPreprocessor(cache_path)
+        model = SbertForSentenceSimilarity(cache_path, tokenizer=tokenizer)
+        pipeline1 = SentenceSimilarityPipeline(model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.sentence_similarity, model=model, preprocessor=tokenizer)
+        print('test1')
+        print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
+              f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}')
+        print()
+        print(
+            f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
+            f'pipeline2: {pipeline2(input=(self.sentence1, self.sentence2))}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = SequenceClassificationPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_similarity,
+            model=model,
+            preprocessor=tokenizer)
+        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_similarity, model=self.model_id)
+        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.sentence_similarity)
+        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
+
+if __name__ == '__main__':
+    unittest.main()

+56 -0  tests/pipelines/test_speech_signal_process.py

@@ -0,0 +1,56 @@
+import os.path
+import shutil
+import unittest
+
+from modelscope.fileio import File
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import get_model_cache_dir
+
+NEAREND_MIC_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/nearend_mic.wav'
+FAREND_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/farend_speech.wav'
+NEAREND_MIC_FILE = 'nearend_mic.wav'
+FAREND_SPEECH_FILE = 'farend_speech.wav'
+
+AEC_LIB_URL = 'http://isv-data.oss-cn-hangzhou.aliyuncs.com/ics%2FMaaS%2FAEC%2Flib%2Flibmitaec_pyio.so' \
+              '?Expires=1664085465&OSSAccessKeyId=LTAIxjQyZNde90zh&Signature=Y7gelmGEsQAJRK4yyHSYMrdWizk%3D'
+AEC_LIB_FILE = 'libmitaec_pyio.so'
+
+
+def download(remote_path, local_path):
+    local_dir = os.path.dirname(local_path)
+    if len(local_dir) > 0:
+        if not os.path.exists(local_dir):
+            os.makedirs(local_dir)
+    with open(local_path, 'wb') as ofile:
+        ofile.write(File.read(remote_path))
+
+
+class SpeechSignalProcessTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/speech_dfsmn_aec_psm_16k'
+        # switch to False if downloading everytime is not desired
+        purge_cache = True
+        if purge_cache:
+            shutil.rmtree(
+                get_model_cache_dir(self.model_id), ignore_errors=True)
+        # A temporary hack to provide c++ lib. Download it first.
+        download(AEC_LIB_URL, AEC_LIB_FILE)
+
+    def test_run(self):
+        download(NEAREND_MIC_URL, NEAREND_MIC_FILE)
+        download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE)
+        input = {
+            'nearend_mic': NEAREND_MIC_FILE,
+            'farend_speech': FAREND_SPEECH_FILE
+        }
+        aec = pipeline(
+            Tasks.speech_signal_process,
+            model=self.model_id,
+            pipeline_name=r'speech_dfsmn_aec_psm_16k')
+        aec(input, output_path='output.wav')
+
+
+if __name__ == '__main__':
+    unittest.main()

+6 -0  tests/pipelines/test_text_classification.py

@@ -12,6 +12,7 @@ from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import Hubs, Tasks
 from modelscope.utils.hub import get_model_cache_dir
+from modelscope.utils.test_utils import test_level


 class SequenceClassificationTest(unittest.TestCase):
@@ -43,6 +44,7 @@ class SequenceClassificationTest(unittest.TestCase):
             break
         print(r)

+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run(self):
         model_url = 'https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com' \
                     '/release/easynlp_modelzoo/alibaba-pai/bert-base-sst2.zip'
@@ -67,6 +69,7 @@ class SequenceClassificationTest(unittest.TestCase):
             Tasks.text_classification, model=model, preprocessor=preprocessor)
         print(pipeline2('Hello world!'))

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
         preprocessor = SequenceClassificationPreprocessor(
@@ -77,6 +80,7 @@ class SequenceClassificationTest(unittest.TestCase):
             preprocessor=preprocessor)
         self.predict(pipeline_ins)

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         text_classification = pipeline(
             task=Tasks.text_classification, model=self.model_id)
@@ -85,6 +89,7 @@ class SequenceClassificationTest(unittest.TestCase):
             'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
         self.printDataset(result)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_default_model(self):
         text_classification = pipeline(task=Tasks.text_classification)
         result = text_classification(
@@ -92,6 +97,7 @@ class SequenceClassificationTest(unittest.TestCase):
             'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
         self.printDataset(result)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_dataset(self):
         model = Model.from_pretrained(self.model_id)
         preprocessor = SequenceClassificationPreprocessor(


+54 -26  tests/pipelines/test_text_generation.py

@@ -4,47 +4,75 @@ import unittest
 from maas_hub.snapshot_download import snapshot_download

 from modelscope.models import Model
-from modelscope.models.nlp import PalmForTextGenerationModel
+from modelscope.models.nlp import PalmForTextGeneration
 from modelscope.pipelines import TextGenerationPipeline, pipeline
 from modelscope.preprocessors import TextGenerationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level


 class TextGenerationTest(unittest.TestCase):
-    model_id = 'damo/nlp_palm_text-generation_chinese'
-    input1 = "今日天气类型='晴'&温度变化趋势='大幅上升'&最低气温='28℃'&最高气温='31℃'&体感='湿热'"
-    input2 = "今日天气类型='多云'&体感='舒适'&最低气温='26℃'&最高气温='30℃'"
+    model_id_zh = 'damo/nlp_palm2.0_text-generation_chinese-base'
+    model_id_en = 'damo/nlp_palm2.0_text-generation_english-base'
+    input_zh = """
+    本文总结了十个可穿戴产品的设计原则,而这些原则,同样也是笔者认为是这个行业最吸引人的地方:
+    1.为人们解决重复性问题;2.从人开始,而不是从机器开始;3.要引起注意,但不要刻意;4.提升用户能力,而不是取代
+    """
+    input_en = """
+    The Director of Public Prosecutions who let off Lord Janner over alleged child sex abuse started
+    her career at a legal chambers when the disgraced Labour peer was a top QC there . Alison Saunders ,
+    54 , sparked outrage last week when she decided the 86-year-old should not face astring of charges
+    of paedophilia against nine children because he has dementia . Today , newly-released documents
+    revealed damning evidence that abuse was covered up by police andsocial workers for more than 20 years .
+    And now it has emerged Mrs Saunders ' law career got off to a flying start when she secured her
+    pupillage -- a barrister 's training contract at 1 Garden Court Chambers in London in 1983 .
+    """

-    @unittest.skip('skip temporarily to save test time')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run(self):
-        cache_path = snapshot_download(self.model_id)
-        preprocessor = TextGenerationPreprocessor(
-            cache_path, first_sequence='sentence', second_sequence=None)
-        model = PalmForTextGenerationModel(
-            cache_path, tokenizer=preprocessor.tokenizer)
-        pipeline1 = TextGenerationPipeline(model, preprocessor)
-        pipeline2 = pipeline(
-            Tasks.text_generation, model=model, preprocessor=preprocessor)
-        print(f'input: {self.input1}\npipeline1: {pipeline1(self.input1)}')
-        print()
-        print(f'input: {self.input2}\npipeline2: {pipeline2(self.input2)}')
+        for model_id, input in ((self.model_id_zh, self.input_zh),
+                                (self.model_id_en, self.input_en)):
+            cache_path = snapshot_download(model_id)
+            model = PalmForTextGeneration(cache_path)
+            preprocessor = TextGenerationPreprocessor(
+                cache_path,
+                model.tokenizer,
+                first_sequence='sentence',
+                second_sequence=None)
+            pipeline1 = TextGenerationPipeline(model, preprocessor)
+            pipeline2 = pipeline(
+                Tasks.text_generation, model=model, preprocessor=preprocessor)
+            print(
+                f'pipeline1: {pipeline1(input)}\npipeline2: {pipeline2(input)}'
+            )

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id)
-        preprocessor = TextGenerationPreprocessor(
-            model.model_dir, first_sequence='sentence', second_sequence=None)
-        pipeline_ins = pipeline(
-            task=Tasks.text_generation, model=model, preprocessor=preprocessor)
-        print(pipeline_ins(self.input1))
+        for model_id, input in ((self.model_id_zh, self.input_zh),
+                                (self.model_id_en, self.input_en)):
+            model = Model.from_pretrained(model_id)
+            preprocessor = TextGenerationPreprocessor(
+                model.model_dir,
+                model.tokenizer,
+                first_sequence='sentence',
+                second_sequence=None)
+            pipeline_ins = pipeline(
+                task=Tasks.text_generation,
+                model=model,
+                preprocessor=preprocessor)
+            print(pipeline_ins(input))

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_name(self):
-        pipeline_ins = pipeline(
-            task=Tasks.text_generation, model=self.model_id)
-        print(pipeline_ins(self.input2))
+        for model_id, input in ((self.model_id_zh, self.input_zh),
+                                (self.model_id_en, self.input_en)):
+            pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
+            print(pipeline_ins(input))

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.text_generation)
-        print(pipeline_ins(self.input2))
+        print(pipeline_ins(self.input_zh))


 if __name__ == '__main__':


+62 -0  tests/pipelines/test_word_segmentation.py

@@ -0,0 +1,62 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import shutil
+import unittest
+
+from maas_hub.snapshot_download import snapshot_download
+
+from modelscope.models import Model
+from modelscope.models.nlp import StructBertForTokenClassification
+from modelscope.pipelines import WordSegmentationPipeline, pipeline
+from modelscope.preprocessors import TokenClassifcationPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import get_model_cache_dir
+from modelscope.utils.test_utils import test_level
+
+
+class WordSegmentationTest(unittest.TestCase):
+    model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
+    sentence = '今天天气不错,适合出去游玩'
+
+    def setUp(self) -> None:
+        # switch to False if downloading everytime is not desired
+        purge_cache = True
+        if purge_cache:
+            shutil.rmtree(
+                get_model_cache_dir(self.model_id), ignore_errors=True)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = TokenClassifcationPreprocessor(cache_path)
+        model = StructBertForTokenClassification(
+            cache_path, tokenizer=tokenizer)
+        pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.word_segmentation, model=model, preprocessor=tokenizer)
+        print(f'sentence: {self.sentence}\n'
+              f'pipeline1:{pipeline1(input=self.sentence)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.sentence)}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = TokenClassifcationPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.word_segmentation, model=model, preprocessor=tokenizer)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.word_segmentation, model=self.model_id)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.word_segmentation)
+        print(pipeline_ins(input=self.sentence))
+
+
+if __name__ == '__main__':
+    unittest.main()

+20 -0  tests/preprocessors/test_image.py

@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+import PIL
+
+from modelscope.preprocessors import load_image
+from modelscope.utils.logger import get_logger
+
+
+class ImagePreprocessorTest(unittest.TestCase):
+
+    def test_load(self):
+        img = load_image('data/test/images/image_matting.png')
+        self.assertTrue(isinstance(img, PIL.Image.Image))
+        self.assertEqual(img.size, (948, 533))
+
+
+if __name__ == '__main__':
+    unittest.main()

+9 -0  tests/run.py

@@ -7,6 +7,11 @@ import sys
 import unittest
 from fnmatch import fnmatch

+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import set_test_level, test_level
+
+logger = get_logger()
+

 def gather_test_cases(test_dir, pattern, list_tests):
     case_list = []
@@ -49,5 +54,9 @@ if __name__ == '__main__':
         '--pattern', default='test_*.py', help='test file pattern')
     parser.add_argument(
         '--test_dir', default='tests', help='directory to be tested')
+    parser.add_argument(
+        '--level', default=0, help='2 -- all, 1 -- p1, 0 -- p0')
     args = parser.parse_args()
+    set_test_level(args.level)
+    logger.info(f'TEST LEVEL: {test_level()}')
     main(args)
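One subtlety: because test_level() re-reads TEST_LEVEL from the environment on every call, an exported variable takes precedence over whatever --level stored via set_test_level(). A small sketch of that precedence:

    import os

    from modelscope.utils.test_utils import set_test_level, test_level

    set_test_level(1)  # roughly what `python tests/run.py --level 1` does
    os.environ['TEST_LEVEL'] = '2'
    print(test_level())  # -> 2; the environment wins on the next lookup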

+5 -8  tests/utils/test_config.py

@@ -1,11 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import argparse
 import os.path as osp
 import tempfile
 import unittest
-from pathlib import Path

-from modelscope.fileio import dump, load
 from modelscope.utils.config import Config

 obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
@@ -14,25 +11,25 @@ obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
 class ConfigTest(unittest.TestCase):

     def test_json(self):
-        config_file = 'configs/examples/config.json'
+        config_file = 'configs/examples/configuration.json'
         cfg = Config.from_file(config_file)
         self.assertEqual(cfg.a, 1)
         self.assertEqual(cfg.b, obj['b'])

     def test_yaml(self):
-        config_file = 'configs/examples/config.yaml'
+        config_file = 'configs/examples/configuration.yaml'
         cfg = Config.from_file(config_file)
         self.assertEqual(cfg.a, 1)
         self.assertEqual(cfg.b, obj['b'])

     def test_py(self):
-        config_file = 'configs/examples/config.py'
+        config_file = 'configs/examples/configuration.py'
         cfg = Config.from_file(config_file)
         self.assertEqual(cfg.a, 1)
         self.assertEqual(cfg.b, obj['b'])

     def test_dump(self):
-        config_file = 'configs/examples/config.py'
+        config_file = 'configs/examples/configuration.py'
         cfg = Config.from_file(config_file)
         self.assertEqual(cfg.a, 1)
         self.assertEqual(cfg.b, obj['b'])
@@ -53,7 +50,7 @@ class ConfigTest(unittest.TestCase):
         self.assertEqual(yaml_str, infile.read())

     def test_to_dict(self):
-        config_file = 'configs/examples/config.json'
+        config_file = 'configs/examples/configuration.json'
         cfg = Config.from_file(config_file)
         d = cfg.to_dict()
         print(d)

