
space intent and modeling (generation) are ready

master · ly119399 · 3 years ago · parent commit d0b33eade8
74 changed files with 3689 additions and 267 deletions
  1. .gitattributes (+3, -0)
  2. .gitignore (+0, -1)
  3. Makefile.docker (+67, -0)
  4. configs/examples/configuration.json (+0, -0)
  5. configs/examples/configuration.py (+0, -0)
  6. configs/examples/configuration.yaml (+0, -0)
  7. data/test/images/image1.jpg (+3, -0)
  8. data/test/images/image_matting.png (+3, -0)
  9. docker/.dockerignore (+4, -0)
  10. docker/pytorch.dockerfile (+53, -0)
  11. docker/rcfiles/pip.conf.tsinghua (+2, -0)
  12. docker/rcfiles/sources.list.aliyun (+25, -0)
  13. docker/rcfiles/user.vimrc (+10, -0)
  14. docker/scripts/install_libs.sh (+12, -0)
  15. docs/source/conf.py (+1, -1)
  16. docs/source/develop.md (+120, -3)
  17. modelscope/models/__init__.py (+1, -1)
  18. modelscope/models/audio/__init__.py (+0, -0)
  19. modelscope/models/audio/layers/__init__.py (+0, -0)
  20. modelscope/models/audio/layers/activations.py (+60, -0)
  21. modelscope/models/audio/layers/affine_transform.py (+78, -0)
  22. modelscope/models/audio/layers/deep_fsmn.py (+178, -0)
  23. modelscope/models/audio/layers/layer_base.py (+50, -0)
  24. modelscope/models/audio/layers/uni_deep_fsmn.py (+482, -0)
  25. modelscope/models/audio/network/__init__.py (+0, -0)
  26. modelscope/models/audio/network/loss.py (+394, -0)
  27. modelscope/models/audio/network/modulation_loss.py (+248, -0)
  28. modelscope/models/audio/network/se_net.py (+483, -0)
  29. modelscope/models/base.py (+17, -9)
  30. modelscope/models/nlp/__init__.py (+4, -2)
  31. modelscope/models/nlp/bert_for_sequence_classification.py (+17, -0)
  32. modelscope/models/nlp/palm_for_text_generation.py (+43, -0)
  33. modelscope/models/nlp/sbert_for_sentence_similarity.py (+88, -0)
  34. modelscope/models/nlp/sbert_for_token_classification.py (+56, -0)
  35. modelscope/models/nlp/text_generation_model.py (+0, -52)
  36. modelscope/pipelines/__init__.py (+1, -1)
  37. modelscope/pipelines/audio/__init__.py (+1, -0)
  38. modelscope/pipelines/audio/linear_aec_pipeline.py (+160, -0)
  39. modelscope/pipelines/base.py (+29, -1)
  40. modelscope/pipelines/builder.py (+10, -7)
  41. modelscope/pipelines/cv/image_matting_pipeline.py (+3, -3)
  42. modelscope/pipelines/multi_modal/__init__.py (+1, -1)
  43. modelscope/pipelines/multi_modal/image_caption_pipeline.py (+5, -2)
  44. modelscope/pipelines/nlp/__init__.py (+2, -0)
  45. modelscope/pipelines/nlp/sentence_similarity_pipeline.py (+62, -0)
  46. modelscope/pipelines/nlp/sequence_classification_pipeline.py (+16, -40)
  47. modelscope/pipelines/nlp/text_generation_pipeline.py (+23, -20)
  48. modelscope/pipelines/nlp/word_segmentation_pipeline.py (+69, -0)
  49. modelscope/pipelines/outputs.py (+117, -0)
  50. modelscope/pipelines/util.py (+21, -17)
  51. modelscope/preprocessors/__init__.py (+1, -1)
  52. modelscope/preprocessors/audio.py (+230, -0)
  53. modelscope/preprocessors/image.py (+1, -1)
  54. modelscope/preprocessors/nlp.py (+82, -16)
  55. modelscope/utils/config.py (+6, -6)
  56. modelscope/utils/constant.py (+15, -13)
  57. modelscope/utils/registry.py (+1, -1)
  58. modelscope/utils/test_utils.py (+20, -0)
  59. requirements/docs.txt (+1, -0)
  60. requirements/nlp.txt (+1, -1)
  61. requirements/runtime.txt (+3, -2)
  62. setup.cfg (+2, -1)
  63. tests/pipelines/test_base.py (+8, -11)
  64. tests/pipelines/test_image_captioning.py (+3, -4)
  65. tests/pipelines/test_image_matting.py (+11, -15)
  66. tests/pipelines/test_person_image_cartoon.py (+3, -0)
  67. tests/pipelines/test_sentence_similarity.py (+67, -0)
  68. tests/pipelines/test_speech_signal_process.py (+56, -0)
  69. tests/pipelines/test_text_classification.py (+6, -0)
  70. tests/pipelines/test_text_generation.py (+54, -26)
  71. tests/pipelines/test_word_segmentation.py (+62, -0)
  72. tests/preprocessors/test_image.py (+20, -0)
  73. tests/run.py (+9, -0)
  74. tests/utils/test_config.py (+5, -8)

.gitattributes (+3, -0)

@@ -0,0 +1,3 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text

.gitignore (+0, -1)

@@ -104,7 +104,6 @@ venv.bak/
# mypy
.mypy_cache/

data
.vscode
.idea



Makefile.docker (+67, -0)

@@ -0,0 +1,67 @@
DOCKER_REGISTRY = registry.cn-shanghai.aliyuncs.com
DOCKER_ORG = modelscope
DOCKER_IMAGE = modelscope
DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE)

# CUDA_VERSION = 11.3
# CUDNN_VERSION = 8
BASE_RUNTIME = reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
# BASE_DEVEL = reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
BASE_DEVEL = pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel


MODELSCOPE_VERSION = $(shell git describe --tags --always)

# Can be either official / dev
BUILD_TYPE = dev
BUILD_PROGRESS = auto
BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE)

EXTRA_DOCKER_BUILD_FLAGS ?= --network=host
# DOCKER_BUILD = DOCKER_BUILDKIT=1 \
# docker build \
# --progress=$(BUILD_PROGRESS) \
# $(EXTRA_DOCKER_BUILD_FLAGS) \
# --target $(BUILD_TYPE) \
# -t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
# $(BUILD_ARGS) \
# -f docker/pytorch.dockerfile .
DOCKER_BUILD = DOCKER_BUILDKIT=1 \
docker build \
$(EXTRA_DOCKER_BUILD_FLAGS) \
-t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
$(BUILD_ARGS) \
-f docker/pytorch.dockerfile .
DOCKER_PUSH = docker push $(DOCKER_FULL_NAME):$(DOCKER_TAG)

.PHONY: all
all: devel-image

.PHONY: devel-image
devel-image: BASE_IMAGE := $(BASE_DEVEL)
devel-image: DOCKER_TAG := $(MODELSCOPE_VERSION)-devel
devel-image:
$(DOCKER_BUILD)

.PHONY: devel-push
devel-push: BASE_IMAGE := $(BASE_DEVEL)
devel-push: DOCKER_TAG := $(MODELSCOPE_VERSION)-devel
devel-push:
$(DOCKER_PUSH)

.PHONY: runtime-image
runtime-image: BASE_IMAGE := $(BASE_RUNTIME)
runtime-image: DOCKER_TAG := $(MODELSCOPE_VERSION)-runtime
runtime-image:
$(DOCKER_BUILD)
docker tag $(DOCKER_FULL_NAME):$(DOCKER_TAG) $(DOCKER_FULL_NAME):latest

.PHONY: runtime-push
runtime-push: BASE_IMAGE := $(BASE_RUNTIME)
runtime-push: DOCKER_TAG := $(MODELSCOPE_VERSION)-runtime
runtime-push:
$(DOCKER_PUSH)

.PHONY: clean
clean:
-docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME))

configs/examples/config.json → configs/examples/configuration.json


configs/examples/config.py → configs/examples/configuration.py


configs/examples/config.yaml → configs/examples/configuration.yaml


data/test/images/image1.jpg (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:78094cc48fbcfd9b6d321fe13619ecc72b65e006fc1b4c4458409ade9979486d
size 129862

data/test/images/image_matting.png (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
size 603621

docker/.dockerignore (+4, -0)

@@ -0,0 +1,4 @@
*.sh
*.md
*.dockerfile
*.zip

docker/pytorch.dockerfile (+53, -0)

@@ -0,0 +1,53 @@
# syntax = docker/dockerfile:experimental
#
# NOTE: To build this you will need a docker version > 18.06 with
# experimental enabled and DOCKER_BUILDKIT=1
#
# If you do not use buildkit you are not going to have a good time
#
# For reference:
# https://docs.docker.com/develop/develop-images/build_enhancements/

# ARG BASE_IMAGE=reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04
# FROM ${BASE_IMAGE} as dev-base

# FROM reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04 as dev-base
FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel
# FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
# config pip source
RUN mkdir /root/.pip
COPY docker/rcfiles/pip.conf.tsinghua /root/.pip/pip.conf
COPY docker/rcfiles/sources.list.aliyun /etc/apt/sources.list

# Install essential Ubuntu packages
RUN apt-get update &&\
apt-get install -y software-properties-common \
build-essential \
git \
wget \
vim \
curl \
zip \
zlib1g-dev \
unzip \
pkg-config

# install modelscope and its python env
WORKDIR /opt/modelscope
COPY . .
RUN pip install -r requirements.txt
# RUN --mount=type=cache,target=/opt/ccache \
# python setup.py install

# opencv-python-headless conflicts with the already-installed opencv-python
RUN python setup.py install \
&& pip uninstall -y opencv-python-headless

# prepare modelscope libs
COPY docker/scripts/install_libs.sh /tmp/
RUN bash /tmp/install_libs.sh && \
rm -rf /tmp/install_libs.sh

ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/modelscope/lib64

WORKDIR /workspace

docker/rcfiles/pip.conf.tsinghua (+2, -0)

@@ -0,0 +1,2 @@
[global]
index-url=https://pypi.tuna.tsinghua.edu.cn/simple

docker/rcfiles/sources.list.aliyun (+25, -0)

@@ -0,0 +1,25 @@
deb http://mirrors.aliyun.com/ubuntu/ bionic main restricted
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic main restricted

deb http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted

deb http://mirrors.aliyun.com/ubuntu/ bionic universe
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic universe
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates universe
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates universe

deb http://mirrors.aliyun.com/ubuntu/ bionic multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates multiverse

deb http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
# deb-src http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse

deb http://mirrors.aliyun.com/ubuntu bionic-security main restricted
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security main restricted
deb http://mirrors.aliyun.com/ubuntu bionic-security universe
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security universe
deb http://mirrors.aliyun.com/ubuntu bionic-security multiverse
# deb-src http://mirrors.aliyun.com/ubuntu bionic-security multiverse

docker/rcfiles/user.vimrc (+10, -0)

@@ -0,0 +1,10 @@
set nocompatible
set encoding=utf-8
set hlsearch
set smartindent
set ruler
set number
set ts=2
set sw=2
set expandtab
autocmd FileType make setlocal noexpandtab

docker/scripts/install_libs.sh (+12, -0)

@@ -0,0 +1,12 @@
#!/bin/bash

set -eo pipefail

ModelScopeLib=/usr/local/modelscope/lib64

if [ ! -d /usr/local/modelscope ]; then
mkdir -p $ModelScopeLib
fi

# audio libs
wget "http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/release/maas/libs/audio/libmitaec_pyio.so" -O ${ModelScopeLib}/libmitaec_pyio.so

docs/source/conf.py (+1, -1)

@@ -76,7 +76,7 @@ exclude_patterns = ['build', 'Thumbs.db', '.DS_Store']
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme = 'sphinx_book_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
html_theme_options = {}



docs/source/develop.md (+120, -3)

@@ -34,13 +34,111 @@ make linter
```

## 2. Test
### 2.1 Unit test

### 2.1 Test level

There are mainly three test levels:

* level 0: tests for the basic interfaces and functions of the framework, such as `tests/trainers/test_trainer_base.py`
* level 1: important functional tests which cover end-to-end workflows, such as `tests/pipelines/test_image_matting.py`
* level 2: scenario tests for all the implemented modules, such as models and pipelines in different algorithm fields.

The default test level is 0, which only runs level-0 cases. You can set the test level
via the environment variable `TEST_LEVEL`. For more details, refer to [test-doc](https://alidocs.dingtalk.com/i/nodes/mdvQnONayjBJKLXy1Bp38PY2MeXzp5o0?dontjump=true&nav=spaces&navQuery=spaceId%3Dnb9XJNlZxbgrOXyA)


```bash
# run all tests
TEST_LEVEL=2 make test

# run important functional tests
TEST_LEVEL=1 make test

# run core UT and basic functional tests
make test
```

### 2.2 Test data
TODO
When writing test cases, you should assign a test level to your test case using the
following code. If left at the default, the test level will be 0 and the case will run
in every test stage.

File: `test_module.py`
```python
from modelscope.utils.test_utils import test_level

class ImageCartoonTest(unittest.TestCase):
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_run_by_direct_model_download(self):
pass
```

### 2.2 Run tests

1. Run your own single test case to test your self-implemented function. You can run your
test file directly; if it fails to run, please check whether the variable `TEST_LEVEL`
exists in the environment and unset it.
```bash
python tests/path/to/your_test.py
```

2. Remember to run the core tests in your local environment before starting a code review;
by default this only runs test cases with level 0.
```bash
make test
```

3. After you start a code review, CI tests will be triggered, which run test cases with level 1.

4. Daily regression tests will run all cases at 0 AM each day using the master branch.

### 2.3 Test data storage

As we need a lot of data for testing, including images, videos, and models, we use git-lfs
to store those large files.

1. Install git-lfs
For macOS:
```bash
brew install git-lfs
git lfs install
```

For CentOS, please download the rpm from the git-lfs GitHub release [website](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0):
```bash
wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm
sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm
git lfs install
```

For Ubuntu:
```bash
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt-get install git-lfs
git lfs install
```

2. Track your data types using git-lfs; for example, to track png files:
```bash
git lfs track "*.png"
```

3. Add your test files to the `data/test/` folder; you can create directories if you need to.
```bash
git add data/test/test.png
```

4. Commit your test data to the remote branch:
```bash
git commit -m "xxx"
```

To pull data from the remote repo, do it the same way you pull git files:
```bash
git pull origin branch_name
```




## Code Review

@@ -93,3 +191,22 @@ TODO
```bash
make whl
```

## Build docker

Build the develop image:
```bash
sudo make -f Makefile.docker devel-image
```

Push the develop image (for the password, please ask wenmeng.zwm):
```bash
sudo docker login --username=mass_test@test.aliyunid.com registry.cn-shanghai.aliyuncs.com
Password:
sudo make -f Makefile.docker devel-push
```

To build the runtime image, just replace `devel` with `runtime` in the commands above:
```bash
sudo make -f Makefile.docker runtime-image runtime-push
```

modelscope/models/__init__.py (+1, -1)

@@ -2,4 +2,4 @@

from .base import Model
from .builder import MODELS, build_model
from .nlp import BertForSequenceClassification
from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity

modelscope/models/audio/__init__.py (+0, -0)


modelscope/models/audio/layers/__init__.py (+0, -0)


modelscope/models/audio/layers/activations.py (+60, -0)

@@ -0,0 +1,60 @@
import torch.nn as nn

from .layer_base import LayerBase


class RectifiedLinear(LayerBase):

def __init__(self, input_dim, output_dim):
super(RectifiedLinear, self).__init__()
self.dim = input_dim
self.relu = nn.ReLU()

def forward(self, input):
return self.relu(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<RectifiedLinear> %d %d\n' % (self.dim, self.dim)
return re_str

def load_kaldi_nnet(self, instr):
return instr


class LogSoftmax(LayerBase):

def __init__(self, input_dim, output_dim):
super(LogSoftmax, self).__init__()
self.dim = input_dim
self.ls = nn.LogSoftmax()

def forward(self, input):
return self.ls(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Softmax> %d %d\n' % (self.dim, self.dim)
return re_str

def load_kaldi_nnet(self, instr):
return instr


class Sigmoid(LayerBase):

def __init__(self, input_dim, output_dim):
super(Sigmoid, self).__init__()
self.dim = input_dim
self.sig = nn.Sigmoid()

def forward(self, input):
return self.sig(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Sigmoid> %d %d\n' % (self.dim, self.dim)
return re_str

def load_kaldi_nnet(self, instr):
return instr
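
A minimal usage sketch for these wrappers (assuming this commit's module path `modelscope.models.audio.layers.activations`): each layer keeps only a dimension for Kaldi serialization and delegates the forward pass to the underlying torch activation.

```python
# Sketch, not part of the commit: exercise RectifiedLinear and its Kaldi export.
import torch
from modelscope.models.audio.layers.activations import RectifiedLinear

relu = RectifiedLinear(input_dim=4, output_dim=4)
print(relu(torch.tensor([-1.0, 0.5, 2.0, -3.0])))  # tensor([0.0000, 0.5000, 2.0000, 0.0000])
print(relu.to_kaldi_nnet())                        # "<RectifiedLinear> 4 4"
```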

modelscope/models/audio/layers/affine_transform.py (+78, -0)

@@ -0,0 +1,78 @@
import numpy as np
import torch as th
import torch.nn as nn

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
to_kaldi_matrix)


class AffineTransform(LayerBase):

def __init__(self, input_dim, output_dim):
super(AffineTransform, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.linear = nn.Linear(input_dim, output_dim)

def forward(self, input):
return self.linear(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<AffineTransform> %d %d\n' % (self.output_dim,
self.input_dim)
re_str += '<LearnRateCoef> 1 <BiasLearnRateCoef> 1 <MaxNorm> 0\n'
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
return re_str

def to_raw_nnet(self, fid):
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
x.tofile(fid)

linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
x.tofile(fid)

def load_kaldi_nnet(self, instr):
output = expect_token_number(
instr,
'<LearnRateCoef>',
)
if output is None:
raise Exception('AffineTransform format error for <LearnRateCoef>')
instr, lr = output

output = expect_token_number(instr, '<BiasLearnRateCoef>')
if output is None:
raise Exception(
'AffineTransform format error for <BiasLearnRateCoef>')
instr, lr = output

output = expect_token_number(instr, '<MaxNorm>')
if output is None:
raise Exception('AffineTransform format error for <MaxNorm>')
instr, lr = output

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('AffineTransform format error for parsing matrix')
instr, mat = output

print(mat.shape)
self.linear.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('AffineTransform format error for parsing matrix')
instr, mat = output
mat = np.squeeze(mat)
self.linear.bias = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))
return instr
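
A hedged round-trip sketch for `AffineTransform` (import path assumed from this commit): `to_kaldi_nnet()` emits a header line that `load_kaldi_nnet()` does not consume, so strip it before parsing.

```python
# Sketch: Kaldi-text round trip for AffineTransform.
import torch as th
from modelscope.models.audio.layers.affine_transform import AffineTransform

src = AffineTransform(input_dim=3, output_dim=2)
text = src.to_kaldi_nnet()                    # "<AffineTransform> 2 3" header, then weights
dst = AffineTransform(input_dim=3, output_dim=2)
dst.load_kaldi_nnet(text.split('\n', 1)[1])   # parse everything after the header line
assert th.allclose(src.linear.weight, dst.linear.weight, atol=1e-4)
```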

modelscope/models/audio/layers/deep_fsmn.py (+178, -0)

@@ -0,0 +1,178 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
to_kaldi_matrix)


class DeepFsmn(LayerBase):

def __init__(self,
input_dim,
output_dim,
lorder=None,
rorder=None,
hidden_size=None,
layer_norm=False,
dropout=0):
super(DeepFsmn, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim

if lorder is None:
return

self.lorder = lorder
self.rorder = rorder
self.hidden_size = hidden_size
self.layer_norm = layer_norm

self.linear = nn.Linear(input_dim, hidden_size)
self.norm = nn.LayerNorm(hidden_size)
self.drop1 = nn.Dropout(p=dropout)
self.drop2 = nn.Dropout(p=dropout)
self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.conv1 = nn.Conv2d(
output_dim,
output_dim, [lorder, 1], [1, 1],
groups=output_dim,
bias=False)
self.conv2 = nn.Conv2d(
output_dim,
output_dim, [rorder, 1], [1, 1],
groups=output_dim,
bias=False)

def forward(self, input):

f1 = F.relu(self.linear(input))

f1 = self.drop1(f1)
if self.layer_norm:
f1 = self.norm(f1)

p1 = self.project(f1)

x = th.unsqueeze(p1, 1)

x_per = x.permute(0, 3, 2, 1)

y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
yr = F.pad(x_per, [0, 0, 0, self.rorder])
yr = yr[:, :, 1:, :]

out = x_per + self.conv1(y) + self.conv2(yr)
out = self.drop2(out)

out1 = out.permute(0, 3, 2, 1)

return input + out1.squeeze()

def to_kaldi_nnet(self):
re_str = ''
re_str += '<UniDeepFsmn> %d %d\n'\
% (self.output_dim, self.input_dim)
re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n'\
% (1, self.hidden_size, self.lorder, 1)
lfiters = self.state_dict()['conv1.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
re_str += to_kaldi_matrix(x)
proj_weights = self.state_dict()['project.weight']
x = proj_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
return re_str

def load_kaldi_nnet(self, instr):
output = expect_token_number(
instr,
'<LearnRateCoef>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
instr, lr = output

output = expect_token_number(
instr,
'<HidSize>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <HidSize>')
instr, hiddensize = output
self.hidden_size = int(hiddensize)

output = expect_token_number(
instr,
'<LOrder>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LOrder>')
instr, lorder = output
self.lorder = int(lorder)

output = expect_token_number(
instr,
'<LStride>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LStride>')
instr, lstride = output
self.lstride = lstride

output = expect_token_number(
instr,
'<MaxNorm>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <MaxNorm>')

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
mat1 = np.fliplr(mat.T).copy()
self.conv1 = nn.Conv2d(
self.output_dim,
self.output_dim, [self.lorder, 1], [1, 1],
groups=self.output_dim,
bias=False)
mat_th = th.from_numpy(mat1).type(th.FloatTensor)
mat_th = mat_th.unsqueeze(1)
mat_th = mat_th.unsqueeze(3)
self.conv1.weight = th.nn.Parameter(mat_th)

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output

self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
self.linear = nn.Linear(self.input_dim, self.hidden_size)

self.project.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
self.linear.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
self.linear.bias = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

return instr
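
A quick shape check for `DeepFsmn` (import path assumed from this commit): the memory block is a grouped temporal convolution that looks `lorder` frames into the past and `rorder` frames into the future, and the residual connection keeps input and output shapes identical.

```python
# Sketch: run a DeepFsmn block over a [batch, frames, features] tensor.
import torch as th
from modelscope.models.audio.layers.deep_fsmn import DeepFsmn

layer = DeepFsmn(input_dim=64, output_dim=64, lorder=20, rorder=5, hidden_size=128)
x = th.randn(2, 100, 64)
y = layer(x)
assert y.shape == x.shape  # residual: input + memory-block output
```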

modelscope/models/audio/layers/layer_base.py (+50, -0)

@@ -0,0 +1,50 @@
import abc
import re

import numpy as np
import torch.nn as nn


def expect_token_number(instr, token):
first_token = re.match(r'^\s*' + token, instr)
if first_token is None:
return None
instr = instr[first_token.end():]
lr = re.match(r'^\s*(-?\d+\.?\d*e?-?\d*?)', instr)
if lr is None:
return None
return instr[lr.end():], lr.groups()[0]


def expect_kaldi_matrix(instr):
pos2 = instr.find('[', 0)
pos3 = instr.find(']', pos2)
mat = []
for stt in instr[pos2 + 1:pos3].split('\n'):
tmp_mat = np.fromstring(stt, dtype=np.float32, sep=' ')
if tmp_mat.size > 0:
mat.append(tmp_mat)
return instr[pos3 + 1:], np.array(mat)


def to_kaldi_matrix(np_mat):
"""
function that transform as str numpy mat to standard kaldi str matrix
:param np_mat: numpy mat
:return: str
"""
np.set_printoptions(threshold=np.inf, linewidth=np.nan, suppress=True)
out_str = str(np_mat)
out_str = out_str.replace('[', '')
out_str = out_str.replace(']', '')
return '[ %s ]\n' % out_str


class LayerBase(nn.Module, metaclass=abc.ABCMeta):

def __init__(self):
super(LayerBase, self).__init__()

@abc.abstractmethod
def to_kaldi_nnet(self):
pass
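
The helpers above thread a string through the parser, each returning the unconsumed remainder. A small sketch (import path assumed from this commit):

```python
# Sketch: parse a token and a matrix back out of Kaldi nnet1 text.
import numpy as np
from modelscope.models.audio.layers.layer_base import (
    expect_kaldi_matrix, expect_token_number, to_kaldi_matrix)

text = '<LearnRateCoef> 1 ' + to_kaldi_matrix(np.eye(2, dtype=np.float32))
rest, coef = expect_token_number(text, '<LearnRateCoef>')
rest, mat = expect_kaldi_matrix(rest)
print(coef, mat.shape)  # '1' (2, 2)
```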

modelscope/models/audio/layers/uni_deep_fsmn.py (+482, -0)

@@ -0,0 +1,482 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
to_kaldi_matrix)


class SepConv(nn.Module):

def __init__(self,
in_channels,
filters,
out_channels,
kernel_size=(5, 2),
dilation=(1, 1)):
""" :param kernel_size (time, frequency)

"""
super(SepConv, self).__init__()
# depthwise + pointwise
self.dconv = nn.Conv2d(
in_channels,
in_channels * filters,
kernel_size,
dilation=dilation,
groups=in_channels)
self.pconv = nn.Conv2d(
in_channels * filters, out_channels, kernel_size=1)
self.padding = dilation[0] * (kernel_size[0] - 1)

def forward(self, input):
''' input: [B, C, T, F]
'''
x = F.pad(input, [0, 0, self.padding, 0])
x = self.dconv(x)
x = self.pconv(x)
return x


class Conv2d(nn.Module):

def __init__(self,
input_dim,
output_dim,
lorder=20,
rorder=0,
groups=1,
bias=False,
skip_connect=True):
super(Conv2d, self).__init__()
self.lorder = lorder
self.conv = nn.Conv2d(
input_dim, output_dim, [lorder, 1], groups=groups, bias=bias)
self.rorder = rorder
if self.rorder:
self.conv2 = nn.Conv2d(
input_dim, output_dim, [rorder, 1], groups=groups, bias=bias)
self.skip_connect = skip_connect

def forward(self, input):
# [B, 1, T, F]
x = th.unsqueeze(input, 1)
# [B, F, T, 1]
x_per = x.permute(0, 3, 2, 1)
y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
out = self.conv(y)
if self.rorder:
yr = F.pad(x_per, [0, 0, 0, self.rorder])
yr = yr[:, :, 1:, :]
out += self.conv2(yr)
out = out.permute(0, 3, 2, 1).squeeze(1)
if self.skip_connect:
out = out + input
return out


class SelfAttLayer(nn.Module):

def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None):
super(SelfAttLayer, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim

if lorder is None:
return

self.lorder = lorder
self.hidden_size = hidden_size

self.linear = nn.Linear(input_dim, hidden_size)

self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.att = nn.Linear(input_dim, lorder, bias=False)

def forward(self, input):

f1 = F.relu(self.linear(input))

p1 = self.project(f1)

x = th.unsqueeze(p1, 1)

x_per = x.permute(0, 3, 2, 1)

y = F.pad(x_per, [0, 0, self.lorder - 1, 0])

# z [B, F, T, lorder]
z = x_per
for i in range(1, self.lorder):
z = th.cat([z, y[:, :, self.lorder - 1 - i:-i, :]], axis=-1)

# [B, T, lorder]
att = F.softmax(self.att(input), dim=-1)
att = th.unsqueeze(att, 1)
z = th.sum(z * att, axis=-1)

out1 = z.permute(0, 2, 1)

return input + out1


class TFFsmn(nn.Module):

def __init__(self,
input_dim,
output_dim,
lorder=None,
hidden_size=None,
dilation=1,
layer_norm=False,
dropout=0,
skip_connect=True):
super(TFFsmn, self).__init__()

self.skip_connect = skip_connect

self.linear = nn.Linear(input_dim, hidden_size)
self.norm = nn.Identity()
if layer_norm:
self.norm = nn.LayerNorm(input_dim)
self.act = nn.ReLU()
self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.conv1 = nn.Conv2d(
output_dim,
output_dim, [lorder, 1],
dilation=[dilation, 1],
groups=output_dim,
bias=False)
self.padding_left = dilation * (lorder - 1)
dorder = 5
self.conv2 = nn.Conv2d(1, 1, [dorder, 1], bias=False)
self.padding_freq = dorder - 1

def forward(self, input):
return self.compute1(input)

def compute1(self, input):
''' linear-dconv-relu(norm)-linear-dconv
'''
x = self.linear(input)
# [B, 1, F, T]
x = th.unsqueeze(x, 1).permute(0, 1, 3, 2)
z = F.pad(x, [0, 0, self.padding_freq, 0])
z = self.conv2(z) + x
x = z.permute(0, 3, 2, 1).squeeze(-1)
x = self.act(x)
x = self.norm(x)
x = self.project(x)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
# [B, F, T+lorder-1, 1]
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()

return input + out


class CNNFsmn(nn.Module):
''' use cnn to reduce parameters
'''

def __init__(self,
input_dim,
output_dim,
lorder=None,
hidden_size=None,
dilation=1,
layer_norm=False,
dropout=0,
skip_connect=True):
super(CNNFsmn, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim
self.skip_connect = skip_connect

if lorder is None:
return

self.lorder = lorder
self.hidden_size = hidden_size

self.linear = nn.Linear(input_dim, hidden_size)
self.act = nn.ReLU()
kernel_size = (3, 8)
stride = (1, 4)
self.conv = nn.Sequential(
nn.ConstantPad2d((stride[1], 0, kernel_size[0] - 1, 0), 0),
nn.Conv2d(1, stride[1], kernel_size=kernel_size, stride=stride))

self.dconv = nn.Conv2d(
output_dim,
output_dim, [lorder, 1],
dilation=[dilation, 1],
groups=output_dim,
bias=False)
self.padding_left = dilation * (lorder - 1)

def forward(self, input):
return self.compute2(input)

def compute1(self, input):
''' linear-relu(norm)-conv2d-relu?-dconv
'''
# [B, T, F]
x = self.linear(input)
x = self.act(x)
x = th.unsqueeze(x, 1)
x = self.conv(x)
# [B, C, T, F] -> [B, 1, T, F]
b, c, t, f = x.shape
x = x.view([b, 1, t, -1])
x = x.permute(0, 3, 2, 1)
# [B, F, T+lorder-1, 1]
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.dconv(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
return input + out

def compute2(self, input):
''' conv2d-relu-linear-relu?-dconv
'''
x = th.unsqueeze(input, 1)
x = self.conv(x)
x = self.act(x)
# [B, C, T, F] -> [B, T, F]
b, c, t, f = x.shape
x = x.view([b, t, -1])
x = self.linear(x)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.dconv(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
return input + out


class UniDeepFsmn(LayerBase):

def __init__(self,
input_dim,
output_dim,
lorder=None,
hidden_size=None,
dilation=1,
layer_norm=False,
dropout=0,
skip_connect=True):
super(UniDeepFsmn, self).__init__()

self.input_dim = input_dim
self.output_dim = output_dim
self.skip_connect = skip_connect

if lorder is None:
return

self.lorder = lorder
self.hidden_size = hidden_size

self.linear = nn.Linear(input_dim, hidden_size)
self.norm = nn.Identity()
if layer_norm:
self.norm = nn.LayerNorm(input_dim)
self.act = nn.ReLU()
self.project = nn.Linear(hidden_size, output_dim, bias=False)

self.conv1 = nn.Conv2d(
output_dim,
output_dim, [lorder, 1],
dilation=[dilation, 1],
groups=output_dim,
bias=False)
self.padding_left = dilation * (lorder - 1)

def forward(self, input):
return self.compute1(input)

def compute1(self, input):
''' linear-relu(norm)-linear-dconv
'''
# [B, T, F]
x = self.linear(input)
x = self.act(x)
x = self.norm(x)
x = self.project(x)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
# [B, F, T+lorder-1, 1]
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()

return input + out

def compute2(self, input):
''' linear-dconv-linear-relu(norm)
'''
x = self.project(input)
x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
x = self.linear(out)
x = self.act(x)
x = self.norm(x)

return input + x

def compute3(self, input):
''' dconv-linear-relu(norm)-linear
'''
x = th.unsqueeze(input, 1).permute(0, 3, 2, 1)
y = F.pad(x, [0, 0, self.padding_left, 0])
out = self.conv1(y)
if self.skip_connect:
out = out + x
out = out.permute(0, 3, 2, 1).squeeze()
x = self.linear(out)
x = self.act(x)
x = self.norm(x)
x = self.project(x)

return input + x

def to_kaldi_nnet(self):
re_str = ''
re_str += '<UniDeepFsmn> %d %d\n' \
% (self.output_dim, self.input_dim)
re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n' \
% (1, self.hidden_size, self.lorder, 1)
lfiters = self.state_dict()['conv1.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
re_str += to_kaldi_matrix(x)
proj_weights = self.state_dict()['project.weight']
x = proj_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
return re_str

def to_raw_nnet(self, fid):
lfiters = self.state_dict()['conv1.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
x.tofile(fid)

proj_weights = self.state_dict()['project.weight']
x = proj_weights.squeeze().numpy()
x.tofile(fid)

linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
x.tofile(fid)

linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
x.tofile(fid)

def load_kaldi_nnet(self, instr):
output = expect_token_number(
instr,
'<LearnRateCoef>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
instr, lr = output

output = expect_token_number(
instr,
'<HidSize>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <HidSize>')
instr, hiddensize = output
self.hidden_size = int(hiddensize)

output = expect_token_number(
instr,
'<LOrder>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LOrder>')
instr, lorder = output
self.lorder = int(lorder)

output = expect_token_number(
instr,
'<LStride>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <LStride>')
instr, lstride = output
self.lstride = lstride

output = expect_token_number(
instr,
'<MaxNorm>',
)
if output is None:
raise Exception('UniDeepFsmn format error for <MaxNorm>')

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
mat1 = np.fliplr(mat.T).copy()

self.conv1 = nn.Conv2d(
self.output_dim,
self.output_dim, [self.lorder, 1], [1, 1],
groups=self.output_dim,
bias=False)

mat_th = th.from_numpy(mat1).type(th.FloatTensor)
mat_th = mat_th.unsqueeze(1)
mat_th = mat_th.unsqueeze(3)
self.conv1.weight = th.nn.Parameter(mat_th)

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output

self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
self.linear = nn.Linear(self.input_dim, self.hidden_size)

self.project.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
self.linear.weight = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

output = expect_kaldi_matrix(instr)
if output is None:
raise Exception('UniDeepFsmn format error for parsing matrix')
instr, mat = output
mat = np.squeeze(mat)
self.linear.bias = th.nn.Parameter(
th.from_numpy(mat).type(th.FloatTensor))

return instr
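
`UniDeepFsmn` is the causal (uni-directional) variant: `compute1` pads `dilation * (lorder - 1)` frames on the left only, so no future context is used. A shape sketch under the same import-path assumption:

```python
# Sketch: a causal UniDeepFsmn block preserves [batch, frames, features].
import torch as th
from modelscope.models.audio.layers.uni_deep_fsmn import UniDeepFsmn

layer = UniDeepFsmn(input_dim=64, output_dim=64, lorder=20, hidden_size=128)
x = th.randn(2, 100, 64)
y = layer(x)
assert y.shape == x.shape
```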

modelscope/models/audio/network/__init__.py (+0, -0)


modelscope/models/audio/network/loss.py (+394, -0)

@@ -0,0 +1,394 @@
import torch
import torch.nn.functional as F

from .modulation_loss import (GaborSTRFConv, MelScale,
ModulationDomainLossModule)

EPS = 1e-8


def compute_mask(mixed_spec, clean_spec, mask_type='psmiam', clip=1):
'''Compute a time-frequency mask from mixed and clean STFTs.

stft inputs: (batch, ..., 2) real/imag pairs or complex (batch, ...)
signal model: y = x + n (mixed = clean + noise)
'''
if torch.is_complex(mixed_spec):
yr, yi = mixed_spec.real, mixed_spec.imag
else:
yr, yi = mixed_spec[..., 0], mixed_spec[..., 1]
if torch.is_complex(clean_spec):
xr, xi = clean_spec.real, clean_spec.imag
else:
xr, xi = clean_spec[..., 0], clean_spec[..., 1]

if mask_type == 'iam':
ymag = torch.sqrt(yr**2 + yi**2)
xmag = torch.sqrt(xr**2 + xi**2)
iam = xmag / (ymag + EPS)
return torch.clamp(iam, 0, 1)

elif mask_type == 'psm':
ypow = yr**2 + yi**2
psm = (xr * yr + xi * yi) / (ypow + EPS)
return torch.clamp(psm, 0, 1)

elif mask_type == 'psmiam':
ypow = yr**2 + yi**2
psm = (xr * yr + xi * yi) / (ypow + EPS)
ymag = torch.sqrt(yr**2 + yi**2)
xmag = torch.sqrt(xr**2 + xi**2)
iam = xmag / (ymag + EPS)
psmiam = psm * iam
return torch.clamp(psmiam, 0, 1)

elif mask_type == 'crm':
ypow = yr**2 + yi**2
mr = (xr * yr + xi * yi) / (ypow + EPS)
mi = (xi * yr - xr * yi) / (ypow + EPS)
mr = torch.clamp(mr, -clip, clip)
mi = torch.clamp(mi, -clip, clip)
return mr, mi


def energy_vad(spec,
thdhigh=320 * 600 * 600 * 2,
thdlow=320 * 300 * 300 * 2,
int16=True):
'''
energy based vad should be accurate enough
spec: (batch, bins, frames, 2)
returns (batch, frames)
'''
energy = torch.sum(spec[..., 0]**2 + spec[..., 1]**2, dim=1)
vad = (energy > thdhigh).float()  # cast to float so the 0.5 soft label below is preserved
idx = torch.logical_and(vad == 0, energy > thdlow)
vad[idx] = 0.5
return vad


def modulation_loss_init(n_fft):
gabor_strf_parameters = torch.load(
'./network/gabor_strf_parameters.pt')['state_dict']
gabor_modulation_kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
gabor_modulation_kernels.load_state_dict(gabor_strf_parameters)

modulation_loss_module = ModulationDomainLossModule(
gabor_modulation_kernels.eval())
for param in modulation_loss_module.parameters():
param.requires_grad = False

stft2mel = MelScale(
n_mels=80, sample_rate=16000, n_stft=n_fft // 2 + 1).cuda()

return modulation_loss_module, stft2mel


def mask_loss_function(
loss_func='psm_loss',
loss_type='mse', # ['mse', 'mae', 'comb']
mask_type='psmiam',
use_mod_loss=False,
use_wav2vec_loss=False,
n_fft=640,
hop_length=320,
EPS=1e-8,
weight=None):
if weight is not None:
print(f'Use loss weight: {weight}')
winlen = n_fft
window = torch.hamming_window(winlen, periodic=False)

def stft(x, return_complex=False):
# returns [batch, bins, frames, 2]
return torch.stft(
x,
n_fft,
hop_length,
winlen,
window=window.to(x.device),
center=False,
return_complex=return_complex)

def istft(x, slen):
return torch.istft(
x,
n_fft,
hop_length,
winlen,
window=window.to(x.device),
center=False,
length=slen)

def mask_loss(targets, masks, nframes):
''' [Batch, Time, Frequency]
'''
with torch.no_grad():
mask_for_loss = torch.ones_like(targets)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks = masks * mask_for_loss
targets = targets * mask_for_loss

if weight is None:
alpha = 1
else: # for aec ST
alpha = weight - targets

if loss_type == 'mse':
loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2))
elif loss_type == 'mae':
loss = torch.sum(alpha * torch.abs(targets - masks))
else: # mse(mask), mae(mask) approx 1:2
loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2)
+ 0.1 * alpha * torch.abs(targets - masks))
loss /= torch.sum(nframes)
return loss

def spectrum_loss(targets, spec, nframes):
''' [Batch, Time, Frequency, 2]
'''
with torch.no_grad():
mask_for_loss = torch.ones_like(targets[..., 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
xr = spec[..., 0] * mask_for_loss
xi = spec[..., 1] * mask_for_loss
yr = targets[..., 0] * mask_for_loss
yi = targets[..., 1] * mask_for_loss
xmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2) * mask_for_loss
ymag = torch.sqrt(targets[..., 0]**2
+ targets[..., 1]**2) * mask_for_loss

loss1 = torch.sum(torch.pow(xr - yr, 2) + torch.pow(xi - yi, 2))
loss2 = torch.sum(torch.pow(xmag - ymag, 2))

loss = (loss1 + loss2) / torch.sum(nframes)
return loss

def sa_loss_dlen(mixed, clean, masks, nframes):
yspec = stft(mixed).permute([0, 2, 1, 3]) / 32768
xspec = stft(clean).permute([0, 2, 1, 3]) / 32768
with torch.no_grad():
mask_for_loss = torch.ones_like(xspec[..., 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
emag = ((yspec[..., 0]**2 + yspec[..., 1]**2)**0.15) * (masks**0.3)
xmag = (xspec[..., 0]**2 + xspec[..., 1]**2)**0.15
emag = emag * mask_for_loss
xmag = xmag * mask_for_loss

loss = torch.sum(torch.pow(emag - xmag, 2)) / torch.sum(nframes)
return loss

def psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed)
clean_spec = stft(clean)
targets = compute_mask(mixed_spec, clean_spec, mask_type)
# [B, T, F]
targets = targets.permute(0, 2, 1)

loss = mask_loss(targets, masks, nframes)

if subtask is not None:
vadtargets = energy_vad(clean_spec)
with torch.no_grad():
mask_for_loss = torch.ones_like(targets[:, :, 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:] = 0
subtask = subtask[:, :, 0] * mask_for_loss
vadtargets = vadtargets * mask_for_loss

loss_vad = F.binary_cross_entropy(subtask, vadtargets)
return loss + loss_vad
return loss

def modulation_loss(mixed, clean, masks, nframes, subtask=None):
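# NOTE: uses modulation_loss_module and stft2mel from modulation_loss_init();
# they are not created inside this factory.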
mixed_spec = stft(mixed, True)
clean_spec = stft(clean, True)
enhanced_mag = torch.abs(mixed_spec)
clean_mag = torch.abs(clean_spec)
with torch.no_grad():
mask_for_loss = torch.ones_like(clean_mag)
for idx, num in enumerate(nframes):
mask_for_loss[idx, :, num:] = 0
clean_mag = clean_mag * mask_for_loss
enhanced_mag = enhanced_mag * mask_for_loss * masks.permute([0, 2, 1])

# Covert to log-mel representation
# (B,T,#mel_channels)
clean_log_mel = torch.log(
torch.transpose(stft2mel(clean_mag**2), 2, 1) + 1e-8)
enhanced_log_mel = torch.log(
torch.transpose(stft2mel(enhanced_mag**2), 2, 1) + 1e-8)

alpha = compute_mask(mixed_spec, clean_spec, mask_type)
alpha = alpha.permute(0, 2, 1)
loss = 0.05 * modulation_loss_module(enhanced_log_mel, clean_log_mel,
alpha)
loss2 = psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask)
# print(loss.item(), loss2.item()) #approx 1:4
loss = loss + loss2
return loss

def wav2vec_loss(mixed, clean, masks, nframes, subtask=None):
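# NOTE: wav2vec_loss_module is assumed to be provided globally;
# it is not defined in this file.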
mixed /= 32768
clean /= 32768
mixed_spec = stft(mixed)
with torch.no_grad():
mask_for_loss = torch.ones_like(masks)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks_est = masks * mask_for_loss

estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
est_clean = istft(estimate, clean.shape[1])
loss = wav2vec_loss_module(est_clean, clean)
return loss

def sisdr_loss_dlen(mixed,
clean,
masks,
nframes,
subtask=None,
zero_mean=True):
mixed_spec = stft(mixed)
with torch.no_grad():
mask_for_loss = torch.ones_like(masks)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks_est = masks * mask_for_loss

estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
est_clean = istft(estimate, clean.shape[1])
flen = min(clean.shape[1], est_clean.shape[1])
clean = clean[:, :flen]
est_clean = est_clean[:, :flen]

# follow asteroid/losses/sdr.py
if zero_mean:
clean = clean - torch.mean(clean, dim=1, keepdim=True)
est_clean = est_clean - torch.mean(est_clean, dim=1, keepdim=True)

dot = torch.sum(est_clean * clean, dim=1, keepdim=True)
s_clean_energy = torch.sum(clean**2, dim=1, keepdim=True) + EPS
scaled_clean = dot * clean / s_clean_energy
e_noise = est_clean - scaled_clean

# [batch]
sisdr = torch.sum(
scaled_clean**2, dim=1) / (
torch.sum(e_noise**2, dim=1) + EPS)
sisdr = -10 * torch.log10(sisdr + EPS)
loss = sisdr.mean()
return loss

def sisdr_freq_loss_dlen(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed)
clean_spec = stft(clean)
with torch.no_grad():
mask_for_loss = torch.ones_like(masks)
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
masks_est = masks * mask_for_loss

estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)

dot_real = estimate[..., 0] * clean_spec[..., 0] + \
estimate[..., 1] * clean_spec[..., 1]
dot_imag = estimate[..., 0] * clean_spec[..., 1] - \
estimate[..., 1] * clean_spec[..., 0]
dot = torch.cat([dot_real.unsqueeze(3), dot_imag.unsqueeze(3)], dim=-1)
s_clean_energy = clean_spec[..., 0] ** 2 + \
clean_spec[..., 1] ** 2 + EPS
scaled_clean = dot * clean_spec / s_clean_energy.unsqueeze(3)
e_noise = estimate - scaled_clean

# [batch]
scaled_clean_energy = torch.sum(
scaled_clean[..., 0]**2 + scaled_clean[..., 1]**2, dim=1)
e_noise_energy = torch.sum(
e_noise[..., 0]**2 + e_noise[..., 1]**2, dim=1)
sisdr = torch.sum(
scaled_clean_energy, dim=1) / (
torch.sum(e_noise_energy, dim=1) + EPS)
sisdr = -10 * torch.log10(sisdr + EPS)
loss = sisdr.mean()
return loss

def crm_loss_dlen(mixed, clean, masks, nframes, subtask=None):
mixed_spec = stft(mixed).permute([0, 2, 1, 3])
clean_spec = stft(clean).permute([0, 2, 1, 3])
mixed_spec = mixed_spec / 32768
clean_spec = clean_spec / 32768
tgt_mr, tgt_mi = compute_mask(mixed_spec, clean_spec, mask_type='crm')

D = int(masks.shape[2] / 2)
with torch.no_grad():
mask_for_loss = torch.ones_like(clean_spec[..., 0])
for idx, num in enumerate(nframes):
mask_for_loss[idx, num:, :] = 0
mr = masks[..., :D] * mask_for_loss
mi = masks[..., D:] * mask_for_loss
tgt_mr = tgt_mr * mask_for_loss
tgt_mi = tgt_mi * mask_for_loss

if weight is None:
alpha = 1
else:
alpha = weight - tgt_mr
# signal approximation
yr = mixed_spec[..., 0]
yi = mixed_spec[..., 1]
loss1 = torch.sum(alpha * torch.pow((mr * yr - mi * yi) - clean_spec[..., 0], 2)) \
+ torch.sum(alpha * torch.pow((mr * yi + mi * yr) - clean_spec[..., 1], 2))
# mask approximation
loss2 = torch.sum(alpha * torch.pow(mr - tgt_mr, 2)) \
+ torch.sum(alpha * torch.pow(mi - tgt_mi, 2))
loss = 0.5 * (loss1 + loss2) / torch.sum(nframes)
return loss

def crm_miso_loss_dlen(mixed, clean, masks, nframes):
return crm_loss_dlen(mixed[..., 0], clean[..., 0], masks, nframes)

def mimo_loss_dlen(mixed, clean, masks, nframes):
chs = mixed.shape[-1]
D = masks.shape[2] // chs
loss = psm_vad_loss_dlen(mixed[..., 0], clean[..., 0], masks[..., :D],
nframes)
for ch in range(1, chs):
loss1 = psm_vad_loss_dlen(mixed[..., ch], clean[..., ch],
masks[..., ch * D:ch * D + D], nframes)
loss = loss + loss1
return loss / chs

def spec_loss_dlen(mixed, clean, spec, nframes):
clean_spec = stft(clean).permute([0, 2, 1, 3])
clean_spec = clean_spec / 32768

D = spec.shape[2] // 2
spec_est = torch.cat([spec[..., :D, None], spec[..., D:, None]],
dim=-1)
loss = spectrum_loss(clean_spec, spec_est, nframes)
return loss

if loss_func == 'psm_vad_loss_dlen':
return psm_vad_loss_dlen
elif loss_func == 'sisdr_loss_dlen':
return sisdr_loss_dlen
elif loss_func == 'sisdr_freq_loss_dlen':
return sisdr_freq_loss_dlen
elif loss_func == 'crm_loss_dlen':
return crm_loss_dlen
elif loss_func == 'modulation_loss':
return modulation_loss
elif loss_func == 'wav2vec_loss':
return wav2vec_loss
elif loss_func == 'mimo_loss_dlen':
return mimo_loss_dlen
elif loss_func == 'spec_loss_dlen':
return spec_loss_dlen
elif loss_func == 'sa_loss_dlen':
return sa_loss_dlen
else:
print('unknown loss function: %s' % loss_func)
return None
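
`mask_loss_function` is a factory: it closes over the STFT settings and returns the selected loss callable. A hedged sketch of driving the PSM-based mask loss on int16-scale waveforms (as the `/ 32768` normalizations above suggest), under this commit's module layout and PyTorch 1.10-era `torch.stft` behavior:

```python
# Sketch: build and evaluate the PSM mask loss on random int16-scale audio.
import torch
from modelscope.models.audio.network.loss import mask_loss_function

loss_fn = mask_loss_function(loss_func='psm_vad_loss_dlen',
                             mask_type='psmiam', n_fft=640, hop_length=320)
mixed = 1000.0 * torch.randn(2, 16000)   # [batch, samples]
clean = 0.5 * mixed
masks = torch.rand(2, 49, 321)           # [batch, frames, bins]; 49 = (16000 - 640) // 320 + 1
nframes = torch.tensor([49, 40])         # valid frames per utterance
print(loss_fn(mixed, clean, masks, nframes))
```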

modelscope/models/audio/network/modulation_loss.py (+248, -0)

@@ -0,0 +1,248 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.transforms import MelScale


class ModulationDomainLossModule(torch.nn.Module):
"""Modulation-domain loss function developed in [1] for supervised speech enhancement

In our paper, we used the gabor-based STRF kernels as the modulation kernels and used the log-mel spectrogram
as the input spectrogram representation.
Specific parameter details are in the paper and in the example below

Parameters
----------
modulation_kernels: nn.Module
Differentiable module that transforms a spectrogram representation to the modulation domain

modulation_domain = modulation_kernels(input_tf_representation)
Input Spectrogram representation (B, T, F) ---> |(M) modulation_kernels|--->Modulation Domain(B, M, T', F')

norm: boolean
Normalizes the modulation domain representation to be 0 mean across time

[1] T. Vuong, Y. Xia, and R. M. Stern, “A modulation-domain loss for neural-network-based real-time
speech enhancement”
Accepted at ICASSP 2021, https://arxiv.org/abs/2102.07330


"""

def __init__(self, modulation_kernels, norm=True):
super(ModulationDomainLossModule, self).__init__()

self.modulation_kernels = modulation_kernels
self.mse = nn.MSELoss(reduction='none')  # elementwise loss
self.norm = norm

def forward(self, enhanced_spect, clean_spect, weight=None):
"""Calculate modulation-domain loss
Args:
enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels).
clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels).
Returns:
Tensor: Modulation-domain loss value.
"""

clean_mod = self.modulation_kernels(clean_spect)
enhanced_mod = self.modulation_kernels(enhanced_spect)

if self.norm:
mean_clean_mod = torch.mean(clean_mod, dim=2)
mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

clean_mod = clean_mod - mean_clean_mod.unsqueeze(2)
enhanced_mod = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

if weight is None:
alpha = 1
else: # TF-mask weight
alpha = 1 + torch.sum(weight, dim=-1, keepdim=True).unsqueeze(1)
mod_mse_loss = self.mse(enhanced_mod, clean_mod) * alpha
mod_mse_loss = torch.mean(
torch.sum(mod_mse_loss, dim=(1, 2, 3))
/ torch.sum(clean_mod**2, dim=(1, 2, 3)))

return mod_mse_loss


class ModulationDomainNCCLossModule(torch.nn.Module):
"""Modulation-domain loss function developed in [1] for supervised speech enhancement

# Speech Intelligibility Prediction Using Spectro-Temporal Modulation Analysis - based on this

In our paper, we used the gabor-based STRF kernels as the modulation kernels and used the log-mel spectrogram
as the input spectrogram representation.
Specific parameter details are in the paper and in the example below

Parameters
----------
modulation_kernels: nn.Module
Differentiable module that transforms a spectrogram representation to the modulation domain

modulation_domain = modulation_kernels(input_tf_representation)
Input Spectrogram representation(B, T, F) --- (M) modulation_kernels---> Modulation Domain(B, M, T', F')

[1]

"""

def __init__(self, modulation_kernels):
super(ModulationDomainNCCLossModule, self).__init__()

self.modulation_kernels = modulation_kernels
self.mse = nn.MSELoss(reduction='none')  # elementwise loss

def forward(self, enhanced_spect, clean_spect):
"""Calculate modulation-domain loss
Args:
enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels).
clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels).
Returns:
Tensor: Modulation-domain loss value.
"""

clean_mod = self.modulation_kernels(clean_spect)
enhanced_mod = self.modulation_kernels(enhanced_spect)
mean_clean_mod = torch.mean(clean_mod, dim=2)
mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)

normalized_clean = clean_mod - mean_clean_mod.unsqueeze(2)
normalized_enhanced = enhanced_mod - mean_enhanced_mod.unsqueeze(2)

inner_product = torch.sum(
normalized_clean * normalized_enhanced, dim=2)
normalized_denom = (torch.sum(
normalized_clean * normalized_clean, dim=2))**.5 * (torch.sum(
normalized_enhanced * normalized_enhanced, dim=2))**.5

ncc = inner_product / normalized_denom
mod_mse_loss = torch.mean((ncc - 1.0)**2)

return mod_mse_loss


class GaborSTRFConv(nn.Module):
"""Gabor-STRF-based cross-correlation kernel."""

def __init__(self,
supn,
supk,
nkern,
rates=None,
scales=None,
norm_strf=True,
real_only=False):
"""Instantiate a Gabor-based STRF convolution layer.
Parameters
----------
supn: int
Time support in number of frames. Also the window length.
supk: int
Frequency support in number of channels. Also the window length.
nkern: int
Number of kernels, each with a learnable rate and scale.
rates: list of float, None
Initial values for temporal modulation.
scales: list of float, None
Initial values for spectral modulation.
norm_strf: Boolean
Normalize STRF kernels to be unit length
real_only: Boolean
If True, nkern REAL gabor-STRF kernels
If False, nkern//2 REAL and nkern//2 IMAGINARY gabor-STRF kernels
"""
super(GaborSTRFConv, self).__init__()
self.numN = supn
self.numK = supk
self.numKern = nkern
self.real_only = real_only
self.norm_strf = norm_strf

if not real_only:
nkern = nkern // 2

if supk % 2 == 0: # force odd number
supk += 1
self.supk = torch.arange(supk, dtype=torch.float32)
if supn % 2 == 0: # force odd number
supn += 1
self.supn = torch.arange(supn, dtype=self.supk.dtype)
self.padding = (supn // 2, supk // 2)
# Set up learnable parameters
# for param in (rates, scales):
# assert (not param) or len(param) == nkern
if not rates:
rates = torch.rand(nkern) * math.pi / 2.0
if not scales:
scales = (torch.rand(nkern) * 2.0 - 1.0) * math.pi / 2.0

self.rates_ = nn.Parameter(torch.Tensor(rates))
self.scales_ = nn.Parameter(torch.Tensor(scales))

def strfs(self):
"""Make STRFs using the current parameters."""

if self.supn.device != self.rates_.device: # for first run
self.supn = self.supn.to(self.rates_.device)
self.supk = self.supk.to(self.rates_.device)
n0, k0 = self.padding

nwind = .5 - .5 * \
torch.cos(2 * math.pi * (self.supn + 1) / (len(self.supn) + 1))
kwind = .5 - .5 * \
torch.cos(2 * math.pi * (self.supk + 1) / (len(self.supk) + 1))

new_wind = torch.matmul((nwind).unsqueeze(-1), (kwind).unsqueeze(0))

n_n_0 = self.supn - n0
k_k_0 = self.supk - k0
n_mult = torch.matmul(
n_n_0.unsqueeze(1),
torch.ones((1, len(self.supk))).type(torch.FloatTensor).to(
self.rates_.device))
k_mult = torch.matmul(
torch.ones((len(self.supn),
1)).type(torch.FloatTensor).to(self.rates_.device),
k_k_0.unsqueeze(0))

inside = self.rates_.unsqueeze(1).unsqueeze(
1) * n_mult + self.scales_.unsqueeze(1).unsqueeze(1) * k_mult
real_strf = torch.cos(inside) * new_wind.unsqueeze(0)

if self.real_only:
final_strf = real_strf

else:
imag_strf = torch.sin(inside) * new_wind.unsqueeze(0)
final_strf = torch.cat([real_strf, imag_strf], dim=0)

if self.norm_strf:
final_strf = final_strf / (torch.sum(
final_strf**2, dim=(1, 2)).unsqueeze(1).unsqueeze(2))**.5

return final_strf

def forward(self, sigspec):
"""Forward pass a batch of (real) spectra [Batch x Time x Frequency]."""
if len(sigspec.shape) == 2: # expand batch dimension if single eg
sigspec = sigspec.unsqueeze(0)
strfs = self.strfs().unsqueeze(1).type_as(sigspec)
out = F.conv2d(sigspec.unsqueeze(1), strfs, padding=self.padding)
return out

def __repr__(self):
"""Gabor filter"""
report = """
+++++ Gabor Filter Kernels [{}], supn[{}], supk[{}] real only [{}] norm strf [{}] +++++

""".format(self.numKern, self.numN, self.numK, self.real_only,
self.norm_strf)

return report
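
A sketch of the modulation-domain loss on log-mel-like inputs, following the docstring's (B, #frames, #freq_channels) convention; the kernel sizes mirror `modulation_loss_init` in loss.py, and the import path is assumed from this commit:

```python
# Sketch: Gabor-STRF modulation-domain loss between clean and enhanced spectrograms.
import torch
from modelscope.models.audio.network.modulation_loss import (
    GaborSTRFConv, ModulationDomainLossModule)

kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
loss_module = ModulationDomainLossModule(kernels.eval())
clean = torch.randn(2, 100, 80)                  # (B, T, #mel_channels)
enhanced = clean + 0.1 * torch.randn_like(clean)
print(loss_module(enhanced, clean))              # scalar tensor
```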

modelscope/models/audio/network/se_net.py (+483, -0)

@@ -0,0 +1,483 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..layers.activations import RectifiedLinear, Sigmoid
from ..layers.affine_transform import AffineTransform
from ..layers.deep_fsmn import DeepFsmn
from ..layers.uni_deep_fsmn import Conv2d, UniDeepFsmn


class MaskNet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
hidden_dim=128,
hidden_dim2=None,
lorder=20,
rorder=0,
dilation=1,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(MaskNet, self).__init__()

self.linear1 = AffineTransform(indim, hidden_dim)
self.relu = RectifiedLinear(hidden_dim, hidden_dim)
if hidden_dim2 is None:
hidden_dim2 = hidden_dim

if rorder == 0:
repeats = [
UniDeepFsmn(
hidden_dim,
hidden_dim,
lorder,
hidden_dim2,
dilation=dilation,
layer_norm=layer_norm,
dropout=dropout) for i in range(layers)
]
else:
repeats = [
DeepFsmn(
hidden_dim,
hidden_dim,
lorder,
rorder,
hidden_dim2,
layer_norm=layer_norm,
dropout=dropout) for i in range(layers)
]
self.deepfsmn = nn.Sequential(*repeats)

self.linear2 = AffineTransform(hidden_dim, outdim)

self.crm = crm
if self.crm:
self.sig = nn.Tanh()
else:
self.sig = Sigmoid(outdim, outdim)

self.vad = vad
if self.vad:
self.linear3 = AffineTransform(hidden_dim, 1)

self.layers = layers
self.linearout = linearout
if self.linearout and self.vad:
print('Warning: linearout together with vad is not supported')

def forward(self, feat, ctl=None):
x1 = self.linear1(feat)
x2 = self.relu(x1)
if ctl is not None:
ctl = min(ctl, self.layers - 1)
for i in range(ctl):
x2 = self.deepfsmn[i](x2)
mask = self.sig(self.linear2(x2))
if self.vad:
vad = torch.sigmoid(self.linear3(x2))
return mask, vad
else:
return mask
x3 = self.deepfsmn(x2)
if self.linearout:
return self.linear2(x3)
mask = self.sig(self.linear2(x3))
if self.vad:
vad = torch.sigmoid(self.linear3(x3))
return mask, vad
else:
return mask

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Nnet>\n'
re_str += self.linear1.to_kaldi_nnet()
re_str += self.relu.to_kaldi_nnet()
for dfsmn in self.deepfsmn:
re_str += dfsmn.to_kaldi_nnet()
re_str += self.linear2.to_kaldi_nnet()
re_str += self.sig.to_kaldi_nnet()
re_str += '</Nnet>\n'

return re_str

def to_raw_nnet(self, fid):
self.linear1.to_raw_nnet(fid)
for dfsmn in self.deepfsmn:
dfsmn.to_raw_nnet(fid)
self.linear2.to_raw_nnet(fid)


class StageNet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
layers2=6,
hidden_dim=128,
lorder=20,
rorder=0,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(StageNet, self).__init__()

self.stage1 = nn.ModuleList()
self.stage2 = nn.ModuleList()
layer = nn.Sequential(nn.Linear(indim, hidden_dim), nn.ReLU())
self.stage1.append(layer)
for i in range(layers):
layer = UniDeepFsmn(
hidden_dim,
hidden_dim,
lorder,
hidden_dim,
layer_norm=layer_norm,
dropout=dropout)
self.stage1.append(layer)
layer = nn.Sequential(nn.Linear(hidden_dim, 321), nn.Sigmoid())
self.stage1.append(layer)
# stage2
layer = nn.Sequential(nn.Linear(321 + indim, hidden_dim), nn.ReLU())
self.stage2.append(layer)
for i in range(layers2):
layer = UniDeepFsmn(
hidden_dim,
hidden_dim,
lorder,
hidden_dim,
layer_norm=layer_norm,
dropout=dropout)
self.stage2.append(layer)
layer = nn.Sequential(
nn.Linear(hidden_dim, outdim),
nn.Sigmoid() if not crm else nn.Tanh())
self.stage2.append(layer)
self.crm = crm
self.vad = vad
self.linearout = linearout
# NOTE: the window is created on GPU, so this module assumes CUDA is available
self.window = torch.hamming_window(640, periodic=False).cuda()
self.frozen = False

def freeze(self):
if not self.frozen:
for param in self.stage1.parameters():
param.requires_grad = False
self.frozen = True
print('froze stage1')

def forward(self, feat, mixture, ctl=None):
if ctl == 'off':
x = feat
for i in range(len(self.stage1)):
x = self.stage1[i](x)
return x
else:
self.freeze()
x = feat
for i in range(len(self.stage1)):
x = self.stage1[i](x)

spec = torch.stft(
mixture / 32768,
640,
320,
640,
self.window,
center=False,
return_complex=True)
spec = torch.view_as_real(spec).permute([0, 2, 1, 3])
specmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2)
est = x * specmag
y = torch.cat([est, feat], dim=-1)
for i in range(len(self.stage2)):
y = self.stage2[i](y)
return y


class Unet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
dims=[256] * 4,
lorder=20,
rorder=0,
dilation=1,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(Unet, self).__init__()

self.linear1 = AffineTransform(indim, dims[0])
self.relu = RectifiedLinear(dims[0], dims[0])

self.encoder = nn.ModuleList()
self.decoder = nn.ModuleList()
for i in range(len(dims) - 1):
layer = nn.Sequential(
nn.Linear(dims[i], dims[i + 1]), nn.ReLU(),
nn.Linear(dims[i + 1], dims[i + 1], bias=False),
Conv2d(
dims[i + 1],
dims[i + 1],
lorder,
groups=dims[i + 1],
skip_connect=True))
self.encoder.append(layer)
for i in range(len(dims) - 1, 0, -1):
layer = nn.Sequential(
nn.Linear(dims[i] * 2, dims[i - 1]), nn.ReLU(),
nn.Linear(dims[i - 1], dims[i - 1], bias=False),
Conv2d(
dims[i - 1],
dims[i - 1],
lorder,
groups=dims[i - 1],
skip_connect=True))
self.decoder.append(layer)
self.tf = nn.ModuleList()
for i in range(layers - 2 * (len(dims) - 1)):
layer = nn.Sequential(
nn.Linear(dims[-1], dims[-1]), nn.ReLU(),
nn.Linear(dims[-1], dims[-1], bias=False),
Conv2d(
dims[-1],
dims[-1],
lorder,
groups=dims[-1],
skip_connect=True))
self.tf.append(layer)

self.linear2 = AffineTransform(dims[0], outdim)
self.crm = crm
self.act = nn.Tanh() if self.crm else nn.Sigmoid()
self.vad = False  # the vad argument is ignored; Unet has no vad head
self.layers = layers
self.linearout = linearout

def forward(self, x, ctl=None):
x = self.linear1(x)
x = self.relu(x)

encoder_out = []
for i in range(len(self.encoder)):
x = self.encoder[i](x)
encoder_out.append(x)
for i in range(len(self.tf)):
x = self.tf[i](x)
for i in range(len(self.decoder)):
x = torch.cat([x, encoder_out[-1 - i]], dim=-1)
x = self.decoder[i](x)

x = self.linear2(x)
if self.linearout:
return x
return self.act(x)


class BranchNet(nn.Module):

def __init__(self,
indim,
outdim,
layers=9,
hidden_dim=256,
lorder=20,
rorder=0,
dilation=1,
layer_norm=False,
dropout=0,
crm=False,
vad=False,
linearout=False):
super(BranchNet, self).__init__()

self.linear1 = AffineTransform(indim, hidden_dim)
self.relu = RectifiedLinear(hidden_dim, hidden_dim)

self.convs = nn.ModuleList()
self.deepfsmn = nn.ModuleList()
self.FREQ = nn.ModuleList()
self.TIME = nn.ModuleList()
self.br1 = nn.ModuleList()
self.br2 = nn.ModuleList()
for i in range(layers):
'''
layer = nn.Sequential(
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim, bias=False),
Conv2d(hidden_dim, hidden_dim, lorder,
groups=hidden_dim, skip_connect=True)
)
self.deepfsmn.append(layer)
'''
layer = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU())
self.FREQ.append(layer)
'''
layer = nn.GRU(hidden_dim, hidden_dim,
batch_first=True,
bidirectional=False)
self.TIME.append(layer)

layer = nn.Sequential(
nn.Linear(hidden_dim, hidden_dim//2, bias=False),
Conv2d(hidden_dim//2, hidden_dim//2, lorder,
groups=hidden_dim//2, skip_connect=True)
)
self.br1.append(layer)
layer = nn.GRU(hidden_dim, hidden_dim//2,
batch_first=True,
bidirectional=False)
self.br2.append(layer)
'''

self.linear2 = AffineTransform(hidden_dim, outdim)
self.crm = crm
self.act = nn.Tanh() if self.crm else nn.Sigmoid()
self.vad = False  # the vad argument is ignored; BranchNet has no vad head
self.layers = layers
self.linearout = linearout

def forward(self, x, ctl=None):
return self.forward_branch(x)

def forward_sepconv(self, x):
x = torch.unsqueeze(x, 1)
for i in range(len(self.convs)):
x = self.convs[i](x)
x = F.relu(x)
B, C, H, W = x.shape
x = x.permute(0, 2, 1, 3)
x = torch.reshape(x, [B, H, C * W])
x = self.linear1(x)
x = self.relu(x)
for i in range(self.layers):
x = self.deepfsmn[i](x) + x
x = self.linear2(x)
return self.act(x)

def forward_branch(self, x):
x = self.linear1(x)
x = self.relu(x)
for i in range(self.layers):
z = self.FREQ[i](x)
x = z + x
x = self.linear2(x)
if self.linearout:
return x
return self.act(x)


class TACNet(nn.Module):
'''Transform-average-concatenate (TAC) for ad hoc dr
'''

def __init__(self,
indim,
outdim,
layers=9,
hidden_dim=128,
lorder=20,
rorder=0,
crm=False,
vad=False,
linearout=False):
super(TACNet, self).__init__()

self.linear1 = AffineTransform(indim, hidden_dim)
self.relu = RectifiedLinear(hidden_dim, hidden_dim)

if rorder == 0:
repeats = [
UniDeepFsmn(hidden_dim, hidden_dim, lorder, hidden_dim)
for i in range(layers)
]
else:
repeats = [
DeepFsmn(hidden_dim, hidden_dim, lorder, rorder, hidden_dim)
for i in range(layers)
]
self.deepfsmn = nn.Sequential(*repeats)

self.ch_transform = nn.ModuleList([])
self.ch_average = nn.ModuleList([])
self.ch_concat = nn.ModuleList([])
for i in range(layers):
self.ch_transform.append(
nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
self.ch_average.append(
nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU()))
self.ch_concat.append(
nn.Sequential(
nn.Linear(hidden_dim * 2, hidden_dim), nn.PReLU()))

self.linear2 = AffineTransform(hidden_dim, outdim)

self.crm = crm
if self.crm:
self.sig = nn.Tanh()
else:
self.sig = Sigmoid(outdim, outdim)

self.vad = vad
if self.vad:
self.linear3 = AffineTransform(hidden_dim, 1)

self.layers = layers
self.linearout = linearout
if self.linearout and self.vad:
print('Warning: linearout together with vad is not a supported nnet configuration')

def forward(self, feat, ctl=None):
B, T, F = feat.shape
# assume 4ch
ch = 4
zlist = []
for c in range(ch):
z = self.linear1(feat[..., c * (F // 4):(c + 1) * (F // 4)])
z = self.relu(z)
zlist.append(z)
for i in range(self.layers):
# forward
for c in range(ch):
zlist[c] = self.deepfsmn[i](zlist[c])

# transform
olist = []
for c in range(ch):
z = self.ch_transform[i](zlist[c])
olist.append(z)
# average
avg = 0
for c in range(ch):
avg = avg + olist[c]
avg = avg / ch
avg = self.ch_average[i](avg)
# concatenate
for c in range(ch):
tac = torch.cat([olist[c], avg], dim=-1)
tac = self.ch_concat[i](tac)
zlist[c] = zlist[c] + tac

for c in range(ch):
zlist[c] = self.sig(self.linear2(zlist[c]))
mask = torch.cat(zlist, dim=-1)
return mask

def to_kaldi_nnet(self):
pass

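For orientation, here is a minimal sketch of driving MaskNet on random fbank-like features. It assumes, as the code above suggests, that AffineTransform and the FSMN layers operate on [batch, frames, dim] tensors; the 120/321 dimensions are illustrative, not values fixed by this file.

import torch

from modelscope.models.audio.network.se_net import MaskNet

# illustrative dimensions: 120-dim input feature, 321-dim mask output
net = MaskNet(indim=120, outdim=321, layers=9, hidden_dim=128, lorder=20)
feat = torch.randn(4, 100, 120)  # [batch, frames, feature]
mask = net(feat)  # sigmoid mask, expected shape [4, 100, 321]
print(mask.shape)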
+ 17
- 9
modelscope/models/base.py View File

@@ -2,14 +2,13 @@

import os.path as osp
from abc import ABC, abstractmethod
from typing import Dict, List, Tuple, Union
from typing import Dict, Union

from maas_hub.file_download import model_file_download
from maas_hub.snapshot_download import snapshot_download

from modelscope.models.builder import build_model
from modelscope.utils.config import Config
from modelscope.utils.constant import CONFIGFILE
from modelscope.utils.constant import ModelFile
from modelscope.utils.hub import get_model_cache_dir

Tensor = Union['torch.Tensor', 'tf.Tensor']
@@ -21,16 +20,24 @@ class Model(ABC):
self.model_dir = model_dir

def __call__(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
return self.post_process(self.forward(input))
return self.postprocess(self.forward(input))

@abstractmethod
def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
pass

def post_process(self, input: Dict[str, Tensor],
**kwargs) -> Dict[str, Tensor]:
# model specific postprocess, implementation is optional
# will be called in Pipeline and evaluation loop(in the future)
def postprocess(self, input: Dict[str, Tensor],
**kwargs) -> Dict[str, Tensor]:
""" Model specific postprocess and convert model output to
standard model outputs.

Args:
input: the input data

Return:
dict of results: a dict containing outputs of model, each
output should have the standard output name.
"""
return input

@classmethod
@@ -47,7 +54,8 @@ class Model(ABC):
# raise ValueError(
# 'Remote model repo {model_name_or_path} does not exists')

cfg = Config.from_file(osp.join(local_model_dir, CONFIGFILE))
cfg = Config.from_file(
osp.join(local_model_dir, ModelFile.CONFIGURATION))
task_name = cfg.task
model_cfg = cfg.model
# TODO @wenmeng.zwm: we may need to manually initialize the model after building it


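To illustrate the renamed hook, a minimal sketch of the Model contract: forward returns raw tensors and the optional postprocess maps them to standard output names. The class and keys below are hypothetical.

from typing import Dict

from modelscope.models.base import Model, Tensor


class DummyModel(Model):
    """Hypothetical model illustrating the forward/postprocess split."""

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
        # raw model outputs
        return {'logits': input['input_ids']}

    def postprocess(self, input: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]:
        # map raw outputs to the standard output names
        return {'predictions': input['logits']}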
+ 4
- 2
modelscope/models/nlp/__init__.py View File

@@ -1,4 +1,6 @@
from .sequence_classification_model import * # noqa F403
from .bert_for_sequence_classification import * # noqa F403
from .palm_for_text_generation import * # noqa F403
from .sbert_for_sentence_similarity import * # noqa F403
from .sbert_for_token_classification import * # noqa F403
from .space.dialog_intent_prediction_model import * # noqa F403
from .space.dialog_modeling_model import * # noqa F403
from .text_generation_model import * # noqa F403

modelscope/models/nlp/sequence_classification_model.py → modelscope/models/nlp/bert_for_sequence_classification.py View File

@@ -1,5 +1,7 @@
import os
from typing import Any, Dict

import json
import numpy as np

from modelscope.utils.constant import Tasks
@@ -34,6 +36,11 @@ class BertForSequenceClassification(Model):
('token_type_ids', torch.LongTensor)],
output_keys=['predictions', 'probabilities', 'logits'])

self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
with open(self.label_path) as f:
self.label_mapping = json.load(f)
self.id2label = {idx: name for name, idx in self.label_mapping.items()}

def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
"""return the result by the model

@@ -50,3 +57,13 @@ class BertForSequenceClassification(Model):
}
"""
return self.model.predict(input)

def postprocess(self, inputs: Dict[str, np.ndarray],
**kwargs) -> Dict[str, np.ndarray]:
# N x num_classes
probs = inputs['probabilities']
result = {
'probs': probs,
}

return result

+ 43
- 0
modelscope/models/nlp/palm_for_text_generation.py View File

@@ -0,0 +1,43 @@
from typing import Dict

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['PalmForTextGeneration']


@MODELS.register_module(Tasks.text_generation, module_name=r'palm2.0')
class PalmForTextGeneration(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the text generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir

from sofa.models.palm_v2 import PalmForConditionalGeneration, Translator
model = PalmForConditionalGeneration.from_pretrained(model_dir)
self.tokenizer = model.tokenizer
self.generator = Translator(model)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""return the result by the model

Args:
input (Dict[str, Tensor]): the preprocessed data

Returns:
Dict[str, Tensor]: results
Example:
{
'predictions': Tensor([[1377, 4959, 2785, 6392...]]), # token ids, to be decoded by the tokenizer
}
"""

return self.generator(**input)

+ 88
- 0
modelscope/models/nlp/sbert_for_sentence_similarity.py View File

@@ -0,0 +1,88 @@
import os
from typing import Any, Dict

import json
import numpy as np
import torch
from sofa import SbertModel
from sofa.models.sbert.modeling_sbert import SbertPreTrainedModel
from torch import nn

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['SbertForSentenceSimilarity']


class SbertTextClassifier(SbertPreTrainedModel):

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.encoder = SbertModel(config, add_pooling_layer=True)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

def forward(self, input_ids=None, token_type_ids=None):
outputs = self.encoder(
input_ids,
token_type_ids=token_type_ids,
return_dict=None,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
return logits


@MODELS.register_module(
Tasks.sentence_similarity,
module_name=r'sbert-base-chinese-sentence-similarity')
class SbertForSentenceSimilarity(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the sentence similarity model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir

self.model = SbertTextClassifier.from_pretrained(
model_dir, num_labels=2)
self.model.eval()
self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
with open(self.label_path) as f:
self.label_mapping = json.load(f)
self.id2label = {idx: name for name, idx in self.label_mapping.items()}

def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, np.ndarray]: results
Example:
{
'predictions': array([1]), # label 0-negative 1-positive
'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # raw logits
}
"""
input_ids = torch.tensor(input['input_ids'], dtype=torch.long)
token_type_ids = torch.tensor(
input['token_type_ids'], dtype=torch.long)
with torch.no_grad():
logits = self.model(input_ids, token_type_ids)
probs = logits.softmax(-1).numpy()
pred = logits.argmax(-1).numpy()
logits = logits.numpy()
res = {'predictions': pred, 'probabilities': probs, 'logits': logits}
return res

+ 56
- 0
modelscope/models/nlp/sbert_for_token_classification.py View File

@@ -0,0 +1,56 @@
from typing import Any, Dict, Union

import numpy as np
import torch
from sofa import SbertConfig, SbertForTokenClassification

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['StructBertForTokenClassification']


@MODELS.register_module(
Tasks.word_segmentation,
module_name=r'structbert-chinese-word-segmentation')
class StructBertForTokenClassification(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the word segmentation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir
self.model = SbertForTokenClassification.from_pretrained(
self.model_dir)
self.config = SbertConfig.from_pretrained(self.model_dir)

def forward(self, input: Dict[str,
Any]) -> Dict[str, Union[str, np.ndarray]]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, Union[str,np.ndarray]]: results
Example:
{
'predictions': array([1, 4]), # predicted label id per token
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32), # raw logits
'text': '今天',
}
"""
input_ids = torch.tensor(input['input_ids']).unsqueeze(0)
output = self.model(input_ids)
logits = output.logits
pred = torch.argmax(logits[0], dim=-1)
pred = pred.numpy()

rst = {'predictions': pred, 'logits': logits, 'text': input['text']}
return rst

+ 0
- 52
modelscope/models/nlp/text_generation_model.py View File

@@ -1,52 +0,0 @@
from typing import Any, Dict

from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['PalmForTextGenerationModel']


@MODELS.register_module(Tasks.text_generation, module_name=r'palm')
class PalmForTextGenerationModel(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the text generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
from sofa import PalmTokenizer

super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir

from sofa.models.palm import PalmForConditionalGeneration, TextGenerator
tokenizer = kwargs.pop('tokenizer',
PalmTokenizer.from_pretrained(model_dir))
model = PalmForConditionalGeneration.from_pretrained(model_dir)
self.generator = TextGenerator(model, tokenizer)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, np.ndarray]: results
Example:
{
'predictions': array([1]), # lable 0-negative 1-positive
'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value
}
"""

encoder_inputs = [
input['input_ids'], input['token_type_ids'],
input['attention_mask']
]
return self.generator(encoder_inputs)

+ 1
- 1
modelscope/pipelines/__init__.py View File

@@ -1,4 +1,4 @@
from .audio import * # noqa F403
from .audio import LinearAECPipeline
from .base import Pipeline
from .builder import pipeline
from .cv import * # noqa F403


+ 1
- 0
modelscope/pipelines/audio/__init__.py View File

@@ -0,0 +1 @@
from .linear_aec_pipeline import LinearAECPipeline

+ 160
- 0
modelscope/pipelines/audio/linear_aec_pipeline.py View File

@@ -0,0 +1,160 @@
import importlib
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
import yaml

from modelscope.preprocessors.audio import LinearAECAndFbank
from modelscope.utils.constant import ModelFile, Tasks
from ..base import Pipeline
from ..builder import PIPELINES

FEATURE_MVN = 'feature.DEY.mvn.txt'

CONFIG_YAML = 'dey_mini.yaml'


def initialize_config(module_cfg):
r"""According to config items, load specific module dynamically with params.
1. Load the module corresponding to the "module" param.
2. Call the function (or instantiate the class) corresponding to the "main" param.
3. Pass the params in "args" to the function (or class) when calling (or instantiating) it.

Args:
module_cfg (dict): config items, e.g.:
{
"module": "models.model",
"main": "Model",
"args": {...}
}

Returns:
the module loaded.
"""
module = importlib.import_module(module_cfg['module'])
return getattr(module, module_cfg['main'])(**module_cfg['args'])


@PIPELINES.register_module(
Tasks.speech_signal_process, module_name=r'speech_dfsmn_aec_psm_16k')
class LinearAECPipeline(Pipeline):
r"""AEC Inference Pipeline only support 16000 sample rate.

When invoke the class with pipeline.__call__(), you should provide two params:
Dict[str, Any]
the path of wav files,eg:{
"nearend_mic": "/your/data/near_end_mic_audio.wav",
"farend_speech": "/your/data/far_end_speech_audio.wav"}
output_path (str, optional): "/your/output/audio_after_aec.wav"
the file path to write generate audio.
"""

def __init__(self, model):
r"""
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model)
self.use_cuda = torch.cuda.is_available()
with open(
os.path.join(self.model, CONFIG_YAML), encoding='utf-8') as f:
self.config = yaml.full_load(f.read())
self.config['io']['mvn'] = os.path.join(self.model, FEATURE_MVN)
self._init_model()
self.preprocessor = LinearAECAndFbank(self.config['io'])

n_fft = self.config['loss']['args']['n_fft']
hop_length = self.config['loss']['args']['hop_length']
winlen = n_fft
window = torch.hamming_window(winlen, periodic=False)

def stft(x):
return torch.stft(
x,
n_fft,
hop_length,
winlen,
center=False,
window=window.to(x.device),
return_complex=False)

def istft(x, slen):
return torch.istft(
x,
n_fft,
hop_length,
winlen,
window=window.to(x.device),
center=False,
length=slen)

self.stft = stft
self.istft = istft

def _init_model(self):
checkpoint = torch.load(
os.path.join(self.model, ModelFile.TORCH_MODEL_BIN_FILE),
map_location='cpu')
self.model = initialize_config(self.config['nnet'])
if self.use_cuda:
self.model = self.model.cuda()
self.model.load_state_dict(checkpoint)

def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
r"""The AEC process.

Args:
inputs: dict={'feature': Tensor, 'base': Tensor}
'feature' feature of input audio.
'base' the base audio to mask.

Returns:
dict:
{
'output_pcm': generated audio array
}
"""
output_data = self._process(inputs['feature'], inputs['base'])
return {'output_pcm': output_data}

def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
r"""The post process. Will save audio to file, if the output_path is given.

Args:
inputs: dict:
{
'output_pcm': generated audio array
}
kwargs: accepts 'output_path', the file path to write the generated audio to

Returns:
dict:
{
'output_pcm': generated audio array
}
"""
if 'output_path' in kwargs.keys():
wav.write(kwargs['output_path'], self.preprocessor.SAMPLE_RATE,
inputs['output_pcm'].astype(np.int16))
inputs['output_pcm'] = inputs['output_pcm'] / 32768.0
return inputs

def _process(self, fbanks, mixture):
if self.use_cuda:
fbanks = fbanks.cuda()
mixture = mixture.cuda()
if self.model.vad:
with torch.no_grad():
masks, vad = self.model(fbanks.unsqueeze(0))
masks = masks.permute([2, 1, 0])
else:
with torch.no_grad():
masks = self.model(fbanks.unsqueeze(0))
masks = masks.permute([2, 1, 0])
spectrum = self.stft(mixture)
masked_spec = spectrum * masks
masked_sig = self.istft(masked_spec, len(mixture)).cpu().numpy()
return masked_sig

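A hedged usage sketch of this pipeline; the model id is an assumption based on the registered module name, and the wav paths are the placeholders from the docstring above.

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# hypothetical hub id matching the registered module name
aec = pipeline(Tasks.speech_signal_process, model='damo/speech_dfsmn_aec_psm_16k')
result = aec(
    {
        'nearend_mic': '/your/data/near_end_mic_audio.wav',
        'farend_speech': '/your/data/far_end_speech_audio.wav',
    },
    output_path='/your/output/audio_after_aec.wav')
print(result['output_pcm'].shape)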
+ 29
- 1
modelscope/pipelines/base.py View File

@@ -12,10 +12,11 @@ from modelscope.pydatasets import PyDataset
from modelscope.utils.config import Config
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.logger import get_logger
from .outputs import TASK_OUTPUTS
from .util import is_model_name

Tensor = Union['torch.Tensor', 'tf.Tensor']
Input = Union[str, PyDataset, Dict, 'PIL.Image.Image', 'numpy.ndarray']
Input = Union[str, tuple, dict, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
InputModel = Union[str, Model]

output_keys = [
@@ -106,8 +107,25 @@ class Pipeline(ABC):
out = self.preprocess(input, **post_kwargs)
out = self.forward(out)
out = self.postprocess(out, **post_kwargs)
self._check_output(out)
return out

def _check_output(self, input):
# this attribute is dynamically attached by registry
# when cls is registered in registry using task name
task_name = self.group_key
if task_name not in TASK_OUTPUTS:
logger.warning(f'task {task_name} output keys are missing')
return
output_keys = TASK_OUTPUTS[task_name]
missing_keys = []
for k in output_keys:
if k not in input:
missing_keys.append(k)
if len(missing_keys) > 0:
raise ValueError(f'expected output keys are {output_keys}, '
f'those {missing_keys} are missing')

def preprocess(self, inputs: Input) -> Dict[str, Any]:
""" Provide default implementation based on preprocess_cfg and user can reimplement it
"""
@@ -125,4 +143,14 @@ class Pipeline(ABC):

@abstractmethod
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
""" If current pipeline support model reuse, common postprocess
code should be write here.

Args:
inputs: input data

Return:
dict of results: a dict containing outputs of model, each
output should have the standard output name.
"""
raise NotImplementedError('postprocess')

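The contract enforced by _check_output, restated as a small runnable sketch against the TASK_OUTPUTS table added in this commit:

from modelscope.pipelines.outputs import TASK_OUTPUTS
from modelscope.utils.constant import Tasks

required = TASK_OUTPUTS[Tasks.word_segmentation]  # ['output']
out = {'output': '今天 天气 不错'}
missing = [k for k in required if k not in out]
assert not missing, f'expected output keys are {required}, those {missing} are missing'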
+ 10
- 7
modelscope/pipelines/builder.py View File

@@ -3,24 +3,27 @@
import os.path as osp
from typing import List, Union

import json
from maas_hub.file_download import model_file_download

from modelscope.models.base import Model
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.constant import CONFIGFILE, Tasks
from modelscope.utils.constant import Tasks
from modelscope.utils.registry import Registry, build_from_cfg
from .base import Pipeline
from .util import is_model_name

PIPELINES = Registry('pipelines')

DEFAULT_MODEL_FOR_PIPELINE = {
# TaskName: (pipeline_module_name, model_repo)
Tasks.image_matting: ('image-matting', 'damo/image-matting-person'),
Tasks.word_segmentation:
('structbert-chinese-word-segmentation',
'damo/nlp_structbert_word-segmentation_chinese-base'),
Tasks.sentence_similarity:
('sbert-base-chinese-sentence-similarity',
'damo/nlp_structbert_sentence-similarity_chinese-base'),
Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting'),
Tasks.text_classification:
('bert-sentiment-analysis', 'damo/bert-base-sst2'),
Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'),
Tasks.text_generation: ('palm2.0',
'damo/nlp_palm2.0_text-generation_chinese-base'),
Tasks.image_captioning: ('ofa', None),
Tasks.image_generation:
('person-image-cartoon',


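With this table, a task name alone is enough to build a pipeline; a minimal sketch:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# resolves via DEFAULT_MODEL_FOR_PIPELINE to
# ('palm2.0', 'damo/nlp_palm2.0_text-generation_chinese-base')
pipe = pipeline(Tasks.text_generation)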
+ 3
- 3
modelscope/pipelines/cv/image_matting_pipeline.py View File

@@ -1,5 +1,5 @@
import os.path as osp
from typing import Any, Dict, List, Tuple, Union
from typing import Any, Dict

import cv2
import numpy as np
@@ -7,7 +7,7 @@ import PIL

from modelscope.pipelines.base import Input
from modelscope.preprocessors import load_image
from modelscope.utils.constant import TF_GRAPH_FILE, Tasks
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from ..base import Pipeline
from ..builder import PIPELINES
@@ -24,7 +24,7 @@ class ImageMattingPipeline(Pipeline):
import tensorflow as tf
if tf.__version__ >= '2.0':
tf = tf.compat.v1
model_path = osp.join(self.model, TF_GRAPH_FILE)
model_path = osp.join(self.model, ModelFile.TF_GRAPH_FILE)

config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True


+ 1
- 1
modelscope/pipelines/multi_modal/__init__.py View File

@@ -1 +1 @@
from .image_captioning import ImageCaptionPipeline
from .image_caption_pipeline import ImageCaptionPipeline

modelscope/pipelines/multi_modal/image_captioning.py → modelscope/pipelines/multi_modal/image_caption_pipeline.py View File

@@ -84,8 +84,11 @@ class ImageCaptionPipeline(Pipeline):
s = torch.cat([s, self.eos_item])
return s

patch_image = self.patch_resize_transform(
load_image(input)).unsqueeze(0)
if isinstance(input, Image.Image):
patch_image = self.patch_resize_transform(input).unsqueeze(0)
else:
patch_image = self.patch_resize_transform(
load_image(input)).unsqueeze(0)
patch_mask = torch.tensor([True])
text = 'what does the image describe?'
src_text = encode_text(

+ 2
- 0
modelscope/pipelines/nlp/__init__.py View File

@@ -1,4 +1,6 @@
from .sentence_similarity_pipeline import * # noqa F403
from .sequence_classification_pipeline import * # noqa F403
from .space.dialog_intent_prediction_pipeline import * # noqa F403
from .space.dialog_modeling_pipeline import * # noqa F403
from .text_generation_pipeline import * # noqa F403
from .word_segmentation_pipeline import * # noqa F403

+ 62
- 0
modelscope/pipelines/nlp/sentence_similarity_pipeline.py View File

@@ -0,0 +1,62 @@
from typing import Any, Dict, Union

import numpy as np

from modelscope.models.nlp import SbertForSentenceSimilarity
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks
from ...models import Model
from ..base import Input, Pipeline
from ..builder import PIPELINES

__all__ = ['SentenceSimilarityPipeline']


@PIPELINES.register_module(
Tasks.sentence_similarity,
module_name=r'sbert-base-chinese-sentence-similarity')
class SentenceSimilarityPipeline(Pipeline):

def __init__(self,
model: Union[SbertForSentenceSimilarity, str],
preprocessor: SequenceClassificationPreprocessor = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction

Args:
model (SbertForSentenceSimilarity): a model instance
preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
"""
assert isinstance(model, str) or isinstance(model, SbertForSentenceSimilarity), \
'model must be a single str or SbertForSentenceSimilarity'
sc_model = model if isinstance(
model,
SbertForSentenceSimilarity) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = SequenceClassificationPreprocessor(
sc_model.model_dir,
first_sequence='first_sequence',
second_sequence='second_sequence')
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

assert hasattr(self.model, 'id2label'), \
'id2label map should be initialized in the init function.'

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): the model output, containing 'probabilities'

Returns:
Dict[str, str]: the prediction results
"""

probs = inputs['probabilities'][0]
num_classes = probs.shape[0]
top_indices = np.argpartition(probs, -num_classes)[-num_classes:]
cls_ids = top_indices[np.argsort(-probs[top_indices], axis=-1)]
probs = probs[cls_ids].tolist()
cls_names = [self.model.id2label[cid] for cid in cls_ids]
b = 0
return {'scores': probs[b], 'labels': cls_names[b]}

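A usage sketch for this pipeline; the model id comes from the defaults table above, and the sentence-pair input relies on the tuple support added to the preprocessor in this commit.

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

similarity = pipeline(
    Tasks.sentence_similarity,
    model='damo/nlp_structbert_sentence-similarity_chinese-base')
result = similarity(('今天天气不错', '今天天气很好'))  # (sentence1, sentence2)
print(result)  # e.g. {'scores': 0.9, 'labels': '1'}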
+ 16
- 40
modelscope/pipelines/nlp/sequence_classification_pipeline.py View File

@@ -1,8 +1,5 @@
import os
import uuid
from typing import Any, Dict, Union

import json
import numpy as np

from modelscope.models.nlp import BertForSequenceClassification
@@ -41,50 +38,29 @@ class SequenceClassificationPipeline(Pipeline):
second_sequence=None)
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

from easynlp.utils import io
self.label_path = os.path.join(sc_model.model_dir,
'label_mapping.json')
with io.open(self.label_path) as f:
self.label_mapping = json.load(f)
self.label_id_to_name = {
idx: name
for name, idx in self.label_mapping.items()
}
assert hasattr(self.model, 'id2label'), \
'id2label map should be initialized in the init function.'

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
def postprocess(self,
inputs: Dict[str, Any],
topk: int = 5) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): _description_
inputs (Dict[str, Any]): input data dict
topk (int): return topk classification result.

Returns:
Dict[str, str]: the prediction results
"""
# NxC np.ndarray
probs = inputs['probs'][0]
num_classes = probs.shape[0]
topk = min(topk, num_classes)
top_indices = np.argpartition(probs, -topk)[-topk:]
# order the top-k classes by descending probability
cls_ids = top_indices[np.argsort(-probs[top_indices])]
probs = probs[cls_ids].tolist()

probs = inputs['probabilities']
logits = inputs['logits']
predictions = np.argsort(-probs, axis=-1)
preds = predictions[0]
b = 0
new_result = list()
for pred in preds:
new_result.append({
'pred': self.label_id_to_name[pred],
'prob': float(probs[b][pred]),
'logit': float(logits[b][pred])
})
new_results = list()
new_results.append({
'id':
inputs['id'][b] if 'id' in inputs else str(uuid.uuid4()),
'output':
new_result,
'predictions':
new_result[0]['pred'],
'probabilities':
','.join([str(t) for t in inputs['probabilities'][b]]),
'logits':
','.join([str(t) for t in inputs['logits'][b]])
})
cls_names = [self.model.id2label[cid] for cid in cls_ids]

return new_results[0]
return {'scores': probs, 'labels': cls_names}

+ 23
- 20
modelscope/pipelines/nlp/text_generation_pipeline.py View File

@@ -1,7 +1,7 @@
from typing import Dict, Optional, Union

from modelscope.models import Model
from modelscope.models.nlp import PalmForTextGenerationModel
from modelscope.models.nlp import PalmForTextGeneration
from modelscope.preprocessors import TextGenerationPreprocessor
from modelscope.utils.constant import Tasks
from ..base import Pipeline, Tensor
@@ -10,11 +10,11 @@ from ..builder import PIPELINES
__all__ = ['TextGenerationPipeline']


@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm')
@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm2.0')
class TextGenerationPipeline(Pipeline):

def __init__(self,
model: Union[PalmForTextGenerationModel, str],
model: Union[PalmForTextGeneration, str],
preprocessor: Optional[TextGenerationPreprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
@@ -23,16 +23,16 @@ class TextGenerationPipeline(Pipeline):
model (PalmForTextGeneration): a model instance
preprocessor (TextGenerationPreprocessor): a preprocessor instance
"""
sc_model = model if isinstance(
model,
PalmForTextGenerationModel) else Model.from_pretrained(model)
model = model if isinstance(
model, PalmForTextGeneration) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = TextGenerationPreprocessor(
sc_model.model_dir,
model.model_dir,
model.tokenizer,
first_sequence='sentence',
second_sequence=None)
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
self.tokenizer = preprocessor.tokenizer
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = model.tokenizer

def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
"""process the prediction results
@@ -43,17 +43,20 @@ class TextGenerationPipeline(Pipeline):
Returns:
Dict[str, str]: the prediction results
"""
replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
replace_tokens_roberta = ((r' +', ' '), ('<mask>', '<q>'), ('<pad>',
''),
('<s>', ''), ('</s>', ''), ('<unk>', ' '))

vocab_size = len(self.tokenizer.vocab)
pred_list = inputs['predictions']
pred_ids = pred_list[0][0].cpu().numpy().tolist()
for j in range(len(pred_ids)):
if pred_ids[j] >= vocab_size:
pred_ids[j] = 100
pred = self.tokenizer.convert_ids_to_tokens(pred_ids)
pred_string = ''.join(pred).replace(
'##',
'').split('[SEP]')[0].replace('[CLS]',
'').replace('[SEP]',
'').replace('[UNK]', '')
return {'pred_string': pred_string}
pred_string = self.tokenizer.decode(pred_ids)
# NOTE: str.replace treats the r' +' entries literally; collapsing
# repeated spaces would require re.sub instead
for _old, _new in replace_tokens_bert:
pred_string = pred_string.replace(_old, _new)
pred_string = pred_string.strip()
for _old, _new in replace_tokens_roberta:
pred_string = pred_string.replace(_old, _new)
pred_string = pred_string.strip()
return {'text': pred_string}

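And a matching usage sketch for the renamed palm2.0 pipeline (the prompt is arbitrary):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

generator = pipeline(
    Tasks.text_generation,
    model='damo/nlp_palm2.0_text-generation_chinese-base')
result = generator('今天天气不错')
print(result['text'])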
+ 69
- 0
modelscope/pipelines/nlp/word_segmentation_pipeline.py View File

@@ -0,0 +1,69 @@
from typing import Any, Dict, Optional, Union

from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

__all__ = ['WordSegmentationPipeline']


@PIPELINES.register_module(
Tasks.word_segmentation,
module_name=r'structbert-chinese-word-segmentation')
class WordSegmentationPipeline(Pipeline):

def __init__(self,
model: Union[StructBertForTokenClassification, str],
preprocessor: Optional[TokenClassifcationPreprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction

Args:
model (StructBertForTokenClassification): a model instance
preprocessor (TokenClassifcationPreprocessor): a preprocessor instance
"""
model = model if isinstance(
model,
StructBertForTokenClassification) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = TokenClassifcationPreprocessor(model.model_dir)
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = preprocessor.tokenizer
self.config = model.config
self.id2label = self.config.id2label

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): the model output, containing 'predictions' and 'text'

Returns:
Dict[str, str]: the prediction results
"""

pred_list = inputs['predictions']
labels = []
for pre in pred_list:
labels.append(self.id2label[pre])
labels = labels[1:-1]  # drop the [CLS]/[SEP] positions
chunks = []
chunk = ''
assert len(inputs['text']) == len(labels)
for token, label in zip(inputs['text'], labels):
if label[0] == 'B' or label[0] == 'I':
chunk += token
else:
chunk += token
chunks.append(chunk)
chunk = ''
if chunk:
chunks.append(chunk)
seg_result = ' '.join(chunks)
rst = {
'output': seg_result,
}
return rst

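A usage sketch; the model id is taken from the defaults table, and the segmentation shown is illustrative:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

seg = pipeline(
    Tasks.word_segmentation,
    model='damo/nlp_structbert_word-segmentation_chinese-base')
print(seg('今天天气不错适合出去游玩'))  # e.g. {'output': '今天 天气 不错 适合 出去 游玩'}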
+ 117
- 0
modelscope/pipelines/outputs.py View File

@@ -0,0 +1,117 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from modelscope.utils.constant import Tasks

TASK_OUTPUTS = {

# ============ vision tasks ===================

# image classification result for single sample
# {
# "labels": ["dog", "horse", "cow", "cat"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.image_classification: ['scores', 'labels'],
Tasks.image_tagging: ['scores', 'labels'],

# object detection result for single sample
# {
# "boxes": [
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# ],
# "labels": ["dog", "horse", "cow", "cat"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.object_detection: ['scores', 'labels', 'boxes'],

# instance segmentation result for single sample
# {
# "masks": [
# np.array in bgr channel order
# ],
# "labels": ["dog", "horse", "cow", "cat"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.image_segmentation: ['scores', 'labels', 'masks'],

# image generation/editing/matting result for single sample
# {
# "output_png": np.array with shape(h, w, 4)
# for matting or (h, w, 3) for general purpose
# }
Tasks.image_editing: ['output_png'],
Tasks.image_matting: ['output_png'],
Tasks.image_generation: ['output_png'],

# pose estimation result for single sample
# {
# "poses": np.array with shape [num_pose, num_keypoint, 3],
# each keypoint is a array [x, y, score]
# "boxes": np.array with shape [num_pose, 4], each box is
# [x1, y1, x2, y2]
# }
Tasks.pose_estimation: ['poses', 'boxes'],

# ============ nlp tasks ===================

# text classification result for single sample
# {
# "labels": ["happy", "sad", "calm", "angry"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.text_classification: ['scores', 'labels'],

# text generation result for single sample
# {
# "text": "this is text generated by a model."
# }
Tasks.text_generation: ['text'],

# word segmentation result for single sample
# {
# "output": "今天 天气 不错 , 适合 出去 游玩"
# }
Tasks.word_segmentation: ['output'],

# sentence similarity result for single sample
# {
# "labels": "1",
# "scores": 0.9
# }
Tasks.sentence_similarity: ['scores', 'labels'],

# ============ audio tasks ===================

# audio processed for single file in PCM format
# {
# "output_pcm": np.array with shape(samples,) and dtype float32
# }
Tasks.speech_signal_process: ['output_pcm'],

# ============ multi-modal tasks ===================

# image caption result for single sample
# {
# "caption": "this is an image caption text."
# }
Tasks.image_captioning: ['caption'],

# visual grounding result for single sample
# {
# "boxes": [
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# ],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.visual_grounding: ['boxes', 'scores'],

# text_to_image result for a single sample
# {
# "image": np.ndarray with shape [height, width, 3]
# }
Tasks.text_to_image_synthesis: ['image']
}

+ 21
- 17
modelscope/pipelines/util.py View File

@@ -1,12 +1,23 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import os.path as osp
from typing import List, Union

import json
from maas_hub.file_download import model_file_download

from modelscope.utils.constant import CONFIGFILE
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger

logger = get_logger()


def is_config_has_model(cfg_file):
try:
cfg = Config.from_file(cfg_file)
return hasattr(cfg, 'model')
except Exception as e:
logger.error(f'parsing config file {cfg_file} failed: {e}')
return False


def is_model_name(model: Union[str, List]):
@@ -15,24 +26,17 @@ def is_model_name(model: Union[str, List]):

def is_model_name_impl(model):
if osp.exists(model):
if osp.exists(osp.join(model, CONFIGFILE)):
return True
cfg_file = osp.join(model, ModelFile.CONFIGURATION)
if osp.exists(cfg_file):
return is_config_has_model(cfg_file)
else:
return False
else:
# try:
# cfg_file = model_file_download(model, CONFIGFILE)
# except Exception:
# cfg_file = None
# TODO @wenmeng.zwm use exception instead of
# following tricky logic
cfg_file = model_file_download(model, CONFIGFILE)
with open(cfg_file, 'r') as infile:
cfg = json.load(infile)
if 'Code' in cfg:
try:
cfg_file = model_file_download(model, ModelFile.CONFIGURATION)
return is_config_has_model(cfg_file)
except Exception:
return False
else:
return True

if isinstance(model, str):
return is_model_name_impl(model)

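Sketch of the new behavior: a path (or hub id) now counts as a model only if its configuration.json parses and declares a model section; both inputs below are hypothetical.

from modelscope.pipelines.util import is_model_name

# local directory whose configuration.json contains a `model` section
print(is_model_name('/path/to/local_model'))  # True
# a bare pipeline name has no configuration file on the hub
print(is_model_name('image-matting'))  # False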

+ 1
- 1
modelscope/preprocessors/__init__.py View File

@@ -1,10 +1,10 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .audio import LinearAECAndFbank
from .base import Preprocessor
from .builder import PREPROCESSORS, build_preprocessor
from .common import Compose
from .image import LoadImage, load_image
from .nlp import * # noqa F403
from .nlp import TextGenerationPreprocessor
from .space.dialog_intent_prediction_preprocessor import * # noqa F403
from .space.dialog_modeling_preprocessor import * # noqa F403

+ 230
- 0
modelscope/preprocessors/audio.py View File

@@ -0,0 +1,230 @@
import ctypes
import os
from typing import Any, Dict

import numpy as np
import scipy.io.wavfile as wav
import torch
import torchaudio.compliance.kaldi as kaldi
from numpy.ctypeslib import ndpointer

from modelscope.utils.constant import Fields
from .builder import PREPROCESSORS


def load_wav(path):
samp_rate, data = wav.read(path)
return np.float32(data), samp_rate


def load_library(libaec):
libaec_in_cwd = os.path.join('.', libaec)
if os.path.exists(libaec_in_cwd):
libaec = libaec_in_cwd
mitaec = ctypes.cdll.LoadLibrary(libaec)
fe_process = mitaec.fe_process_inst
fe_process.argtypes = [
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), ctypes.c_int,
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'),
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS')
]
return fe_process


def do_linear_aec(fe_process, mic, ref, int16range=True):
mic = np.float32(mic)
ref = np.float32(ref)
if len(mic) > len(ref):
mic = mic[:len(ref)]
out_mic = np.zeros_like(mic)
out_linear = np.zeros_like(mic)
out_echo = np.zeros_like(mic)
out_ref = np.zeros_like(mic)
if int16range:
mic /= 32768
ref /= 32768
fe_process(mic, ref, len(mic), out_mic, out_linear, out_echo)
# out_ref not in use here
if int16range:
out_mic *= 32768
out_linear *= 32768
out_echo *= 32768
return out_mic, out_ref, out_linear, out_echo


def load_kaldi_feature_transform(filename):
with open(filename, 'r') as fp:
all_str = fp.read()
pos1 = all_str.find('AddShift')
pos2 = all_str.find('[', pos1)
pos3 = all_str.find(']', pos2)
mean = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
pos1 = all_str.find('Rescale')
pos2 = all_str.find('[', pos1)
pos3 = all_str.find(']', pos2)
scale = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ')
return mean, scale


class Feature:
r"""Extract feat from one utterance.
"""

def __init__(self,
fbank_config,
feat_type='spec',
mvn_file=None,
cuda=False):
r"""

Args:
fbank_config (dict):
feat_type (str):
raw: do nothing
fbank: use kaldi.fbank
spec: Real/Imag
logpow: log(1+|x|^2)
mvn_file (str): the path of the data file for mean variance normalization
cuda (bool): if True, move the window and mvn buffers to GPU
"""
self.fbank_config = fbank_config
self.feat_type = feat_type
self.n_fft = fbank_config['frame_length'] * fbank_config[
'sample_frequency'] // 1000
self.hop_length = fbank_config['frame_shift'] * fbank_config[
'sample_frequency'] // 1000
self.window = torch.hamming_window(self.n_fft, periodic=False)

self.mvn = False
if mvn_file is not None and os.path.exists(mvn_file):
print(f'loading mvn file: {mvn_file}')
shift, scale = load_kaldi_feature_transform(mvn_file)
self.shift = torch.from_numpy(shift)
self.scale = torch.from_numpy(scale)
self.mvn = True
if cuda:
self.window = self.window.cuda()
if self.mvn:
self.shift = self.shift.cuda()
self.scale = self.scale.cuda()

def compute(self, utt):
r"""

Args:
utt: in [-32768, 32767] range

Returns:
[..., T, F]
"""
if self.feat_type == 'raw':
return utt
elif self.feat_type == 'fbank':
if len(utt.shape) == 1:
utt = utt.unsqueeze(0)
feat = kaldi.fbank(utt, **self.fbank_config)
elif self.feat_type == 'spec':
spec = torch.stft(
utt / 32768,
self.n_fft,
self.hop_length,
self.n_fft,
self.window,
center=False,
return_complex=True)
feat = torch.cat([spec.real, spec.imag], dim=-2).permute(-1, -2)
elif self.feat_type == 'logpow':
spec = torch.stft(
utt,
self.n_fft,
self.hop_length,
self.n_fft,
self.window,
center=False,
return_complex=True)
abspow = torch.abs(spec)**2
feat = torch.log(1 + abspow).permute(-1, -2)
else:
raise ValueError(f'unknown feat_type: {self.feat_type}')
return feat

def normalize(self, feat):
if self.mvn:
feat = feat + self.shift
feat = feat * self.scale
return feat


@PREPROCESSORS.register_module(Fields.audio)
class LinearAECAndFbank:
SAMPLE_RATE = 16000

def __init__(self, io_config):
self.trunc_length = 7200 * self.SAMPLE_RATE
self.linear_aec_delay = io_config['linear_aec_delay']
self.feature = Feature(io_config['fbank_config'],
io_config['feat_type'], io_config['mvn'])
self.mitaec = load_library(io_config['mitaec_library'])
self.mask_on_mic = io_config['mask_on'] == 'nearend_mic'

def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
""" linear filtering the near end mic and far end audio, then extract the feature
:param data: dict with two keys and correspond audios: "nearend_mic" and "farend_speech"
:return: dict with two keys and Tensor values: "base" linear filtered audio,and "feature"
"""
# read files
nearend_mic, fs = load_wav(data['nearend_mic'])
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
farend_speech, fs = load_wav(data['farend_speech'])
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
if 'nearend_speech' in data:
nearend_speech, fs = load_wav(data['nearend_speech'])
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}'
else:
nearend_speech = np.zeros_like(nearend_mic)

out_mic, out_ref, out_linear, out_echo = do_linear_aec(
self.mitaec, nearend_mic, farend_speech)
# fix 20ms linear aec delay by delaying the target speech
extra_zeros = np.zeros([int(self.linear_aec_delay * fs)])
nearend_speech = np.concatenate([extra_zeros, nearend_speech])
# truncate files to the same length
flen = min(
len(out_mic), len(out_ref), len(out_linear), len(out_echo),
len(nearend_speech))
fstart = 0
flen = min(flen, self.trunc_length)
# note: from here on, nearend_mic holds the AEC-processed mic signal (out_mic)
nearend_mic, out_ref, out_linear, out_echo, nearend_speech = (
out_mic[fstart:flen], out_ref[fstart:flen],
out_linear[fstart:flen], out_echo[fstart:flen],
nearend_speech[fstart:flen])

# extract features (frames, [mic, linear, ref, aes?])
feat = torch.FloatTensor()

nearend_mic = torch.from_numpy(np.float32(nearend_mic))
fbank_nearend_mic = self.feature.compute(nearend_mic)
feat = torch.cat([feat, fbank_nearend_mic], dim=1)

out_linear = torch.from_numpy(np.float32(out_linear))
fbank_out_linear = self.feature.compute(out_linear)
feat = torch.cat([feat, fbank_out_linear], dim=1)

out_echo = torch.from_numpy(np.float32(out_echo))
fbank_out_echo = self.feature.compute(out_echo)
feat = torch.cat([feat, fbank_out_echo], dim=1)

# feature transform
feat = self.feature.normalize(feat)

# prepare target
if nearend_speech is not None:
nearend_speech = torch.from_numpy(np.float32(nearend_speech))

if self.mask_on_mic:
base = nearend_mic
else:
base = out_linear
out_data = {'base': base, 'target': nearend_speech, 'feature': feat}
return out_data

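A small sketch of the Feature helper in 'spec' mode; the fbank_config values are illustrative but reproduce the 640/320 STFT used elsewhere in this commit (frame_length and frame_shift are in ms):

import torch

from modelscope.preprocessors.audio import Feature

# 40 ms / 20 ms at 16 kHz give n_fft=640 and hop_length=320
feature = Feature(
    fbank_config={
        'frame_length': 40,
        'frame_shift': 20,
        'sample_frequency': 16000,
    },
    feat_type='spec')
utt = torch.randn(16000) * 32768  # one second of int16-range audio
spec = feature.compute(utt)  # [frames, 2 * (n_fft // 2 + 1)] = [49, 642]
print(spec.shape)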
+ 1
- 1
modelscope/preprocessors/image.py View File

@@ -9,7 +9,7 @@ from modelscope.utils.constant import Fields
from .builder import PREPROCESSORS


@PREPROCESSORS.register_module(Fields.image)
@PREPROCESSORS.register_module(Fields.cv)
class LoadImage:
"""Load an image from file or url.
Added or updated keys are "filename", "img", "img_shape",


+ 82
- 16
modelscope/preprocessors/nlp.py View File

@@ -11,8 +11,8 @@ from .base import Preprocessor
from .builder import PREPROCESSORS

__all__ = [
'Tokenize',
'SequenceClassificationPreprocessor',
'Tokenize', 'SequenceClassificationPreprocessor',
'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor'
]


@@ -31,7 +31,7 @@ class Tokenize(Preprocessor):


@PREPROCESSORS.register_module(
Fields.nlp, module_name=r'bert-sentiment-analysis')
Fields.nlp, module_name=r'bert-sequence-classification')
class SequenceClassificationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
@@ -51,21 +51,42 @@ class SequenceClassificationPreprocessor(Preprocessor):
self.sequence_length = kwargs.pop('sequence_length', 128)

self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
print(f'this is the tokenzier {self.tokenizer}')

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
@type_assert(object, (str, tuple))
def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]:
"""process the raw input data

Args:
data (str): a sentence
Example:
'you are so handsome.'
data (str or tuple):
sentence1 (str): a sentence
Example:
'you are so handsome.'
or
(sentence1, sentence2)
sentence1 (str): a sentence
Example:
'you are so handsome.'
sentence2 (str): a sentence
Example:
'you are so beautiful.'

Returns:
Dict[str, Any]: the preprocessed data
"""

new_data = {self.first_sequence: data}
if not isinstance(data, tuple):
data = (
data,
None,
)

sentence1, sentence2 = data
new_data = {
self.first_sequence: sentence1,
self.second_sequence: sentence2
}

# preprocess the data for the model input

rst = {
@@ -94,17 +115,15 @@ class SequenceClassificationPreprocessor(Preprocessor):
return rst


@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm')
@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm2.0')
class TextGenerationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
"""preprocess the data using the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
tokenizer: the tokenizer instance shared with the model
"""
from sofa import PalmTokenizer

super().__init__(*args, **kwargs)

self.model_dir: str = model_dir
@@ -113,7 +132,7 @@ class TextGenerationPreprocessor(Preprocessor):
self.second_sequence: str = kwargs.pop('second_sequence',
'second_sequence')
self.sequence_length: int = kwargs.pop('sequence_length', 128)
self.tokenizer = PalmTokenizer.from_pretrained(model_dir)
self.tokenizer = tokenizer

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
@@ -132,7 +151,7 @@ class TextGenerationPreprocessor(Preprocessor):
new_data = {self.first_sequence: data}
# preprocess the data for the model input

rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
rst = {'input_ids': [], 'attention_mask': []}

max_seq_length = self.sequence_length

@@ -147,6 +166,53 @@ class TextGenerationPreprocessor(Preprocessor):

rst['input_ids'].append(feature['input_ids'])
rst['attention_mask'].append(feature['attention_mask'])
rst['token_type_ids'].append(feature['token_type_ids'])

return {k: torch.tensor(v) for k, v in rst.items()}


@PREPROCESSORS.register_module(
Fields.nlp, module_name=r'bert-token-classification')
class TokenClassifcationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""

super().__init__(*args, **kwargs)

from sofa import SbertTokenizer
self.model_dir: str = model_dir
self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
"""process the raw input data

Args:
data (str): a sentence
Example:
'you are so handsome.'

Returns:
Dict[str, Any]: the preprocessed data
"""
# preprocess the data for the model input

text = data.replace(' ', '').strip()
tokens = []
for token in text:
token = self.tokenizer.tokenize(token)
tokens.extend(token)
input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids)
attention_mask = [1] * len(input_ids)
token_type_ids = [0] * len(input_ids)
return {
'text': text,
'input_ids': input_ids,
'attention_mask': attention_mask,
'token_type_ids': token_type_ids
}

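The updated preprocessor accepts either a single sentence or a sentence pair; a quick sketch (the model_dir is a placeholder for a local directory with tokenizer files):

from modelscope.preprocessors.nlp import SequenceClassificationPreprocessor

prep = SequenceClassificationPreprocessor(
    '/path/to/model_dir',
    first_sequence='first_sequence',
    second_sequence='second_sequence')
single = prep('you are so handsome.')
pair = prep(('you are so handsome.', 'you are so beautiful.'))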
+ 6
- 6
modelscope/utils/config.py View File

@@ -74,17 +74,17 @@ class Config:
{'c': [1, 2, 3], 'd': 'dd'}
>>> cfg.b.d
'dd'
>>> cfg = Config.from_file('configs/examples/config.json')
>>> cfg = Config.from_file('configs/examples/configuration.json')
>>> cfg.filename
'configs/examples/config.json'
'configs/examples/configuration.json'
>>> cfg.b
{'c': [1, 2, 3], 'd': 'dd'}
>>> cfg = Config.from_file('configs/examples/config.py')
>>> cfg = Config.from_file('configs/examples/configuration.py')
>>> cfg.filename
"configs/examples/config.py"
>>> cfg = Config.from_file('configs/examples/config.yaml')
"configs/examples/configuration.py"
>>> cfg = Config.from_file('configs/examples/configuration.yaml')
>>> cfg.filename
"configs/examples/config.yaml"
"configs/examples/configuration.yaml"
"""

@staticmethod


+ 15
- 13
modelscope/utils/constant.py View File

@@ -4,8 +4,8 @@
class Fields(object):
""" Names for different application fields
"""
image = 'image'
video = 'video'
# image = 'image'
# video = 'video'
cv = 'cv'
nlp = 'nlp'
audio = 'audio'
@@ -30,7 +30,9 @@ class Tasks(object):
image_matting = 'image-matting'

# nlp tasks
word_segmentation = 'word-segmentation'
sentiment_analysis = 'sentiment-analysis'
sentence_similarity = 'sentence-similarity'
text_classification = 'text-classification'
relation_extraction = 'relation-extraction'
zero_shot = 'zero-shot'
@@ -52,7 +54,7 @@ class Tasks(object):
text_to_speech = 'text-to-speech'
speech_signal_process = 'speech-signal-process'

# multi-media
# multi-modal tasks
image_captioning = 'image-captioning'
visual_grounding = 'visual-grounding'
text_to_image_synthesis = 'text-to-image-synthesis'
@@ -73,16 +75,16 @@ class Hubs(object):
huggingface = 'huggingface'


# configuration filename
# in order to avoid conflict with huggingface
# config file we use maas_config instead
CONFIGFILE = 'maas_config.json'
class ModelFile(object):
CONFIGURATION = 'configuration.json'
README = 'README.md'
TF_SAVED_MODEL_FILE = 'saved_model.pb'
TF_GRAPH_FILE = 'tf_graph.pb'
TF_CHECKPOINT_FOLDER = 'tf_ckpts'
TF_CKPT_PREFIX = 'ckpt-'
TORCH_MODEL_FILE = 'pytorch_model.pt'
TORCH_MODEL_BIN_FILE = 'pytorch_model.bin'


README_FILE = 'README.md'
TF_SAVED_MODEL_FILE = 'saved_model.pb'
TF_GRAPH_FILE = 'tf_graph.pb'
TF_CHECKPOINT_FOLDER = 'tf_ckpts'
TF_CHECKPOINT_FILE = 'checkpoint'
TORCH_MODEL_FILE = 'pytorch_model.bin'
TENSORFLOW = 'tensorflow'
PYTORCH = 'pytorch'

+ 1
- 1
modelscope/utils/registry.py View File

@@ -1,7 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import inspect
from email.policy import default

from modelscope.utils.logger import get_logger

@@ -70,6 +69,7 @@ class Registry(object):
f'{self._name}[{group_key}]')

self._modules[group_key][module_name] = module_cls
module_cls.group_key = group_key

if module_name in self._modules[default_group]:
if id(self._modules[default_group][module_name]) == id(module_cls):


+ 20
- 0
modelscope/utils/test_utils.py View File

@@ -0,0 +1,20 @@
#!/usr/bin/env python
# Copyright (c) Alibaba, Inc. and its affiliates.

import os

TEST_LEVEL = 2
TEST_LEVEL_STR = 'TEST_LEVEL'


def test_level():
global TEST_LEVEL
if TEST_LEVEL_STR in os.environ:
TEST_LEVEL = int(os.environ[TEST_LEVEL_STR])

return TEST_LEVEL


def set_test_level(level: int):
global TEST_LEVEL
TEST_LEVEL = level

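A typical (assumed) use of this helper is gating slow tests on the TEST_LEVEL environment variable:

import unittest

from modelscope.utils.test_utils import test_level


class SlowTest(unittest.TestCase):

    @unittest.skipUnless(test_level() >= 2, 'skipped at current test level')
    def test_expensive_model(self):
        pass  # hypothetical slow test body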
+ 1
- 0
requirements/docs.txt View File

@@ -1,6 +1,7 @@
docutils==0.16.0
recommonmark
sphinx==4.0.2
sphinx-book-theme
sphinx-copybutton
sphinx_markdown_tables
sphinx_rtd_theme==0.5.2

+ 1
- 1
requirements/nlp.txt View File

@@ -1 +1 @@
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.1.3-py3-none-any.whl
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.2-py3-none-any.whl

+3 -2  requirements/runtime.txt

@@ -1,12 +1,13 @@
 addict
 datasets
 easydict
-https://maashub.oss-cn-hangzhou.aliyuncs.com/releases/maas_hub-0.1.0.dev0-py2.py3-none-any.whl
+https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.4.dev0-py3-none-any.whl
 numpy
 opencv-python-headless
-Pillow
+Pillow>=6.2.0
+pyyaml
 requests
 scipy
 tokenizers<=0.10.3
 transformers<=4.16.2
 yapf

+2 -1  setup.cfg

@@ -11,6 +11,7 @@ default_section = THIRDPARTY
 BASED_ON_STYLE = pep8
 BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
 SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
+SPLIT_BEFORE_ARITHMETIC_OPERATOR = true

 [codespell]
 skip = *.ipynb
@@ -20,5 +21,5 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
 [flake8]
 select = B,C,E,F,P,T4,W,B9
 max-line-length = 120
-ignore = F401,F821
+ignore = F401,F821,W503
 exclude = docs/src,*.pyi,.git
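The two changes are linked: with SPLIT_BEFORE_ARITHMETIC_OPERATOR enabled, yapf breaks long expressions before the operator, which is exactly what flake8's W503 (line break before binary operator) flags, so W503 joins the ignore list to keep the two tools from fighting. A hedged illustration of the style yapf then produces (variable names invented for the example):

    # with SPLIT_BEFORE_ARITHMETIC_OPERATOR = true, yapf prefers
    total = (first_term
             + second_term
             + third_term)  # would trip W503 if it were not ignored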

+8 -11  tests/pipelines/test_base.py

@@ -35,9 +35,10 @@ class CustomPipelineTest(unittest.TestCase):
             CustomPipeline1()

     def test_custom(self):
+        dummy_task = 'dummy-task'

         @PIPELINES.register_module(
-            group_key=Tasks.image_tagging, module_name='custom-image')
+            group_key=dummy_task, module_name='custom-image')
         class CustomImagePipeline(Pipeline):

             def __init__(self,
@@ -67,32 +68,28 @@ class CustomPipelineTest(unittest.TestCase):
                 outputs['filename'] = inputs['url']
                 img = inputs['img']
                 new_image = img.resize((img.width // 2, img.height // 2))
-                outputs['resize_image'] = np.array(new_image)
-                outputs['dummy_result'] = 'dummy_result'
+                outputs['output_png'] = np.array(new_image)
                 return outputs

             def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
                 return inputs

         self.assertTrue('custom-image' in PIPELINES.modules[default_group])
-        add_default_pipeline_info(Tasks.image_tagging, 'custom-image')
+        add_default_pipeline_info(dummy_task, 'custom-image', overwrite=True)
         pipe = pipeline(pipeline_name='custom-image')
-        pipe2 = pipeline(Tasks.image_tagging)
+        pipe2 = pipeline(dummy_task)
         self.assertTrue(type(pipe) is type(pipe2))

-        img_url = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.' \
-            'aliyuncs.com/data/test/images/image1.jpg'
+        img_url = 'data/test/images/image1.jpg'
         output = pipe(img_url)
         self.assertEqual(output['filename'], img_url)
-        self.assertEqual(output['resize_image'].shape, (318, 512, 3))
-        self.assertEqual(output['dummy_result'], 'dummy_result')
+        self.assertEqual(output['output_png'].shape, (318, 512, 3))

         outputs = pipe([img_url for i in range(4)])
         self.assertEqual(len(outputs), 4)
         for out in outputs:
             self.assertEqual(out['filename'], img_url)
-            self.assertEqual(out['resize_image'].shape, (318, 512, 3))
-            self.assertEqual(out['dummy_result'], 'dummy_result')
+            self.assertEqual(out['output_png'].shape, (318, 512, 3))


 if __name__ == '__main__':
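Outside the test harness, the same mechanics let user code register a pipeline under an ad-hoc task name and then look it up by module name or by task. A minimal sketch with hypothetical names throughout (the import path for PIPELINES and add_default_pipeline_info is assumed to be the builder module the test imports from):

    from modelscope.pipelines import pipeline
    from modelscope.pipelines.builder import PIPELINES, add_default_pipeline_info


    @PIPELINES.register_module(group_key='my-task', module_name='my-pipeline')
    class MyPipeline:  # a real implementation would subclass Pipeline as above
        pass


    add_default_pipeline_info('my-task', 'my-pipeline', overwrite=True)
    pipe = pipeline('my-task')  # resolved through the default task mapping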


+3 -4  tests/pipelines/test_image_captioning.py

@@ -7,11 +7,12 @@ import unittest
 from modelscope.fileio import File
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level


 class ImageCaptionTest(unittest.TestCase):

-    @unittest.skip('skip long test')
+    @unittest.skip('skip before model is restored in model hub')
     def test_run(self):
         model = 'https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/caption_large_best_clean.pt'

@@ -26,9 +27,7 @@ class ImageCaptionTest(unittest.TestCase):
         img_captioning = pipeline(
             Tasks.image_captioning, model=ofile.name, bpe_dir=bpe_dir)

-        result = img_captioning(
-            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-        )
+        result = img_captioning('data/test/images/image_matting.png')
         print(result['caption'])




+11 -15  tests/pipelines/test_image_matting.py

@@ -9,14 +9,15 @@ import cv2
 from modelscope.fileio import File
 from modelscope.pipelines import pipeline
 from modelscope.pydatasets import PyDataset
-from modelscope.utils.constant import Tasks
+from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.hub import get_model_cache_dir
+from modelscope.utils.test_utils import test_level


 class ImageMattingTest(unittest.TestCase):

     def setUp(self) -> None:
-        self.model_id = 'damo/cv_unet_image-matting_damo'
+        self.model_id = 'damo/cv_unet_image-matting'
         # switch to False if downloading everytime is not desired
         purge_cache = True
         if purge_cache:
@@ -28,20 +29,17 @@ class ImageMattingTest(unittest.TestCase):
         model_path = 'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs' \
                      '.com/data/test/maas/image_matting/matting_person.pb'
         with tempfile.TemporaryDirectory() as tmp_dir:
-            model_file = osp.join(tmp_dir, 'matting_person.pb')
+            model_file = osp.join(tmp_dir, ModelFile.TF_GRAPH_FILE)
             with open(model_file, 'wb') as ofile:
                 ofile.write(File.read(model_path))
             img_matting = pipeline(Tasks.image_matting, model=tmp_dir)

-            result = img_matting(
-                'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-            )
+            result = img_matting('data/test/images/image_matting.png')
             cv2.imwrite('result.png', result['output_png'])

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_dataset(self):
-        input_location = [
-            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-        ]
+        input_location = ['data/test/images/image_matting.png']
         # alternatively:
         # input_location = '/dir/to/images'

@@ -52,21 +50,19 @@ class ImageMattingTest(unittest.TestCase):
             cv2.imwrite('result.png', next(result)['output_png'])
             print(f'Output written to {osp.abspath("result.png")}')

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub(self):
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)

-        result = img_matting(
-            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-        )
+        result = img_matting('data/test/images/image_matting.png')
         cv2.imwrite('result.png', result['output_png'])
         print(f'Output written to {osp.abspath("result.png")}')

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         img_matting = pipeline(Tasks.image_matting)

-        result = img_matting(
-            'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/test/maas/image_matting/test.png'
-        )
+        result = img_matting('data/test/images/image_matting.png')
         cv2.imwrite('result.png', result['output_png'])
         print(f'Output written to {osp.abspath("result.png")}')



+3 -0  tests/pipelines/test_person_image_cartoon.py

@@ -8,6 +8,7 @@ import cv2
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.base import Pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level


 class ImageCartoonTest(unittest.TestCase):
@@ -36,10 +37,12 @@ class ImageCartoonTest(unittest.TestCase):
         img_cartoon = pipeline(Tasks.image_generation, model=model_dir)
         self.pipeline_inference(img_cartoon, self.test_image)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_modelhub(self):
         img_cartoon = pipeline(Tasks.image_generation, model=self.model_id)
         self.pipeline_inference(img_cartoon, self.test_image)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         img_cartoon = pipeline(Tasks.image_generation)
         self.pipeline_inference(img_cartoon, self.test_image)


+67 -0  tests/pipelines/test_sentence_similarity.py

@@ -0,0 +1,67 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import shutil
+import unittest
+
+from maas_hub.snapshot_download import snapshot_download
+
+from modelscope.models import Model
+from modelscope.models.nlp import SbertForSentenceSimilarity
+from modelscope.pipelines import SentenceSimilarityPipeline, pipeline
+from modelscope.preprocessors import SequenceClassificationPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import get_model_cache_dir
+from modelscope.utils.test_utils import test_level
+
+
+class SentenceSimilarityTest(unittest.TestCase):
+    model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
+    sentence1 = '今天气温比昨天高么?'
+    sentence2 = '今天湿度比昨天高么?'
+
+    def setUp(self) -> None:
+        # switch to False if downloading everytime is not desired
+        purge_cache = True
+        if purge_cache:
+            shutil.rmtree(
+                get_model_cache_dir(self.model_id), ignore_errors=True)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = SequenceClassificationPreprocessor(cache_path)
+        model = SbertForSentenceSimilarity(cache_path, tokenizer=tokenizer)
+        pipeline1 = SentenceSimilarityPipeline(model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.sentence_similarity, model=model, preprocessor=tokenizer)
+        print('test1')
+        print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
+              f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}')
+        print()
+        print(
+            f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n'
+            f'pipeline2: {pipeline2(input=(self.sentence1, self.sentence2))}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = SequenceClassificationPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_similarity,
+            model=model,
+            preprocessor=tokenizer)
+        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.sentence_similarity, model=self.model_id)
+        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.sentence_similarity)
+        print(pipeline_ins(input=(self.sentence1, self.sentence2)))
+
+
+if __name__ == '__main__':
+    unittest.main()

+56 -0  tests/pipelines/test_speech_signal_process.py

@@ -0,0 +1,56 @@
+import os.path
+import shutil
+import unittest
+
+from modelscope.fileio import File
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import get_model_cache_dir
+
+NEAREND_MIC_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/nearend_mic.wav'
+FAREND_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/farend_speech.wav'
+NEAREND_MIC_FILE = 'nearend_mic.wav'
+FAREND_SPEECH_FILE = 'farend_speech.wav'
+
+AEC_LIB_URL = 'http://isv-data.oss-cn-hangzhou.aliyuncs.com/ics%2FMaaS%2FAEC%2Flib%2Flibmitaec_pyio.so' \
+              '?Expires=1664085465&OSSAccessKeyId=LTAIxjQyZNde90zh&Signature=Y7gelmGEsQAJRK4yyHSYMrdWizk%3D'
+AEC_LIB_FILE = 'libmitaec_pyio.so'
+
+
+def download(remote_path, local_path):
+    local_dir = os.path.dirname(local_path)
+    if len(local_dir) > 0:
+        if not os.path.exists(local_dir):
+            os.makedirs(local_dir)
+    with open(local_path, 'wb') as ofile:
+        ofile.write(File.read(remote_path))
+
+
+class SpeechSignalProcessTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/speech_dfsmn_aec_psm_16k'
+        # switch to False if downloading everytime is not desired
+        purge_cache = True
+        if purge_cache:
+            shutil.rmtree(
+                get_model_cache_dir(self.model_id), ignore_errors=True)
+        # A temporary hack to provide c++ lib. Download it first.
+        download(AEC_LIB_URL, AEC_LIB_FILE)
+
+    def test_run(self):
+        download(NEAREND_MIC_URL, NEAREND_MIC_FILE)
+        download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE)
+        input = {
+            'nearend_mic': NEAREND_MIC_FILE,
+            'farend_speech': FAREND_SPEECH_FILE
+        }
+        aec = pipeline(
+            Tasks.speech_signal_process,
+            model=self.model_id,
+            pipeline_name=r'speech_dfsmn_aec_psm_16k')
+        aec(input, output_path='output.wav')
+
+
+if __name__ == '__main__':
+    unittest.main()

+6 -0  tests/pipelines/test_text_classification.py

@@ -12,6 +12,7 @@ from modelscope.preprocessors import SequenceClassificationPreprocessor
 from modelscope.pydatasets import PyDataset
 from modelscope.utils.constant import Hubs, Tasks
 from modelscope.utils.hub import get_model_cache_dir
+from modelscope.utils.test_utils import test_level


 class SequenceClassificationTest(unittest.TestCase):
@@ -43,6 +44,7 @@ class SequenceClassificationTest(unittest.TestCase):
             break
         print(r)

+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run(self):
         model_url = 'https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com' \
                     '/release/easynlp_modelzoo/alibaba-pai/bert-base-sst2.zip'
@@ -67,6 +69,7 @@ class SequenceClassificationTest(unittest.TestCase):
             Tasks.text_classification, model=model, preprocessor=preprocessor)
         print(pipeline2('Hello world!'))

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
         preprocessor = SequenceClassificationPreprocessor(
@@ -77,6 +80,7 @@ class SequenceClassificationTest(unittest.TestCase):
             preprocessor=preprocessor)
         self.predict(pipeline_ins)

+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         text_classification = pipeline(
             task=Tasks.text_classification, model=self.model_id)
@@ -85,6 +89,7 @@ class SequenceClassificationTest(unittest.TestCase):
             'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
         self.printDataset(result)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_default_model(self):
         text_classification = pipeline(task=Tasks.text_classification)
         result = text_classification(
@@ -92,6 +97,7 @@ class SequenceClassificationTest(unittest.TestCase):
             'glue', name='sst2', target='sentence', hub=Hubs.huggingface))
         self.printDataset(result)

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_dataset(self):
         model = Model.from_pretrained(self.model_id)
         preprocessor = SequenceClassificationPreprocessor(


+54 -26  tests/pipelines/test_text_generation.py

@@ -4,47 +4,75 @@ import unittest
 from maas_hub.snapshot_download import snapshot_download

 from modelscope.models import Model
-from modelscope.models.nlp import PalmForTextGenerationModel
+from modelscope.models.nlp import PalmForTextGeneration
 from modelscope.pipelines import TextGenerationPipeline, pipeline
 from modelscope.preprocessors import TextGenerationPreprocessor
 from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level


 class TextGenerationTest(unittest.TestCase):
-    model_id = 'damo/nlp_palm_text-generation_chinese'
-    input1 = "今日天气类型='晴'&温度变化趋势='大幅上升'&最低气温='28℃'&最高气温='31℃'&体感='湿热'"
-    input2 = "今日天气类型='多云'&体感='舒适'&最低气温='26℃'&最高气温='30℃'"
+    model_id_zh = 'damo/nlp_palm2.0_text-generation_chinese-base'
+    model_id_en = 'damo/nlp_palm2.0_text-generation_english-base'
+    input_zh = """
+    本文总结了十个可穿戴产品的设计原则,而这些原则,同样也是笔者认为是这个行业最吸引人的地方:
+    1.为人们解决重复性问题;2.从人开始,而不是从机器开始;3.要引起注意,但不要刻意;4.提升用户能力,而不是取代
+    """
+    input_en = """
+    The Director of Public Prosecutions who let off Lord Janner over alleged child sex abuse started
+    her career at a legal chambers when the disgraced Labour peer was a top QC there . Alison Saunders ,
+    54 , sparked outrage last week when she decided the 86-year-old should not face astring of charges
+    of paedophilia against nine children because he has dementia . Today , newly-released documents
+    revealed damning evidence that abuse was covered up by police andsocial workers for more than 20 years .
+    And now it has emerged Mrs Saunders ' law career got off to a flying start when she secured her
+    pupillage -- a barrister 's training contract at 1 Garden Court Chambers in London in 1983 .
+    """

-    @unittest.skip('skip temporarily to save test time')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run(self):
-        cache_path = snapshot_download(self.model_id)
-        preprocessor = TextGenerationPreprocessor(
-            cache_path, first_sequence='sentence', second_sequence=None)
-        model = PalmForTextGenerationModel(
-            cache_path, tokenizer=preprocessor.tokenizer)
-        pipeline1 = TextGenerationPipeline(model, preprocessor)
-        pipeline2 = pipeline(
-            Tasks.text_generation, model=model, preprocessor=preprocessor)
-        print(f'input: {self.input1}\npipeline1: {pipeline1(self.input1)}')
-        print()
-        print(f'input: {self.input2}\npipeline2: {pipeline2(self.input2)}')
+        for model_id, input in ((self.model_id_zh, self.input_zh),
+                                (self.model_id_en, self.input_en)):
+            cache_path = snapshot_download(model_id)
+            model = PalmForTextGeneration(cache_path)
+            preprocessor = TextGenerationPreprocessor(
+                cache_path,
+                model.tokenizer,
+                first_sequence='sentence',
+                second_sequence=None)
+            pipeline1 = TextGenerationPipeline(model, preprocessor)
+            pipeline2 = pipeline(
+                Tasks.text_generation, model=model, preprocessor=preprocessor)
+            print(
+                f'pipeline1: {pipeline1(input)}\npipeline2: {pipeline2(input)}'
+            )

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id)
-        preprocessor = TextGenerationPreprocessor(
-            model.model_dir, first_sequence='sentence', second_sequence=None)
-        pipeline_ins = pipeline(
-            task=Tasks.text_generation, model=model, preprocessor=preprocessor)
-        print(pipeline_ins(self.input1))
+        for model_id, input in ((self.model_id_zh, self.input_zh),
+                                (self.model_id_en, self.input_en)):
+            model = Model.from_pretrained(model_id)
+            preprocessor = TextGenerationPreprocessor(
+                model.model_dir,
+                model.tokenizer,
+                first_sequence='sentence',
+                second_sequence=None)
+            pipeline_ins = pipeline(
+                task=Tasks.text_generation,
+                model=model,
+                preprocessor=preprocessor)
+            print(pipeline_ins(input))

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_name(self):
-        pipeline_ins = pipeline(
-            task=Tasks.text_generation, model=self.model_id)
-        print(pipeline_ins(self.input2))
+        for model_id, input in ((self.model_id_zh, self.input_zh),
+                                (self.model_id_en, self.input_en)):
+            pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id)
+            print(pipeline_ins(input))

+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.text_generation)
-        print(pipeline_ins(self.input2))
+        print(pipeline_ins(self.input_zh))


 if __name__ == '__main__':


+62 -0  tests/pipelines/test_word_segmentation.py

@@ -0,0 +1,62 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import shutil
+import unittest
+
+from maas_hub.snapshot_download import snapshot_download
+
+from modelscope.models import Model
+from modelscope.models.nlp import StructBertForTokenClassification
+from modelscope.pipelines import WordSegmentationPipeline, pipeline
+from modelscope.preprocessors import TokenClassifcationPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.hub import get_model_cache_dir
+from modelscope.utils.test_utils import test_level
+
+
+class WordSegmentationTest(unittest.TestCase):
+    model_id = 'damo/nlp_structbert_word-segmentation_chinese-base'
+    sentence = '今天天气不错,适合出去游玩'
+
+    def setUp(self) -> None:
+        # switch to False if downloading everytime is not desired
+        purge_cache = True
+        if purge_cache:
+            shutil.rmtree(
+                get_model_cache_dir(self.model_id), ignore_errors=True)
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_by_direct_model_download(self):
+        cache_path = snapshot_download(self.model_id)
+        tokenizer = TokenClassifcationPreprocessor(cache_path)
+        model = StructBertForTokenClassification(
+            cache_path, tokenizer=tokenizer)
+        pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
+        pipeline2 = pipeline(
+            Tasks.word_segmentation, model=model, preprocessor=tokenizer)
+        print(f'sentence: {self.sentence}\n'
+              f'pipeline1:{pipeline1(input=self.sentence)}')
+        print()
+        print(f'pipeline2: {pipeline2(input=self.sentence)}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        model = Model.from_pretrained(self.model_id)
+        tokenizer = TokenClassifcationPreprocessor(model.model_dir)
+        pipeline_ins = pipeline(
+            task=Tasks.word_segmentation, model=model, preprocessor=tokenizer)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipeline_ins = pipeline(
+            task=Tasks.word_segmentation, model=self.model_id)
+        print(pipeline_ins(input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.word_segmentation)
+        print(pipeline_ins(input=self.sentence))
+
+
+if __name__ == '__main__':
+    unittest.main()

+20 -0  tests/preprocessors/test_image.py

@@ -0,0 +1,20 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+import PIL
+
+from modelscope.preprocessors import load_image
+from modelscope.utils.logger import get_logger
+
+
+class ImagePreprocessorTest(unittest.TestCase):
+
+    def test_load(self):
+        img = load_image('data/test/images/image_matting.png')
+        self.assertTrue(isinstance(img, PIL.Image.Image))
+        self.assertEqual(img.size, (948, 533))
+
+
+if __name__ == '__main__':
+    unittest.main()

+9 -0  tests/run.py

@@ -7,6 +7,11 @@ import sys
 import unittest
 from fnmatch import fnmatch

+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import set_test_level, test_level
+
+logger = get_logger()
+

 def gather_test_cases(test_dir, pattern, list_tests):
     case_list = []
@@ -49,5 +54,9 @@ if __name__ == '__main__':
         '--pattern', default='test_*.py', help='test file pattern')
     parser.add_argument(
         '--test_dir', default='tests', help='directory to be tested')
+    parser.add_argument(
+        '--level', default=0, help='2 -- all, 1 -- p1, 0 -- p0')
     args = parser.parse_args()
+    set_test_level(args.level)
+    logger.info(f'TEST LEVEL: {test_level()}')
     main(args)
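One subtlety: because test_level() re-reads TEST_LEVEL from the environment on every call, an exported variable takes precedence over whatever --level stored via set_test_level(). A small sketch of that precedence:

    import os

    from modelscope.utils.test_utils import set_test_level, test_level

    set_test_level(1)  # roughly what `python tests/run.py --level 1` does
    os.environ['TEST_LEVEL'] = '2'
    print(test_level())  # -> 2; the environment wins on the next lookup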

+5 -8  tests/utils/test_config.py

@@ -1,11 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import argparse
 import os.path as osp
 import tempfile
 import unittest
-from pathlib import Path

-from modelscope.fileio import dump, load
 from modelscope.utils.config import Config

 obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
@@ -14,25 +11,25 @@ obj = {'a': 1, 'b': {'c': [1, 2, 3], 'd': 'dd'}}
 class ConfigTest(unittest.TestCase):

     def test_json(self):
-        config_file = 'configs/examples/config.json'
+        config_file = 'configs/examples/configuration.json'
         cfg = Config.from_file(config_file)
         self.assertEqual(cfg.a, 1)
         self.assertEqual(cfg.b, obj['b'])

     def test_yaml(self):
-        config_file = 'configs/examples/config.yaml'
+        config_file = 'configs/examples/configuration.yaml'
         cfg = Config.from_file(config_file)
         self.assertEqual(cfg.a, 1)
         self.assertEqual(cfg.b, obj['b'])

     def test_py(self):
-        config_file = 'configs/examples/config.py'
+        config_file = 'configs/examples/configuration.py'
         cfg = Config.from_file(config_file)
         self.assertEqual(cfg.a, 1)
         self.assertEqual(cfg.b, obj['b'])

     def test_dump(self):
-        config_file = 'configs/examples/config.py'
+        config_file = 'configs/examples/configuration.py'
         cfg = Config.from_file(config_file)
         self.assertEqual(cfg.a, 1)
         self.assertEqual(cfg.b, obj['b'])
@@ -53,7 +50,7 @@ class ConfigTest(unittest.TestCase):
         self.assertEqual(yaml_str, infile.read())

     def test_to_dict(self):
-        config_file = 'configs/examples/config.json'
+        config_file = 'configs/examples/configuration.json'
         cfg = Config.from_file(config_file)
         d = cfg.to_dict()
         print(d)

