Conflicts:
    modelscope/models/nlp/__init__.py
    modelscope/pipelines/builder.py
    modelscope/pipelines/outputs.py
    modelscope/preprocessors/nlp.py
    requirements/nlp.txt
@@ -0,0 +1,3 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
@@ -24,6 +24,7 @@ wheels/
.installed.cfg
*.egg
/package
/temp
MANIFEST
# PyInstaller
@@ -104,7 +105,6 @@ venv.bak/
# mypy
.mypy_cache/
data
.vscode
.idea
@@ -124,3 +124,7 @@ replace.sh
# Pytorch
*.pth
# audio
*.wav
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:78094cc48fbcfd9b6d321fe13619ecc72b65e006fc1b4c4458409ade9979486d
size 129862
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
size 603621
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
size 603621
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c8435db5583400be5d11a2c17910c96133b462c8a99ccaf0e19f4aac34e0a94
size 141149
@@ -91,6 +91,55 @@ make tests
4. Daily regression tests run all test cases at 0:00 each day on the master branch.

### 2.3 Test data storage

Since we need a lot of data for testing, including images, videos, and models, we use git-lfs
to store those large files.

1. Install git-lfs.

For macOS:
```bash
brew install git-lfs
git lfs install
```
For CentOS, download the rpm from the git-lfs GitHub releases [page](https://github.com/git-lfs/git-lfs/releases/tag/v3.2.0):
```bash
wget http://101374-public.oss-cn-hangzhou-zmf.aliyuncs.com/git-lfs-3.2.0-1.el7.x86_64.rpm
sudo rpm -ivh git-lfs-3.2.0-1.el7.x86_64.rpm
git lfs install
```
For Ubuntu:
```bash
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt-get install git-lfs
git lfs install
```
2. Track your data types with git-lfs. For example, to track png files:
```bash
git lfs track "*.png"
```
3. Add your test files under the `data/test/` folder; you can create subdirectories as needed.
```bash
git add data/test/test.png
```
4. Commit your test data; it is uploaded to the remote branch when you push.
```bash
git commit -m "xxx"
```
To pull data from the remote repo, use the same workflow as for regular git files.
```bash
git pull origin branch_name
```
## Code Review

1. Run the following command to create an aone CR, replacing `TARGET_BRANCH` and `CR_NAME` with the values you want.
@@ -29,3 +29,15 @@ reference: [https://huggingface.co/docs/tokenizers/installation#installation-fro
> ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.

Version conflicts can occur because some dependencies are mutually incompatible; in most cases they do not affect normal operation.

### 3. Version errors when installing pytorch

> ERROR: Ignored the following versions that require a different python version: 1.1.0 Requires-Python >=3.8; 1.1.0rc1 Requires-Python >=3.8; 1.1.1 Requires-Python >=3.8
> ERROR: Could not find a version that satisfies the requirement torch==1.8.1+cu111 (from versions: 1.0.0, 1.0.1, 1.0.1.post2, 1.1.0, 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1, 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.8.1, 1.9.0, 1.9.1, 1.10.0, 1.10.1, 1.10.2, 1.11.0)
> ERROR: No matching distribution found for torch==1.8.1+cu111

Install with the following command:
```shell
pip install -f https://download.pytorch.org/whl/torch_stable.html -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt
```
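Alternatively, if only torch fails to resolve, it can be installed directly against the PyTorch wheel index first (version and CUDA tag taken from the error message above), then the remaining requirements installed as usual:
```shell
pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html
```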
@@ -25,6 +25,10 @@ The ModelScope Library currently supports deep learning training with the two major frameworks, tensorflow and pytorch
* [Pytorch installation guide](https://pytorch.org/get-started/locally/)
* [Tensorflow installation guide](https://www.tensorflow.org/install/pip)

Some third-party dependencies require numpy to be installed in advance:
```
pip install numpy
```

## Installing the ModelScope library
@@ -1,5 +1,8 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .audio.tts.am import SambertNetHifi16k
from .audio.tts.vocoder import Hifigan16k
from .base import Model
from .builder import MODELS, build_model
from .multi_model import OfaForImageCaptioning
from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity
@@ -0,0 +1,60 @@
import torch.nn as nn

from .layer_base import LayerBase


class RectifiedLinear(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(RectifiedLinear, self).__init__()
        self.dim = input_dim
        self.relu = nn.ReLU()

    def forward(self, input):
        return self.relu(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<RectifiedLinear> %d %d\n' % (self.dim, self.dim)
        return re_str

    def load_kaldi_nnet(self, instr):
        # ReLU has no parameters, so there is nothing to consume.
        return instr


class LogSoftmax(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(LogSoftmax, self).__init__()
        self.dim = input_dim
        # Apply log-softmax over the feature dimension; the implicit default
        # dim is deprecated and ambiguous for 3-D inputs.
        self.ls = nn.LogSoftmax(dim=-1)

    def forward(self, input):
        return self.ls(input)

    def to_kaldi_nnet(self):
        # Serialized as <Softmax>: Kaldi nnet1 takes the log outside the
        # component.
        re_str = ''
        re_str += '<Softmax> %d %d\n' % (self.dim, self.dim)
        return re_str

    def load_kaldi_nnet(self, instr):
        return instr


class Sigmoid(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(Sigmoid, self).__init__()
        self.dim = input_dim
        self.sig = nn.Sigmoid()

    def forward(self, input):
        return self.sig(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<Sigmoid> %d %d\n' % (self.dim, self.dim)
        return re_str

    def load_kaldi_nnet(self, instr):
        return instr
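A minimal usage sketch for these wrappers (dimensions are made up; note the `output_dim` argument is accepted but unused by these activation layers):

```python
import torch

relu = RectifiedLinear(128, 128)
x = torch.randn(4, 100, 128)      # [batch, frames, features]
y = relu(x)                       # same shape, negatives zeroed
print(relu.to_kaldi_nnet())       # '<RectifiedLinear> 128 128\n'
```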
@@ -0,0 +1,78 @@
import numpy as np
import torch as th
import torch.nn as nn

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
                         to_kaldi_matrix)


class AffineTransform(LayerBase):

    def __init__(self, input_dim, output_dim):
        super(AffineTransform, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, input):
        return self.linear(input)

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<AffineTransform> %d %d\n' % (self.output_dim,
                                                 self.input_dim)
        re_str += '<LearnRateCoef> 1 <BiasLearnRateCoef> 1 <MaxNorm> 0\n'
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        return re_str

    def to_raw_nnet(self, fid):
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        x.tofile(fid)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        x.tofile(fid)

    def load_kaldi_nnet(self, instr):
        output = expect_token_number(instr, '<LearnRateCoef>')
        if output is None:
            raise Exception('AffineTransform format error for <LearnRateCoef>')
        instr, lr = output

        output = expect_token_number(instr, '<BiasLearnRateCoef>')
        if output is None:
            raise Exception(
                'AffineTransform format error for <BiasLearnRateCoef>')
        instr, lr = output

        output = expect_token_number(instr, '<MaxNorm>')
        if output is None:
            raise Exception('AffineTransform format error for <MaxNorm>')
        instr, lr = output

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('AffineTransform format error for parsing matrix')
        instr, mat = output
        self.linear.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('AffineTransform format error for parsing matrix')
        instr, mat = output
        mat = np.squeeze(mat)
        self.linear.bias = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))
        return instr
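A hedged round-trip sketch (sizes are arbitrary): serialize a layer, drop the `<AffineTransform>` header line, which `load_kaldi_nnet` expects the caller to have already consumed, and reload into a fresh layer.

```python
import torch

layer = AffineTransform(4, 3)
text = layer.to_kaldi_nnet()
body = text.split('\n', 1)[1]           # loader starts at <LearnRateCoef>
restored = AffineTransform(4, 3)
restored.load_kaldi_nnet(body)
x = torch.randn(2, 4)
# equal up to the text-serialization precision of the weights
assert torch.allclose(layer(x), restored(x), atol=1e-4)
```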
@@ -0,0 +1,178 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
                         to_kaldi_matrix)


class DeepFsmn(LayerBase):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 rorder=None,
                 hidden_size=None,
                 layer_norm=False,
                 dropout=0):
        super(DeepFsmn, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        if lorder is None:
            return
        self.lorder = lorder
        self.rorder = rorder
        self.hidden_size = hidden_size
        self.layer_norm = layer_norm
        self.linear = nn.Linear(input_dim, hidden_size)
        self.norm = nn.LayerNorm(hidden_size)
        self.drop1 = nn.Dropout(p=dropout)
        self.drop2 = nn.Dropout(p=dropout)
        self.project = nn.Linear(hidden_size, output_dim, bias=False)
        # Depthwise convolutions over the left (causal) and right (lookahead)
        # context windows.
        self.conv1 = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1], [1, 1],
            groups=output_dim,
            bias=False)
        self.conv2 = nn.Conv2d(
            output_dim,
            output_dim, [rorder, 1], [1, 1],
            groups=output_dim,
            bias=False)

    def forward(self, input):
        # input: [B, T, F]
        f1 = F.relu(self.linear(input))
        f1 = self.drop1(f1)
        if self.layer_norm:
            f1 = self.norm(f1)
        p1 = self.project(f1)
        x = th.unsqueeze(p1, 1)
        # [B, F, T, 1]
        x_per = x.permute(0, 3, 2, 1)
        y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
        yr = F.pad(x_per, [0, 0, 0, self.rorder])
        yr = yr[:, :, 1:, :]
        out = x_per + self.conv1(y) + self.conv2(yr)
        out = self.drop2(out)
        out1 = out.permute(0, 3, 2, 1)
        return input + out1.squeeze()

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<UniDeepFsmn> %d %d\n'\
                  % (self.output_dim, self.input_dim)
        re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n'\
                  % (1, self.hidden_size, self.lorder, 1)
        lfiters = self.state_dict()['conv1.weight']
        x = np.flipud(lfiters.squeeze().numpy().T)
        re_str += to_kaldi_matrix(x)
        proj_weights = self.state_dict()['project.weight']
        x = proj_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        return re_str

    def load_kaldi_nnet(self, instr):
        output = expect_token_number(instr, '<LearnRateCoef>')
        if output is None:
            raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
        instr, lr = output

        output = expect_token_number(instr, '<HidSize>')
        if output is None:
            raise Exception('UniDeepFsmn format error for <HidSize>')
        instr, hiddensize = output
        self.hidden_size = int(hiddensize)

        output = expect_token_number(instr, '<LOrder>')
        if output is None:
            raise Exception('UniDeepFsmn format error for <LOrder>')
        instr, lorder = output
        self.lorder = int(lorder)

        output = expect_token_number(instr, '<LStride>')
        if output is None:
            raise Exception('UniDeepFsmn format error for <LStride>')
        instr, lstride = output
        self.lstride = lstride

        output = expect_token_number(instr, '<MaxNorm>')
        if output is None:
            raise Exception('UniDeepFsmn format error for <MaxNorm>')
        instr, _ = output

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat1 = np.fliplr(mat.T).copy()
        self.conv1 = nn.Conv2d(
            self.output_dim,
            self.output_dim, [self.lorder, 1], [1, 1],
            groups=self.output_dim,
            bias=False)
        mat_th = th.from_numpy(mat1).type(th.FloatTensor)
        mat_th = mat_th.unsqueeze(1)
        mat_th = mat_th.unsqueeze(3)
        self.conv1.weight = th.nn.Parameter(mat_th)

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
        self.linear = nn.Linear(self.input_dim, self.hidden_size)
        self.project.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.linear.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat = np.squeeze(mat)
        self.linear.bias = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))
        return instr
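A quick shape check for the bidirectional block (sizes are arbitrary; `rorder` frames of lookahead are consumed via asymmetric padding, so the output stays frame-aligned):

```python
import torch

layer = DeepFsmn(input_dim=80, output_dim=80, lorder=3, rorder=2,
                 hidden_size=128)
x = torch.randn(2, 100, 80)    # [batch, frames, features]
y = layer(x)                   # residual output: same shape as the input
assert y.shape == x.shape
```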
@@ -0,0 +1,50 @@
import abc
import re

import numpy as np
import torch.nn as nn


def expect_token_number(instr, token):
    first_token = re.match(r'^\s*' + token, instr)
    if first_token is None:
        return None
    instr = instr[first_token.end():]
    lr = re.match(r'^\s*(-?\d+\.?\d*e?-?\d*?)', instr)
    if lr is None:
        return None
    return instr[lr.end():], lr.groups()[0]


def expect_kaldi_matrix(instr):
    pos2 = instr.find('[', 0)
    pos3 = instr.find(']', pos2)
    mat = []
    for stt in instr[pos2 + 1:pos3].split('\n'):
        tmp_mat = np.fromstring(stt, dtype=np.float32, sep=' ')
        if tmp_mat.size > 0:
            mat.append(tmp_mat)
    return instr[pos3 + 1:], np.array(mat)


def to_kaldi_matrix(np_mat):
    """Transform a numpy matrix into a Kaldi-format matrix string.

    :param np_mat: numpy matrix
    :return: str
    """
    np.set_printoptions(threshold=np.inf, linewidth=np.nan, suppress=True)
    out_str = str(np_mat)
    out_str = out_str.replace('[', '')
    out_str = out_str.replace(']', '')
    return '[ %s ]\n' % out_str


class LayerBase(nn.Module, metaclass=abc.ABCMeta):

    def __init__(self):
        super(LayerBase, self).__init__()

    @abc.abstractmethod
    def to_kaldi_nnet(self):
        pass
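A small sketch of how these parsing helpers compose; the token and values are illustrative, not from a real Kaldi nnet file:

```python
import numpy as np

text = '<LOrder> 20 [ 1 2 3\n 4 5 6 ]\nrest'
rest, value = expect_token_number(text, '<LOrder>')
assert value == '20'
rest, mat = expect_kaldi_matrix(rest)
assert mat.shape == (2, 3)
print(to_kaldi_matrix(mat))   # '[ 1. 2. 3.\n 4. 5. 6. ]\n' (formatting may vary)
```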
@@ -0,0 +1,482 @@
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .layer_base import (LayerBase, expect_kaldi_matrix, expect_token_number,
                         to_kaldi_matrix)


class SepConv(nn.Module):

    def __init__(self,
                 in_channels,
                 filters,
                 out_channels,
                 kernel_size=(5, 2),
                 dilation=(1, 1)):
        """
        :param kernel_size: (time, frequency)
        """
        super(SepConv, self).__init__()
        # depthwise + pointwise
        self.dconv = nn.Conv2d(
            in_channels,
            in_channels * filters,
            kernel_size,
            dilation=dilation,
            groups=in_channels)
        self.pconv = nn.Conv2d(
            in_channels * filters, out_channels, kernel_size=1)
        self.padding = dilation[0] * (kernel_size[0] - 1)

    def forward(self, input):
        ''' input: [B, C, T, F]
        '''
        x = F.pad(input, [0, 0, self.padding, 0])
        x = self.dconv(x)
        x = self.pconv(x)
        return x


class Conv2d(nn.Module):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=20,
                 rorder=0,
                 groups=1,
                 bias=False,
                 skip_connect=True):
        super(Conv2d, self).__init__()
        self.lorder = lorder
        self.conv = nn.Conv2d(
            input_dim, output_dim, [lorder, 1], groups=groups, bias=bias)
        self.rorder = rorder
        if self.rorder:
            self.conv2 = nn.Conv2d(
                input_dim, output_dim, [rorder, 1], groups=groups, bias=bias)
        self.skip_connect = skip_connect

    def forward(self, input):
        # [B, 1, T, F]
        x = th.unsqueeze(input, 1)
        # [B, F, T, 1]
        x_per = x.permute(0, 3, 2, 1)
        y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
        out = self.conv(y)
        if self.rorder:
            yr = F.pad(x_per, [0, 0, 0, self.rorder])
            yr = yr[:, :, 1:, :]
            out += self.conv2(yr)
        out = out.permute(0, 3, 2, 1).squeeze(1)
        if self.skip_connect:
            out = out + input
        return out


class SelfAttLayer(nn.Module):

    def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None):
        super(SelfAttLayer, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        if lorder is None:
            return
        self.lorder = lorder
        self.hidden_size = hidden_size
        self.linear = nn.Linear(input_dim, hidden_size)
        self.project = nn.Linear(hidden_size, output_dim, bias=False)
        self.att = nn.Linear(input_dim, lorder, bias=False)

    def forward(self, input):
        f1 = F.relu(self.linear(input))
        p1 = self.project(f1)
        x = th.unsqueeze(p1, 1)
        x_per = x.permute(0, 3, 2, 1)
        y = F.pad(x_per, [0, 0, self.lorder - 1, 0])
        # z: [B, F, T, lorder]
        z = x_per
        for i in range(1, self.lorder):
            z = th.cat([z, y[:, :, self.lorder - 1 - i:-i, :]], axis=-1)
        # [B, T, lorder]
        att = F.softmax(self.att(input), dim=-1)
        att = th.unsqueeze(att, 1)
        z = th.sum(z * att, axis=-1)
        out1 = z.permute(0, 2, 1)
        return input + out1


class TFFsmn(nn.Module):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 hidden_size=None,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 skip_connect=True):
        super(TFFsmn, self).__init__()
        self.skip_connect = skip_connect
        self.linear = nn.Linear(input_dim, hidden_size)
        self.norm = nn.Identity()
        if layer_norm:
            # normalize the hidden activations (applied after self.linear)
            self.norm = nn.LayerNorm(hidden_size)
        self.act = nn.ReLU()
        self.project = nn.Linear(hidden_size, output_dim, bias=False)
        self.conv1 = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1],
            dilation=[dilation, 1],
            groups=output_dim,
            bias=False)
        self.padding_left = dilation * (lorder - 1)
        dorder = 5
        self.conv2 = nn.Conv2d(1, 1, [dorder, 1], bias=False)
        self.padding_freq = dorder - 1

    def forward(self, input):
        return self.compute1(input)

    def compute1(self, input):
        ''' linear-dconv-relu(norm)-linear-dconv
        '''
        x = self.linear(input)
        # [B, 1, F, T]
        x = th.unsqueeze(x, 1).permute(0, 1, 3, 2)
        z = F.pad(x, [0, 0, self.padding_freq, 0])
        z = self.conv2(z) + x
        x = z.permute(0, 3, 2, 1).squeeze(-1)
        x = self.act(x)
        x = self.norm(x)
        x = self.project(x)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        # [B, F, T+lorder-1, 1]
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        return input + out


class CNNFsmn(nn.Module):
    ''' Use a CNN to reduce the parameter count.
    '''

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 hidden_size=None,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 skip_connect=True):
        super(CNNFsmn, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.skip_connect = skip_connect
        if lorder is None:
            return
        self.lorder = lorder
        self.hidden_size = hidden_size
        self.linear = nn.Linear(input_dim, hidden_size)
        self.act = nn.ReLU()
        kernel_size = (3, 8)
        stride = (1, 4)
        self.conv = nn.Sequential(
            nn.ConstantPad2d((stride[1], 0, kernel_size[0] - 1, 0), 0),
            nn.Conv2d(1, stride[1], kernel_size=kernel_size, stride=stride))
        self.dconv = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1],
            dilation=[dilation, 1],
            groups=output_dim,
            bias=False)
        self.padding_left = dilation * (lorder - 1)

    def forward(self, input):
        return self.compute2(input)

    def compute1(self, input):
        ''' linear-relu(norm)-conv2d-relu?-dconv
        '''
        # [B, T, F]
        x = self.linear(input)
        x = self.act(x)
        x = th.unsqueeze(x, 1)
        x = self.conv(x)
        # [B, C, T, F] -> [B, 1, T, F]
        b, c, t, f = x.shape
        x = x.view([b, 1, t, -1])
        x = x.permute(0, 3, 2, 1)
        # [B, F, T+lorder-1, 1]
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.dconv(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        return input + out

    def compute2(self, input):
        ''' conv2d-relu-linear-relu?-dconv
        '''
        x = th.unsqueeze(input, 1)
        x = self.conv(x)
        x = self.act(x)
        # [B, C, T, F] -> [B, T, F]
        b, c, t, f = x.shape
        x = x.view([b, t, -1])
        x = self.linear(x)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.dconv(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        return input + out


class UniDeepFsmn(LayerBase):

    def __init__(self,
                 input_dim,
                 output_dim,
                 lorder=None,
                 hidden_size=None,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 skip_connect=True):
        super(UniDeepFsmn, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.skip_connect = skip_connect
        if lorder is None:
            return
        self.lorder = lorder
        self.hidden_size = hidden_size
        self.linear = nn.Linear(input_dim, hidden_size)
        self.norm = nn.Identity()
        if layer_norm:
            # normalize the hidden activations; DeepFsmn uses hidden_size here
            # as well
            self.norm = nn.LayerNorm(hidden_size)
        self.act = nn.ReLU()
        self.project = nn.Linear(hidden_size, output_dim, bias=False)
        self.conv1 = nn.Conv2d(
            output_dim,
            output_dim, [lorder, 1],
            dilation=[dilation, 1],
            groups=output_dim,
            bias=False)
        self.padding_left = dilation * (lorder - 1)

    def forward(self, input):
        return self.compute1(input)

    def compute1(self, input):
        ''' linear-relu(norm)-linear-dconv
        '''
        # [B, T, F]
        x = self.linear(input)
        x = self.act(x)
        x = self.norm(x)
        x = self.project(x)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        # [B, F, T+lorder-1, 1]
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        return input + out

    def compute2(self, input):
        ''' linear-dconv-linear-relu(norm)
        '''
        x = self.project(input)
        x = th.unsqueeze(x, 1).permute(0, 3, 2, 1)
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        x = self.linear(out)
        x = self.act(x)
        x = self.norm(x)
        return input + x

    def compute3(self, input):
        ''' dconv-linear-relu(norm)-linear
        '''
        x = th.unsqueeze(input, 1).permute(0, 3, 2, 1)
        y = F.pad(x, [0, 0, self.padding_left, 0])
        out = self.conv1(y)
        if self.skip_connect:
            out = out + x
        out = out.permute(0, 3, 2, 1).squeeze()
        x = self.linear(out)
        x = self.act(x)
        x = self.norm(x)
        x = self.project(x)
        return input + x

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<UniDeepFsmn> %d %d\n' \
                  % (self.output_dim, self.input_dim)
        re_str += '<LearnRateCoef> %d <HidSize> %d <LOrder> %d <LStride> %d <MaxNorm> 0\n' \
                  % (1, self.hidden_size, self.lorder, 1)
        lfiters = self.state_dict()['conv1.weight']
        x = np.flipud(lfiters.squeeze().numpy().T)
        re_str += to_kaldi_matrix(x)
        proj_weights = self.state_dict()['project.weight']
        x = proj_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        re_str += to_kaldi_matrix(x)
        return re_str

    def to_raw_nnet(self, fid):
        lfiters = self.state_dict()['conv1.weight']
        x = np.flipud(lfiters.squeeze().numpy().T)
        x.tofile(fid)
        proj_weights = self.state_dict()['project.weight']
        x = proj_weights.squeeze().numpy()
        x.tofile(fid)
        linear_weights = self.state_dict()['linear.weight']
        x = linear_weights.squeeze().numpy()
        x.tofile(fid)
        linear_bias = self.state_dict()['linear.bias']
        x = linear_bias.squeeze().numpy()
        x.tofile(fid)

    def load_kaldi_nnet(self, instr):
        output = expect_token_number(instr, '<LearnRateCoef>')
        if output is None:
            raise Exception('UniDeepFsmn format error for <LearnRateCoef>')
        instr, lr = output

        output = expect_token_number(instr, '<HidSize>')
        if output is None:
            raise Exception('UniDeepFsmn format error for <HidSize>')
        instr, hiddensize = output
        self.hidden_size = int(hiddensize)

        output = expect_token_number(instr, '<LOrder>')
        if output is None:
            raise Exception('UniDeepFsmn format error for <LOrder>')
        instr, lorder = output
        self.lorder = int(lorder)

        output = expect_token_number(instr, '<LStride>')
        if output is None:
            raise Exception('UniDeepFsmn format error for <LStride>')
        instr, lstride = output
        self.lstride = lstride

        output = expect_token_number(instr, '<MaxNorm>')
        if output is None:
            raise Exception('UniDeepFsmn format error for <MaxNorm>')
        instr, _ = output

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat1 = np.fliplr(mat.T).copy()
        self.conv1 = nn.Conv2d(
            self.output_dim,
            self.output_dim, [self.lorder, 1], [1, 1],
            groups=self.output_dim,
            bias=False)
        mat_th = th.from_numpy(mat1).type(th.FloatTensor)
        mat_th = mat_th.unsqueeze(1)
        mat_th = mat_th.unsqueeze(3)
        self.conv1.weight = th.nn.Parameter(mat_th)

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.project = nn.Linear(self.hidden_size, self.output_dim, bias=False)
        self.linear = nn.Linear(self.input_dim, self.hidden_size)
        self.project.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        self.linear.weight = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))

        output = expect_kaldi_matrix(instr)
        if output is None:
            raise Exception('UniDeepFsmn format error for parsing matrix')
        instr, mat = output
        mat = np.squeeze(mat)
        self.linear.bias = th.nn.Parameter(
            th.from_numpy(mat).type(th.FloatTensor))
        return instr
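A quick shape check for the unidirectional block with its default `compute1` path (sizes are arbitrary):

```python
import torch

layer = UniDeepFsmn(input_dim=80, output_dim=80, lorder=20, hidden_size=128)
x = torch.randn(4, 200, 80)    # [batch, frames, features]
y = layer(x)                   # residual output: same shape as the input
assert y.shape == x.shape
```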
@@ -0,0 +1,394 @@
import torch
import torch.nn.functional as F

from .modulation_loss import (GaborSTRFConv, MelScale,
                              ModulationDomainLossModule)

EPS = 1e-8


def compute_mask(mixed_spec, clean_spec, mask_type='psmiam', clip=1):
    '''
    stft: (batch, ..., 2) real or complex (batch, ...)
    y = x + n
    '''
    if torch.is_complex(mixed_spec):
        yr, yi = mixed_spec.real, mixed_spec.imag
    else:
        yr, yi = mixed_spec[..., 0], mixed_spec[..., 1]
    if torch.is_complex(clean_spec):
        xr, xi = clean_spec.real, clean_spec.imag
    else:
        xr, xi = clean_spec[..., 0], clean_spec[..., 1]

    if mask_type == 'iam':
        # ideal amplitude mask
        ymag = torch.sqrt(yr**2 + yi**2)
        xmag = torch.sqrt(xr**2 + xi**2)
        iam = xmag / (ymag + EPS)
        return torch.clamp(iam, 0, 1)
    elif mask_type == 'psm':
        # phase-sensitive mask
        ypow = yr**2 + yi**2
        psm = (xr * yr + xi * yi) / (ypow + EPS)
        return torch.clamp(psm, 0, 1)
    elif mask_type == 'psmiam':
        ypow = yr**2 + yi**2
        psm = (xr * yr + xi * yi) / (ypow + EPS)
        ymag = torch.sqrt(yr**2 + yi**2)
        xmag = torch.sqrt(xr**2 + xi**2)
        iam = xmag / (ymag + EPS)
        psmiam = psm * iam
        return torch.clamp(psmiam, 0, 1)
    elif mask_type == 'crm':
        # complex ratio mask, returned as (real, imag)
        ypow = yr**2 + yi**2
        mr = (xr * yr + xi * yi) / (ypow + EPS)
        mi = (xi * yr - xr * yi) / (ypow + EPS)
        mr = torch.clamp(mr, -clip, clip)
        mi = torch.clamp(mi, -clip, clip)
        return mr, mi
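# Example (illustrative shapes, hedged sketch): derive a PSM-IAM training
# target from mixture/clean STFTs computed elsewhere, e.g.
#   mixed_spec = torch.stft(mixed, 640, 320, return_complex=True)
#   clean_spec = torch.stft(clean, 640, 320, return_complex=True)
#   target = compute_mask(mixed_spec, clean_spec, mask_type='psmiam')
# `target` lies in [0, 1] and keeps the (batch, bins, frames) layout of the
# input spectrograms.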
def energy_vad(spec,
               thdhigh=320 * 600 * 600 * 2,
               thdlow=320 * 300 * 300 * 2,
               int16=True):
    '''
    Energy-based VAD, which should be accurate enough here.
    spec: (batch, bins, frames, 2)
    returns (batch, frames)
    '''
    energy = torch.sum(spec[..., 0]**2 + spec[..., 1]**2, dim=1)
    # 1 for speech, 0.5 for uncertain frames, 0 for silence. Cast to float
    # first: assigning 0.5 into a bool tensor would degenerate to True.
    vad = (energy > thdhigh).float()
    idx = torch.logical_and(vad == 0, energy > thdlow)
    vad[idx] = 0.5
    return vad


def modulation_loss_init(n_fft):
    gabor_strf_parameters = torch.load(
        './network/gabor_strf_parameters.pt')['state_dict']
    gabor_modulation_kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
    gabor_modulation_kernels.load_state_dict(gabor_strf_parameters)
    modulation_loss_module = ModulationDomainLossModule(
        gabor_modulation_kernels.eval())
    for param in modulation_loss_module.parameters():
        param.requires_grad = False
    stft2mel = MelScale(
        n_mels=80, sample_rate=16000, n_stft=n_fft // 2 + 1).cuda()
    return modulation_loss_module, stft2mel
def mask_loss_function(
        loss_func='psm_loss',
        loss_type='mse',  # ['mse', 'mae', 'comb']
        mask_type='psmiam',
        use_mod_loss=False,
        use_wav2vec_loss=False,
        n_fft=640,
        hop_length=320,
        EPS=1e-8,
        weight=None):
    if weight is not None:
        print(f'Use loss weight: {weight}')
    winlen = n_fft
    window = torch.hamming_window(winlen, periodic=False)

    def stft(x, return_complex=False):
        # returns [batch, bins, frames, 2]
        return torch.stft(
            x,
            n_fft,
            hop_length,
            winlen,
            window=window.to(x.device),
            center=False,
            return_complex=return_complex)

    def istft(x, slen):
        return torch.istft(
            x,
            n_fft,
            hop_length,
            winlen,
            window=window.to(x.device),
            center=False,
            length=slen)

    def mask_loss(targets, masks, nframes):
        ''' [Batch, Time, Frequency]
        '''
        with torch.no_grad():
            mask_for_loss = torch.ones_like(targets)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks = masks * mask_for_loss
        targets = targets * mask_for_loss
        if weight is None:
            alpha = 1
        else:  # for aec ST
            alpha = weight - targets
        if loss_type == 'mse':
            loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2))
        elif loss_type == 'mae':
            loss = torch.sum(alpha * torch.abs(targets - masks))
        else:  # mse(mask) and mae(mask) contribute approx 1:2
            loss = 0.5 * torch.sum(alpha * torch.pow(targets - masks, 2)
                                   + 0.1 * alpha * torch.abs(targets - masks))
        loss /= torch.sum(nframes)
        return loss

    def spectrum_loss(targets, spec, nframes):
        ''' [Batch, Time, Frequency, 2]
        '''
        with torch.no_grad():
            mask_for_loss = torch.ones_like(targets[..., 0])
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        xr = spec[..., 0] * mask_for_loss
        xi = spec[..., 1] * mask_for_loss
        yr = targets[..., 0] * mask_for_loss
        yi = targets[..., 1] * mask_for_loss
        xmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2) * mask_for_loss
        ymag = torch.sqrt(targets[..., 0]**2
                          + targets[..., 1]**2) * mask_for_loss
        loss1 = torch.sum(torch.pow(xr - yr, 2) + torch.pow(xi - yi, 2))
        loss2 = torch.sum(torch.pow(xmag - ymag, 2))
        loss = (loss1 + loss2) / torch.sum(nframes)
        return loss

    def sa_loss_dlen(mixed, clean, masks, nframes):
        yspec = stft(mixed).permute([0, 2, 1, 3]) / 32768
        xspec = stft(clean).permute([0, 2, 1, 3]) / 32768
        with torch.no_grad():
            mask_for_loss = torch.ones_like(xspec[..., 0])
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        emag = ((yspec[..., 0]**2 + yspec[..., 1]**2)**0.15) * (masks**0.3)
        xmag = (xspec[..., 0]**2 + xspec[..., 1]**2)**0.15
        emag = emag * mask_for_loss
        xmag = xmag * mask_for_loss
        loss = torch.sum(torch.pow(emag - xmag, 2)) / torch.sum(nframes)
        return loss

    def psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed)
        clean_spec = stft(clean)
        targets = compute_mask(mixed_spec, clean_spec, mask_type)
        # [B, T, F]
        targets = targets.permute(0, 2, 1)
        loss = mask_loss(targets, masks, nframes)
        if subtask is not None:
            vadtargets = energy_vad(clean_spec)
            with torch.no_grad():
                mask_for_loss = torch.ones_like(targets[:, :, 0])
                for idx, num in enumerate(nframes):
                    mask_for_loss[idx, num:] = 0
            subtask = subtask[:, :, 0] * mask_for_loss
            vadtargets = vadtargets * mask_for_loss
            loss_vad = F.binary_cross_entropy(subtask, vadtargets)
            return loss + loss_vad
        return loss

    def modulation_loss(mixed, clean, masks, nframes, subtask=None):
        # `modulation_loss_module` and `stft2mel` are assumed to be created
        # beforehand via modulation_loss_init() and visible in this scope.
        mixed_spec = stft(mixed, True)
        clean_spec = stft(clean, True)
        enhanced_mag = torch.abs(mixed_spec)
        clean_mag = torch.abs(clean_spec)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(clean_mag)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, :, num:] = 0
        clean_mag = clean_mag * mask_for_loss
        enhanced_mag = enhanced_mag * mask_for_loss * masks.permute([0, 2, 1])
        # Convert to log-mel representation: (B, T, #mel_channels)
        clean_log_mel = torch.log(
            torch.transpose(stft2mel(clean_mag**2), 2, 1) + 1e-8)
        enhanced_log_mel = torch.log(
            torch.transpose(stft2mel(enhanced_mag**2), 2, 1) + 1e-8)
        alpha = compute_mask(mixed_spec, clean_spec, mask_type)
        alpha = alpha.permute(0, 2, 1)
        loss = 0.05 * modulation_loss_module(enhanced_log_mel, clean_log_mel,
                                             alpha)
        loss2 = psm_vad_loss_dlen(mixed, clean, masks, nframes, subtask)
        # print(loss.item(), loss2.item())  # approx 1:4
        loss = loss + loss2
        return loss

    def wav2vec_loss(mixed, clean, masks, nframes, subtask=None):
        # `wav2vec_loss_module` is assumed to be provided externally; it is
        # not defined in this file.
        mixed = mixed / 32768
        clean = clean / 32768
        mixed_spec = stft(mixed)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(masks)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks_est = masks * mask_for_loss
        estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
        est_clean = istft(estimate, clean.shape[1])
        loss = wav2vec_loss_module(est_clean, clean)
        return loss

    def sisdr_loss_dlen(mixed,
                        clean,
                        masks,
                        nframes,
                        subtask=None,
                        zero_mean=True):
        mixed_spec = stft(mixed)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(masks)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks_est = masks * mask_for_loss
        estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
        est_clean = istft(estimate, clean.shape[1])
        flen = min(clean.shape[1], est_clean.shape[1])
        clean = clean[:, :flen]
        est_clean = est_clean[:, :flen]
        # follow asteroid/losses/sdr.py
        if zero_mean:
            clean = clean - torch.mean(clean, dim=1, keepdim=True)
            est_clean = est_clean - torch.mean(est_clean, dim=1, keepdim=True)
        dot = torch.sum(est_clean * clean, dim=1, keepdim=True)
        s_clean_energy = torch.sum(clean**2, dim=1, keepdim=True) + EPS
        scaled_clean = dot * clean / s_clean_energy
        e_noise = est_clean - scaled_clean
        # [batch]
        sisdr = torch.sum(
            scaled_clean**2, dim=1) / (
                torch.sum(e_noise**2, dim=1) + EPS)
        sisdr = -10 * torch.log10(sisdr + EPS)
        loss = sisdr.mean()
        return loss

    def sisdr_freq_loss_dlen(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed)
        clean_spec = stft(clean)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(masks)
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        masks_est = masks * mask_for_loss
        estimate = mixed_spec * masks_est.permute([0, 2, 1]).unsqueeze(3)
        dot_real = estimate[..., 0] * clean_spec[..., 0] + \
            estimate[..., 1] * clean_spec[..., 1]
        dot_imag = estimate[..., 0] * clean_spec[..., 1] - \
            estimate[..., 1] * clean_spec[..., 0]
        dot = torch.cat([dot_real.unsqueeze(3), dot_imag.unsqueeze(3)], dim=-1)
        s_clean_energy = clean_spec[..., 0] ** 2 + \
            clean_spec[..., 1] ** 2 + EPS
        scaled_clean = dot * clean_spec / s_clean_energy.unsqueeze(3)
        e_noise = estimate - scaled_clean
        # [batch]
        scaled_clean_energy = torch.sum(
            scaled_clean[..., 0]**2 + scaled_clean[..., 1]**2, dim=1)
        e_noise_energy = torch.sum(
            e_noise[..., 0]**2 + e_noise[..., 1]**2, dim=1)
        sisdr = torch.sum(
            scaled_clean_energy, dim=1) / (
                torch.sum(e_noise_energy, dim=1) + EPS)
        sisdr = -10 * torch.log10(sisdr + EPS)
        loss = sisdr.mean()
        return loss

    def crm_loss_dlen(mixed, clean, masks, nframes, subtask=None):
        mixed_spec = stft(mixed).permute([0, 2, 1, 3])
        clean_spec = stft(clean).permute([0, 2, 1, 3])
        mixed_spec = mixed_spec / 32768
        clean_spec = clean_spec / 32768
        tgt_mr, tgt_mi = compute_mask(mixed_spec, clean_spec, mask_type='crm')
        D = int(masks.shape[2] / 2)
        with torch.no_grad():
            mask_for_loss = torch.ones_like(clean_spec[..., 0])
            for idx, num in enumerate(nframes):
                mask_for_loss[idx, num:, :] = 0
        mr = masks[..., :D] * mask_for_loss
        mi = masks[..., D:] * mask_for_loss
        tgt_mr = tgt_mr * mask_for_loss
        tgt_mi = tgt_mi * mask_for_loss
        if weight is None:
            alpha = 1
        else:
            alpha = weight - tgt_mr
        # signal approximation
        yr = mixed_spec[..., 0]
        yi = mixed_spec[..., 1]
        loss1 = torch.sum(alpha * torch.pow((mr * yr - mi * yi) - clean_spec[..., 0], 2)) \
            + torch.sum(alpha * torch.pow((mr * yi + mi * yr) - clean_spec[..., 1], 2))
        # mask approximation
        loss2 = torch.sum(alpha * torch.pow(mr - tgt_mr, 2)) \
            + torch.sum(alpha * torch.pow(mi - tgt_mi, 2))
        loss = 0.5 * (loss1 + loss2) / torch.sum(nframes)
        return loss

    def crm_miso_loss_dlen(mixed, clean, masks, nframes):
        return crm_loss_dlen(mixed[..., 0], clean[..., 0], masks, nframes)

    def mimo_loss_dlen(mixed, clean, masks, nframes):
        chs = mixed.shape[-1]
        D = masks.shape[2] // chs
        loss = psm_vad_loss_dlen(mixed[..., 0], clean[..., 0], masks[..., :D],
                                 nframes)
        for ch in range(1, chs):
            loss1 = psm_vad_loss_dlen(mixed[..., ch], clean[..., ch],
                                      masks[..., ch * D:ch * D + D], nframes)
            loss = loss + loss1
        return loss / chs

    def spec_loss_dlen(mixed, clean, spec, nframes):
        clean_spec = stft(clean).permute([0, 2, 1, 3])
        clean_spec = clean_spec / 32768
        D = spec.shape[2] // 2
        spec_est = torch.cat([spec[..., :D, None], spec[..., D:, None]],
                             dim=-1)
        loss = spectrum_loss(clean_spec, spec_est, nframes)
        return loss

    if loss_func == 'psm_vad_loss_dlen':
        return psm_vad_loss_dlen
    elif loss_func == 'sisdr_loss_dlen':
        return sisdr_loss_dlen
    elif loss_func == 'sisdr_freq_loss_dlen':
        return sisdr_freq_loss_dlen
    elif loss_func == 'crm_loss_dlen':
        return crm_loss_dlen
    elif loss_func == 'modulation_loss':
        return modulation_loss
    elif loss_func == 'wav2vec_loss':
        return wav2vec_loss
    elif loss_func == 'mimo_loss_dlen':
        return mimo_loss_dlen
    elif loss_func == 'spec_loss_dlen':
        return spec_loss_dlen
    elif loss_func == 'sa_loss_dlen':
        return sa_loss_dlen
    else:
        print('error: unknown loss function %s' % loss_func)
        return None
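A hedged usage sketch of the loss factory. Shapes follow the `stft` helper above (`n_fft=640`, `hop_length=320`, `center=False`, so 32000 samples yield 99 frames of 321 bins); the mask tensor stands in for a network output:

```python
import torch

loss_fn = mask_loss_function(loss_func='psm_vad_loss_dlen', n_fft=640,
                             hop_length=320)
mixed = torch.randn(2, 32000) * 32768   # int16-scaled waveforms
clean = torch.randn(2, 32000) * 32768
masks = torch.rand(2, 99, 321)          # [B, T, F] mask estimate in [0, 1]
nframes = torch.tensor([99, 80])        # valid frames per utterance
loss = loss_fn(mixed, clean, masks, nframes)
```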
@@ -0,0 +1,248 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.transforms import MelScale


class ModulationDomainLossModule(torch.nn.Module):
    """Modulation-domain loss function developed in [1] for supervised speech enhancement.

    In our paper, we used the gabor-based STRF kernels as the modulation kernels and the log-mel
    spectrogram as the input spectrogram representation.
    Specific parameter details are in the paper and in the example below.

    Parameters
    ----------
    modulation_kernels: nn.Module
        Differentiable module that transforms a spectrogram representation to the modulation domain
        modulation_domain = modulation_kernels(input_tf_representation)
        Input Spectrogram representation (B, T, F) ---> |(M) modulation_kernels| ---> Modulation Domain (B, M, T', F')
    norm: boolean
        Normalizes the modulation domain representation to be 0 mean across time

    [1] T. Vuong, Y. Xia, and R. M. Stern, "A modulation-domain loss for neural-network-based real-time
        speech enhancement", accepted at ICASSP 2021, https://arxiv.org/abs/2102.07330
    """

    def __init__(self, modulation_kernels, norm=True):
        super(ModulationDomainLossModule, self).__init__()
        self.modulation_kernels = modulation_kernels
        self.mse = nn.MSELoss(reduction='none')
        self.norm = norm

    def forward(self, enhanced_spect, clean_spect, weight=None):
        """Calculate modulation-domain loss.

        Args:
            enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels).
            clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels).
        Returns:
            Tensor: Modulation-domain loss value.
        """
        clean_mod = self.modulation_kernels(clean_spect)
        enhanced_mod = self.modulation_kernels(enhanced_spect)
        if self.norm:
            mean_clean_mod = torch.mean(clean_mod, dim=2)
            mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)
            clean_mod = clean_mod - mean_clean_mod.unsqueeze(2)
            enhanced_mod = enhanced_mod - mean_enhanced_mod.unsqueeze(2)
        if weight is None:
            alpha = 1
        else:  # TF-mask weight
            alpha = 1 + torch.sum(weight, dim=-1, keepdim=True).unsqueeze(1)
        mod_mse_loss = self.mse(enhanced_mod, clean_mod) * alpha
        mod_mse_loss = torch.mean(
            torch.sum(mod_mse_loss, dim=(1, 2, 3))
            / torch.sum(clean_mod**2, dim=(1, 2, 3)))
        return mod_mse_loss


class ModulationDomainNCCLossModule(torch.nn.Module):
    """Modulation-domain loss function developed in [1] for supervised speech enhancement.

    # Based on: Speech Intelligibility Prediction Using Spectro-Temporal Modulation Analysis
    In our paper, we used the gabor-based STRF kernels as the modulation kernels and the log-mel
    spectrogram as the input spectrogram representation.
    Specific parameter details are in the paper and in the example below.

    Parameters
    ----------
    modulation_kernels: nn.Module
        Differentiable module that transforms a spectrogram representation to the modulation domain
        modulation_domain = modulation_kernels(input_tf_representation)
        Input Spectrogram representation (B, T, F) --- (M) modulation_kernels ---> Modulation Domain (B, M, T', F')

    [1]
    """

    def __init__(self, modulation_kernels):
        super(ModulationDomainNCCLossModule, self).__init__()
        self.modulation_kernels = modulation_kernels
        self.mse = nn.MSELoss(reduction='none')

    def forward(self, enhanced_spect, clean_spect):
        """Calculate modulation-domain loss.

        Args:
            enhanced_spect (Tensor): spectrogram representation of enhanced signal (B, #frames, #freq_channels).
            clean_spect (Tensor): spectrogram representation of clean ground-truth signal (B, #frames, #freq_channels).
        Returns:
            Tensor: Modulation-domain loss value.
        """
        clean_mod = self.modulation_kernels(clean_spect)
        enhanced_mod = self.modulation_kernels(enhanced_spect)
        mean_clean_mod = torch.mean(clean_mod, dim=2)
        mean_enhanced_mod = torch.mean(enhanced_mod, dim=2)
        normalized_clean = clean_mod - mean_clean_mod.unsqueeze(2)
        normalized_enhanced = enhanced_mod - mean_enhanced_mod.unsqueeze(2)
        inner_product = torch.sum(
            normalized_clean * normalized_enhanced, dim=2)
        normalized_denom = (torch.sum(
            normalized_clean * normalized_clean, dim=2))**.5 * (torch.sum(
                normalized_enhanced * normalized_enhanced, dim=2))**.5
        ncc = inner_product / normalized_denom
        mod_mse_loss = torch.mean((ncc - 1.0)**2)
        return mod_mse_loss


class GaborSTRFConv(nn.Module):
    """Gabor-STRF-based cross-correlation kernel."""

    def __init__(self,
                 supn,
                 supk,
                 nkern,
                 rates=None,
                 scales=None,
                 norm_strf=True,
                 real_only=False):
        """Instantiate a Gabor-based STRF convolution layer.

        Parameters
        ----------
        supn: int
            Time support in number of frames. Also the window length.
        supk: int
            Frequency support in number of channels. Also the window length.
        nkern: int
            Number of kernels, each with a learnable rate and scale.
        rates: list of float, None
            Initial values for temporal modulation.
        scales: list of float, None
            Initial values for spectral modulation.
        norm_strf: boolean
            Normalize STRF kernels to be unit length.
        real_only: boolean
            If True, nkern REAL gabor-STRF kernels.
            If False, nkern//2 REAL and nkern//2 IMAGINARY gabor-STRF kernels.
        """
        super(GaborSTRFConv, self).__init__()
        self.numN = supn
        self.numK = supk
        self.numKern = nkern
        self.real_only = real_only
        self.norm_strf = norm_strf
        if not real_only:
            nkern = nkern // 2
        if supk % 2 == 0:  # force odd number
            supk += 1
        self.supk = torch.arange(supk, dtype=torch.float32)
        if supn % 2 == 0:  # force odd number
            supn += 1
        self.supn = torch.arange(supn, dtype=self.supk.dtype)
        self.padding = (supn // 2, supk // 2)
        # Set up learnable parameters
        # for param in (rates, scales):
        #     assert (not param) or len(param) == nkern
        if not rates:
            rates = torch.rand(nkern) * math.pi / 2.0
        if not scales:
            scales = (torch.rand(nkern) * 2.0 - 1.0) * math.pi / 2.0
        self.rates_ = nn.Parameter(torch.Tensor(rates))
        self.scales_ = nn.Parameter(torch.Tensor(scales))

    def strfs(self):
        """Make STRFs using the current parameters."""
        if self.supn.device != self.rates_.device:  # for first run
            self.supn = self.supn.to(self.rates_.device)
            self.supk = self.supk.to(self.rates_.device)
        n0, k0 = self.padding
        nwind = .5 - .5 * \
            torch.cos(2 * math.pi * (self.supn + 1) / (len(self.supn) + 1))
        kwind = .5 - .5 * \
            torch.cos(2 * math.pi * (self.supk + 1) / (len(self.supk) + 1))
        new_wind = torch.matmul((nwind).unsqueeze(-1), (kwind).unsqueeze(0))
        n_n_0 = self.supn - n0
        k_k_0 = self.supk - k0
        n_mult = torch.matmul(
            n_n_0.unsqueeze(1),
            torch.ones((1, len(self.supk))).type(torch.FloatTensor).to(
                self.rates_.device))
        k_mult = torch.matmul(
            torch.ones((len(self.supn),
                        1)).type(torch.FloatTensor).to(self.rates_.device),
            k_k_0.unsqueeze(0))
        inside = self.rates_.unsqueeze(1).unsqueeze(
            1) * n_mult + self.scales_.unsqueeze(1).unsqueeze(1) * k_mult
        real_strf = torch.cos(inside) * new_wind.unsqueeze(0)
        if self.real_only:
            final_strf = real_strf
        else:
            imag_strf = torch.sin(inside) * new_wind.unsqueeze(0)
            final_strf = torch.cat([real_strf, imag_strf], dim=0)
        if self.norm_strf:
            final_strf = final_strf / (torch.sum(
                final_strf**2, dim=(1, 2)).unsqueeze(1).unsqueeze(2))**.5
        return final_strf

    def forward(self, sigspec):
        """Forward pass a batch of (real) spectra [Batch x Time x Frequency]."""
        if len(sigspec.shape) == 2:  # expand batch dimension if single example
            sigspec = sigspec.unsqueeze(0)
        strfs = self.strfs().unsqueeze(1).type_as(sigspec)
        out = F.conv2d(sigspec.unsqueeze(1), strfs, padding=self.padding)
        return out

    def __repr__(self):
        """Gabor filter"""
        report = """
            +++++ Gabor Filter Kernels [{}], supn[{}], supk[{}] real only [{}] norm strf [{}] +++++
        """.format(self.numKern, self.numN, self.numK, self.real_only,
                   self.norm_strf)
        return report
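A minimal sketch wiring randomly initialized kernels to the loss module (random tensors stand in for log-mel spectrograms; in training the kernels would be loaded from `gabor_strf_parameters.pt` as in `modulation_loss_init`):

```python
import torch

kernels = GaborSTRFConv(supn=30, supk=30, nkern=60)
mod_loss = ModulationDomainLossModule(kernels.eval())
clean = torch.randn(2, 100, 80)      # (B, frames, mel channels)
enhanced = torch.randn(2, 100, 80)
print(mod_loss(enhanced, clean))     # scalar modulation-domain loss
```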
@@ -0,0 +1,483 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..layers.activations import RectifiedLinear, Sigmoid
from ..layers.affine_transform import AffineTransform
from ..layers.deep_fsmn import DeepFsmn
from ..layers.uni_deep_fsmn import Conv2d, UniDeepFsmn


class MaskNet(nn.Module):

    def __init__(self,
                 indim,
                 outdim,
                 layers=9,
                 hidden_dim=128,
                 hidden_dim2=None,
                 lorder=20,
                 rorder=0,
                 dilation=1,
                 layer_norm=False,
                 dropout=0,
                 crm=False,
                 vad=False,
                 linearout=False):
        super(MaskNet, self).__init__()
        self.linear1 = AffineTransform(indim, hidden_dim)
        self.relu = RectifiedLinear(hidden_dim, hidden_dim)
        if hidden_dim2 is None:
            hidden_dim2 = hidden_dim
        if rorder == 0:
            repeats = [
                UniDeepFsmn(
                    hidden_dim,
                    hidden_dim,
                    lorder,
                    hidden_dim2,
                    dilation=dilation,
                    layer_norm=layer_norm,
                    dropout=dropout) for i in range(layers)
            ]
        else:
            repeats = [
                DeepFsmn(
                    hidden_dim,
                    hidden_dim,
                    lorder,
                    rorder,
                    hidden_dim2,
                    layer_norm=layer_norm,
                    dropout=dropout) for i in range(layers)
            ]
        self.deepfsmn = nn.Sequential(*repeats)
        self.linear2 = AffineTransform(hidden_dim, outdim)
        self.crm = crm
        if self.crm:
            self.sig = nn.Tanh()
        else:
            self.sig = Sigmoid(outdim, outdim)
        self.vad = vad
        if self.vad:
            self.linear3 = AffineTransform(hidden_dim, 1)
        self.layers = layers
        self.linearout = linearout
        if self.linearout and self.vad:
            print('Warning: combining linearout and vad is not supported')

    def forward(self, feat, ctl=None):
        x1 = self.linear1(feat)
        x2 = self.relu(x1)
        if ctl is not None:
            # Run only the first `ctl` FSMN blocks (runtime depth control).
            ctl = min(ctl, self.layers - 1)
            for i in range(ctl):
                x2 = self.deepfsmn[i](x2)
            mask = self.sig(self.linear2(x2))
            if self.vad:
                vad = torch.sigmoid(self.linear3(x2))
                return mask, vad
            else:
                return mask
        x3 = self.deepfsmn(x2)
        if self.linearout:
            return self.linear2(x3)
        mask = self.sig(self.linear2(x3))
        if self.vad:
            vad = torch.sigmoid(self.linear3(x3))
            return mask, vad
        else:
            return mask

    def to_kaldi_nnet(self):
        re_str = ''
        re_str += '<Nnet>\n'
        re_str += self.linear1.to_kaldi_nnet()
        re_str += self.relu.to_kaldi_nnet()
        for dfsmn in self.deepfsmn:
            re_str += dfsmn.to_kaldi_nnet()
        re_str += self.linear2.to_kaldi_nnet()
        re_str += self.sig.to_kaldi_nnet()
        re_str += '</Nnet>\n'
        return re_str

    def to_raw_nnet(self, fid):
        self.linear1.to_raw_nnet(fid)
        for dfsmn in self.deepfsmn:
            dfsmn.to_raw_nnet(fid)
        self.linear2.to_raw_nnet(fid)
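# Example (illustrative sizes, hedged sketch): a 9-layer unidirectional mask
# estimator over 321-bin magnitude features.
#   net = MaskNet(indim=321, outdim=321, layers=9, hidden_dim=128)
#   feat = torch.randn(2, 100, 321)        # [batch, frames, bins]
#   mask = net(feat)                       # values in (0, 1) via Sigmoid
#   kaldi_text = net.to_kaldi_nnet()       # nnet1-style serialization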
class StageNet(nn.Module): | |||
def __init__(self, | |||
indim, | |||
outdim, | |||
layers=9, | |||
layers2=6, | |||
hidden_dim=128, | |||
lorder=20, | |||
rorder=0, | |||
layer_norm=False, | |||
dropout=0, | |||
crm=False, | |||
vad=False, | |||
linearout=False): | |||
super(StageNet, self).__init__() | |||
self.stage1 = nn.ModuleList() | |||
self.stage2 = nn.ModuleList() | |||
layer = nn.Sequential(nn.Linear(indim, hidden_dim), nn.ReLU()) | |||
self.stage1.append(layer) | |||
for i in range(layers): | |||
layer = UniDeepFsmn( | |||
hidden_dim, | |||
hidden_dim, | |||
lorder, | |||
hidden_dim, | |||
layer_norm=layer_norm, | |||
dropout=dropout) | |||
self.stage1.append(layer) | |||
layer = nn.Sequential(nn.Linear(hidden_dim, 321), nn.Sigmoid()) | |||
self.stage1.append(layer) | |||
# stage2 | |||
layer = nn.Sequential(nn.Linear(321 + indim, hidden_dim), nn.ReLU()) | |||
self.stage2.append(layer) | |||
for i in range(layers2): | |||
layer = UniDeepFsmn( | |||
hidden_dim, | |||
hidden_dim, | |||
lorder, | |||
hidden_dim, | |||
layer_norm=layer_norm, | |||
dropout=dropout) | |||
self.stage2.append(layer) | |||
layer = nn.Sequential( | |||
nn.Linear(hidden_dim, outdim), | |||
nn.Sigmoid() if not crm else nn.Tanh()) | |||
self.stage2.append(layer) | |||
self.crm = crm | |||
self.vad = vad | |||
self.linearout = linearout | |||
self.window = torch.hamming_window(640, periodic=False).cuda() | |||
self.freezed = False | |||
def freeze(self): | |||
if not self.freezed: | |||
for param in self.stage1.parameters(): | |||
param.requires_grad = False | |||
self.freezed = True | |||
print('freezed stage1') | |||
def forward(self, feat, mixture, ctl=None): | |||
if ctl == 'off': | |||
x = feat | |||
for i in range(len(self.stage1)): | |||
x = self.stage1[i](x) | |||
return x | |||
else: | |||
self.freeze() | |||
x = feat | |||
for i in range(len(self.stage1)): | |||
x = self.stage1[i](x) | |||
spec = torch.stft( | |||
mixture / 32768, | |||
640, | |||
320, | |||
640, | |||
self.window, | |||
center=False, | |||
return_complex=True) | |||
spec = torch.view_as_real(spec).permute([0, 2, 1, 3]) | |||
specmag = torch.sqrt(spec[..., 0]**2 + spec[..., 1]**2) | |||
est = x * specmag | |||
y = torch.cat([est, feat], dim=-1) | |||
for i in range(len(self.stage2)): | |||
y = self.stage2[i](y) | |||
return y | |||
class Unet(nn.Module): | |||
def __init__(self, | |||
indim, | |||
outdim, | |||
layers=9, | |||
dims=[256] * 4, | |||
lorder=20, | |||
rorder=0, | |||
dilation=1, | |||
layer_norm=False, | |||
dropout=0, | |||
crm=False, | |||
vad=False, | |||
linearout=False): | |||
super(Unet, self).__init__() | |||
self.linear1 = AffineTransform(indim, dims[0]) | |||
self.relu = RectifiedLinear(dims[0], dims[0]) | |||
self.encoder = nn.ModuleList() | |||
self.decoder = nn.ModuleList() | |||
for i in range(len(dims) - 1): | |||
layer = nn.Sequential( | |||
nn.Linear(dims[i], dims[i + 1]), nn.ReLU(), | |||
nn.Linear(dims[i + 1], dims[i + 1], bias=False), | |||
Conv2d( | |||
dims[i + 1], | |||
dims[i + 1], | |||
lorder, | |||
groups=dims[i + 1], | |||
skip_connect=True)) | |||
self.encoder.append(layer) | |||
for i in range(len(dims) - 1, 0, -1): | |||
layer = nn.Sequential( | |||
nn.Linear(dims[i] * 2, dims[i - 1]), nn.ReLU(), | |||
nn.Linear(dims[i - 1], dims[i - 1], bias=False), | |||
Conv2d( | |||
dims[i - 1], | |||
dims[i - 1], | |||
lorder, | |||
groups=dims[i - 1], | |||
skip_connect=True)) | |||
self.decoder.append(layer) | |||
self.tf = nn.ModuleList() | |||
for i in range(layers - 2 * (len(dims) - 1)): | |||
layer = nn.Sequential( | |||
nn.Linear(dims[-1], dims[-1]), nn.ReLU(), | |||
nn.Linear(dims[-1], dims[-1], bias=False), | |||
Conv2d( | |||
dims[-1], | |||
dims[-1], | |||
lorder, | |||
groups=dims[-1], | |||
skip_connect=True)) | |||
self.tf.append(layer) | |||
self.linear2 = AffineTransform(dims[0], outdim) | |||
self.crm = crm | |||
self.act = nn.Tanh() if self.crm else nn.Sigmoid() | |||
self.vad = False | |||
self.layers = layers | |||
self.linearout = linearout | |||
def forward(self, x, ctl=None): | |||
x = self.linear1(x) | |||
x = self.relu(x) | |||
encoder_out = [] | |||
for i in range(len(self.encoder)): | |||
x = self.encoder[i](x) | |||
encoder_out.append(x) | |||
for i in range(len(self.tf)): | |||
x = self.tf[i](x) | |||
for i in range(len(self.decoder)): | |||
x = torch.cat([x, encoder_out[-1 - i]], dim=-1) | |||
x = self.decoder[i](x) | |||
x = self.linear2(x) | |||
if self.linearout: | |||
return x | |||
return self.act(x) | |||
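A hypothetical usage sketch for the Unet above, assuming the AffineTransform, RectifiedLinear and Conv2d helpers defined earlier in this file; with layers=9 and four dim levels it runs 3 encoder blocks, 3 bottleneck blocks and 3 decoder blocks with skip connections.

```python
import torch

net = Unet(indim=321, outdim=321, layers=9, dims=[256] * 4)
feat = torch.randn(4, 100, 321)   # [batch, frames, freq bins]
mask = net(feat)                  # sigmoid mask in (0, 1), shape [4, 100, 321]
```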
class BranchNet(nn.Module): | |||
def __init__(self, | |||
indim, | |||
outdim, | |||
layers=9, | |||
hidden_dim=256, | |||
lorder=20, | |||
rorder=0, | |||
dilation=1, | |||
layer_norm=False, | |||
dropout=0, | |||
crm=False, | |||
vad=False, | |||
linearout=False): | |||
super(BranchNet, self).__init__() | |||
self.linear1 = AffineTransform(indim, hidden_dim) | |||
self.relu = RectifiedLinear(hidden_dim, hidden_dim) | |||
self.convs = nn.ModuleList() | |||
self.deepfsmn = nn.ModuleList() | |||
self.FREQ = nn.ModuleList() | |||
self.TIME = nn.ModuleList() | |||
self.br1 = nn.ModuleList() | |||
self.br2 = nn.ModuleList() | |||
for i in range(layers): | |||
''' | |||
layer = nn.Sequential( | |||
nn.Linear(hidden_dim, hidden_dim), | |||
nn.ReLU(), | |||
nn.Linear(hidden_dim, hidden_dim, bias=False), | |||
Conv2d(hidden_dim, hidden_dim, lorder, | |||
groups=hidden_dim, skip_connect=True) | |||
) | |||
self.deepfsmn.append(layer) | |||
''' | |||
layer = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU()) | |||
self.FREQ.append(layer) | |||
''' | |||
layer = nn.GRU(hidden_dim, hidden_dim, | |||
batch_first=True, | |||
bidirectional=False) | |||
self.TIME.append(layer) | |||
layer = nn.Sequential( | |||
nn.Linear(hidden_dim, hidden_dim//2, bias=False), | |||
Conv2d(hidden_dim//2, hidden_dim//2, lorder, | |||
groups=hidden_dim//2, skip_connect=True) | |||
) | |||
self.br1.append(layer) | |||
layer = nn.GRU(hidden_dim, hidden_dim//2, | |||
batch_first=True, | |||
bidirectional=False) | |||
self.br2.append(layer) | |||
''' | |||
self.linear2 = AffineTransform(hidden_dim, outdim) | |||
self.crm = crm | |||
self.act = nn.Tanh() if self.crm else nn.Sigmoid() | |||
self.vad = False | |||
self.layers = layers | |||
self.linearout = linearout | |||
def forward(self, x, ctl=None): | |||
return self.forward_branch(x) | |||
def forward_sepconv(self, x): | |||
x = torch.unsqueeze(x, 1) | |||
for i in range(len(self.convs)): | |||
x = self.convs[i](x) | |||
x = F.relu(x) | |||
B, C, H, W = x.shape | |||
x = x.permute(0, 2, 1, 3) | |||
x = torch.reshape(x, [B, H, C * W]) | |||
x = self.linear1(x) | |||
x = self.relu(x) | |||
for i in range(self.layers): | |||
x = self.deepfsmn[i](x) + x | |||
x = self.linear2(x) | |||
return self.act(x) | |||
def forward_branch(self, x): | |||
x = self.linear1(x) | |||
x = self.relu(x) | |||
for i in range(self.layers): | |||
z = self.FREQ[i](x) | |||
x = z + x | |||
x = self.linear2(x) | |||
if self.linearout: | |||
return x | |||
return self.act(x) | |||
class TACNet(nn.Module): | |||
    ''' transform-average-concatenate (TAC) for ad hoc arrays
''' | |||
def __init__(self, | |||
indim, | |||
outdim, | |||
layers=9, | |||
hidden_dim=128, | |||
lorder=20, | |||
rorder=0, | |||
crm=False, | |||
vad=False, | |||
linearout=False): | |||
super(TACNet, self).__init__() | |||
self.linear1 = AffineTransform(indim, hidden_dim) | |||
self.relu = RectifiedLinear(hidden_dim, hidden_dim) | |||
if rorder == 0: | |||
repeats = [ | |||
UniDeepFsmn(hidden_dim, hidden_dim, lorder, hidden_dim) | |||
for i in range(layers) | |||
] | |||
else: | |||
repeats = [ | |||
DeepFsmn(hidden_dim, hidden_dim, lorder, rorder, hidden_dim) | |||
for i in range(layers) | |||
] | |||
self.deepfsmn = nn.Sequential(*repeats) | |||
self.ch_transform = nn.ModuleList([]) | |||
self.ch_average = nn.ModuleList([]) | |||
self.ch_concat = nn.ModuleList([]) | |||
for i in range(layers): | |||
self.ch_transform.append( | |||
nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU())) | |||
self.ch_average.append( | |||
nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.PReLU())) | |||
self.ch_concat.append( | |||
nn.Sequential( | |||
nn.Linear(hidden_dim * 2, hidden_dim), nn.PReLU())) | |||
self.linear2 = AffineTransform(hidden_dim, outdim) | |||
self.crm = crm | |||
if self.crm: | |||
self.sig = nn.Tanh() | |||
else: | |||
self.sig = Sigmoid(outdim, outdim) | |||
self.vad = vad | |||
if self.vad: | |||
self.linear3 = AffineTransform(hidden_dim, 1) | |||
self.layers = layers | |||
self.linearout = linearout | |||
if self.linearout and self.vad: | |||
            print('Warning: linearout together with vad is not supported')
def forward(self, feat, ctl=None): | |||
        B, T, nfreq = feat.shape  # avoid shadowing torch.nn.functional (F)
# assume 4ch | |||
ch = 4 | |||
zlist = [] | |||
for c in range(ch): | |||
            z = self.linear1(feat[..., c * (nfreq // 4):(c + 1) * (nfreq // 4)])
z = self.relu(z) | |||
zlist.append(z) | |||
for i in range(self.layers): | |||
# forward | |||
for c in range(ch): | |||
zlist[c] = self.deepfsmn[i](zlist[c]) | |||
# transform | |||
olist = [] | |||
for c in range(ch): | |||
z = self.ch_transform[i](zlist[c]) | |||
olist.append(z) | |||
# average | |||
avg = 0 | |||
for c in range(ch): | |||
avg = avg + olist[c] | |||
avg = avg / ch | |||
avg = self.ch_average[i](avg) | |||
            # concatenate
for c in range(ch): | |||
tac = torch.cat([olist[c], avg], dim=-1) | |||
tac = self.ch_concat[i](tac) | |||
zlist[c] = zlist[c] + tac | |||
for c in range(ch): | |||
zlist[c] = self.sig(self.linear2(zlist[c])) | |||
mask = torch.cat(zlist, dim=-1) | |||
return mask | |||
def to_kaldi_nnet(self): | |||
pass |
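A hypothetical call, assuming the UniDeepFsmn, AffineTransform and Sigmoid helpers defined earlier in this file. forward() hard-codes 4 channels, so `indim` is the per-channel feature size and the input concatenates all channels on the last axis.

```python
import torch

net = TACNet(indim=321, outdim=321, layers=4, hidden_dim=128)
feat = torch.randn(2, 50, 4 * 321)   # [batch, frames, 4 channels x 321 bins]
masks = net(feat)                    # [2, 50, 4 * 321], one mask per channel
```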
@@ -0,0 +1 @@ | |||
from .sambert_hifi_16k import * # noqa F403 |
@@ -0,0 +1,8 @@ | |||
from .robutrans import RobuTrans | |||
def create_model(name, hparams): | |||
if name == 'robutrans': | |||
return RobuTrans(hparams) | |||
    else:
        raise ValueError('Unknown model: ' + name)
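A sketch of the factory in use; `hparams` stands for whatever configuration object RobuTrans expects (not defined in this file).

```python
model = create_model('robutrans', hparams)  # hparams: hypothetical config object
```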
@@ -0,0 +1,82 @@ | |||
"""Functions for compatibility with different TensorFlow versions.""" | |||
import tensorflow as tf | |||
def is_tf2(): | |||
"""Returns ``True`` if running TensorFlow 2.0.""" | |||
return tf.__version__.startswith('2') | |||
def tf_supports(symbol): | |||
"""Returns ``True`` if TensorFlow defines :obj:`symbol`.""" | |||
return _string_to_tf_symbol(symbol) is not None | |||
def tf_any(*symbols): | |||
"""Returns the first supported symbol.""" | |||
for symbol in symbols: | |||
module = _string_to_tf_symbol(symbol) | |||
if module is not None: | |||
return module | |||
return None | |||
def tf_compat(v2=None, v1=None): # pylint: disable=invalid-name | |||
"""Returns the compatible symbol based on the current TensorFlow version. | |||
Args: | |||
v2: The candidate v2 symbol name. | |||
v1: The candidate v1 symbol name. | |||
Returns: | |||
A TensorFlow symbol. | |||
Raises: | |||
ValueError: if no symbol can be found. | |||
""" | |||
candidates = [] | |||
if v2 is not None: | |||
candidates.append(v2) | |||
if v1 is not None: | |||
candidates.append(v1) | |||
candidates.append('compat.v1.%s' % v1) | |||
symbol = tf_any(*candidates) | |||
if symbol is None: | |||
raise ValueError('Failure to resolve the TensorFlow symbol') | |||
return symbol | |||
def name_from_variable_scope(name=''): | |||
"""Creates a name prefixed by the current variable scope.""" | |||
var_scope = tf_compat(v1='get_variable_scope')().name | |||
compat_name = '' | |||
if name: | |||
compat_name = '%s/' % name | |||
if var_scope: | |||
compat_name = '%s/%s' % (var_scope, compat_name) | |||
return compat_name | |||
def reuse(): | |||
"""Returns ``True`` if the current variable scope is marked for reuse.""" | |||
return tf_compat(v1='get_variable_scope')().reuse | |||
def _string_to_tf_symbol(symbol): | |||
modules = symbol.split('.') | |||
namespace = tf | |||
for module in modules: | |||
namespace = getattr(namespace, module, None) | |||
if namespace is None: | |||
return None | |||
return namespace | |||
# pylint: disable=invalid-name | |||
gfile_copy = tf_compat(v2='io.gfile.copy', v1='gfile.Copy') | |||
gfile_exists = tf_compat(v2='io.gfile.exists', v1='gfile.Exists') | |||
gfile_open = tf_compat(v2='io.gfile.GFile', v1='gfile.GFile') | |||
is_tensor = tf_compat(v2='is_tensor', v1='contrib.framework.is_tensor') | |||
logging = tf_compat(v1='logging') | |||
nest = tf_compat(v2='nest', v1='contrib.framework.nest') |
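For illustration (not in the original file): resolution tries the v2 name, then the v1 name, then `compat.v1.<v1 name>`, returning the first symbol the installed TensorFlow actually defines.

```python
# tf.get_variable_scope on TF1, tf.compat.v1.get_variable_scope on TF2
get_scope = tf_compat(v1='get_variable_scope')

# Hypothetical extra alias in the same style as the module-level ones above
gfile_glob = tf_compat(v2='io.gfile.glob', v1='gfile.Glob')
```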
@@ -0,0 +1,273 @@ | |||
import tensorflow as tf | |||
def build_sequence_mask(sequence_length, | |||
maximum_length=None, | |||
dtype=tf.float32): | |||
"""Builds the dot product mask. | |||
Args: | |||
sequence_length: The sequence length. | |||
maximum_length: Optional size of the returned time dimension. Otherwise | |||
it is the maximum of :obj:`sequence_length`. | |||
dtype: The type of the mask tensor. | |||
Returns: | |||
A broadcastable ``tf.Tensor`` of type :obj:`dtype` and shape | |||
``[batch_size, max_length]``. | |||
""" | |||
mask = tf.sequence_mask( | |||
sequence_length, maxlen=maximum_length, dtype=dtype) | |||
return mask | |||
def norm(inputs): | |||
"""Layer normalizes :obj:`inputs`.""" | |||
return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1) | |||
def pad_in_time(x, padding_shape): | |||
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension. | |||
    Args:
        x: [Batch, Time, Frequency]
        padding_shape: [pad_left, pad_right] zero-padding sizes along the time dimension
    Returns:
        padded x
""" | |||
depth = x.get_shape().as_list()[-1] | |||
x = tf.pad(x, [[0, 0], padding_shape, [0, 0]]) | |||
x.set_shape((None, None, depth)) | |||
return x | |||
def pad_in_time_right(x, padding_length): | |||
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension. | |||
    Args:
        x: [Batch, Time, Frequency]
        padding_length: zero-padding size appended after the time dimension
    Returns:
        padded x
""" | |||
depth = x.get_shape().as_list()[-1] | |||
x = tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) | |||
x.set_shape((None, None, depth)) | |||
return x | |||
def feed_forward(x, ffn_dim, memory_units, mode, dropout=0.0): | |||
"""Implements the Transformer's "Feed Forward" layer. | |||
.. math:: | |||
ffn(x) = max(0, x*W_1 + b_1)*W_2 | |||
Args: | |||
x: The input. | |||
ffn_dim: The number of units of the nonlinear transformation. | |||
memory_units: the number of units of linear transformation | |||
        mode: Boolean training flag (callers pass ``is_training``).
dropout: The probability to drop units from the inner transformation. | |||
Returns: | |||
The transformed input. | |||
""" | |||
inner = tf.layers.conv1d(x, ffn_dim, 1, activation=tf.nn.relu) | |||
    inner = tf.layers.dropout(
        inner, rate=dropout, training=mode)  # `mode` is a boolean flag here
outer = tf.layers.conv1d(inner, memory_units, 1, use_bias=False) | |||
return outer | |||
def drop_and_add(inputs, outputs, mode, dropout=0.0): | |||
"""Drops units in the outputs and adds the previous values. | |||
Args: | |||
inputs: The input of the previous layer. | |||
outputs: The output of the previous layer. | |||
        mode: Boolean training flag (callers pass ``is_training``).
        dropout: The probability to drop units in :obj:`outputs`.
    Returns:
        The residual output (inputs are added when the dimensions match).
""" | |||
outputs = tf.layers.dropout(outputs, rate=dropout, training=mode) | |||
input_dim = inputs.get_shape().as_list()[-1] | |||
output_dim = outputs.get_shape().as_list()[-1] | |||
if input_dim == output_dim: | |||
outputs += inputs | |||
return outputs | |||
def MemoryBlock( | |||
inputs, | |||
filter_size, | |||
mode, | |||
mask=None, | |||
dropout=0.0, | |||
): | |||
""" | |||
Define the bidirectional memory block in FSMN | |||
    Args:
        inputs: The output of the previous layer. [Batch, Time, Frequency]
        filter_size: memory block filter size
        mode: Boolean training flag for dropout
        mask: A ``tf.Tensor`` applied to the memory block output
        dropout: dropout rate
    Returns:
        output: 3-D tensor ([Batch, Time, Frequency])
""" | |||
static_shape = inputs.get_shape().as_list() | |||
depth = static_shape[-1] | |||
inputs = tf.expand_dims(inputs, axis=1) # [Batch, 1, Time, Frequency] | |||
depthwise_filter = tf.get_variable( | |||
'depth_conv_w', | |||
shape=[1, filter_size, depth, 1], | |||
initializer=tf.glorot_uniform_initializer(), | |||
dtype=tf.float32) | |||
memory = tf.nn.depthwise_conv2d( | |||
input=inputs, | |||
filter=depthwise_filter, | |||
strides=[1, 1, 1, 1], | |||
padding='SAME', | |||
rate=[1, 1], | |||
data_format='NHWC') | |||
memory = memory + inputs | |||
output = tf.layers.dropout(memory, rate=dropout, training=mode) | |||
output = tf.reshape( | |||
output, | |||
[tf.shape(output)[0], tf.shape(output)[2], depth]) | |||
if mask is not None: | |||
output = output * tf.expand_dims(mask, -1) | |||
return output | |||
def MemoryBlockV2( | |||
inputs, | |||
filter_size, | |||
mode, | |||
shift=0, | |||
mask=None, | |||
dropout=0.0, | |||
): | |||
""" | |||
Define the bidirectional memory block in FSMN | |||
    Args:
        inputs: The output of the previous layer. [Batch, Time, Frequency]
        filter_size: memory block filter size
        mode: Boolean training flag for dropout
        shift: left padding, to control delay
        mask: A ``tf.Tensor`` applied to the memory block output
        dropout: dropout rate
    Returns:
        output: 3-D tensor ([Batch, Time, Frequency])
""" | |||
if mask is not None: | |||
inputs = inputs * tf.expand_dims(mask, -1) | |||
static_shape = inputs.get_shape().as_list() | |||
depth = static_shape[-1] | |||
# padding | |||
left_padding = int(round((filter_size - 1) / 2)) | |||
right_padding = int((filter_size - 1) / 2) | |||
if shift > 0: | |||
left_padding = left_padding + shift | |||
right_padding = right_padding - shift | |||
pad_inputs = pad_in_time(inputs, [left_padding, right_padding]) | |||
pad_inputs = tf.expand_dims( | |||
pad_inputs, axis=1) # [Batch, 1, Time, Frequency] | |||
depthwise_filter = tf.get_variable( | |||
'depth_conv_w', | |||
shape=[1, filter_size, depth, 1], | |||
initializer=tf.glorot_uniform_initializer(), | |||
dtype=tf.float32) | |||
memory = tf.nn.depthwise_conv2d( | |||
input=pad_inputs, | |||
filter=depthwise_filter, | |||
strides=[1, 1, 1, 1], | |||
padding='VALID', | |||
rate=[1, 1], | |||
data_format='NHWC') | |||
memory = tf.reshape( | |||
memory, | |||
[tf.shape(memory)[0], tf.shape(memory)[2], depth]) | |||
memory = memory + inputs | |||
output = tf.layers.dropout(memory, rate=dropout, training=mode) | |||
if mask is not None: | |||
output = output * tf.expand_dims(mask, -1) | |||
return output | |||
def UniMemoryBlock( | |||
inputs, | |||
filter_size, | |||
mode, | |||
cache=None, | |||
mask=None, | |||
dropout=0.0, | |||
): | |||
""" | |||
Define the unidirectional memory block in FSMN | |||
    Args:
        inputs: The output of the previous layer. [Batch, Time, Frequency]
        filter_size: memory block filter size
        cache: cached frames for streaming inference
        mode: Boolean training flag for dropout
        mask: A ``tf.Tensor`` applied to the memory block output
        dropout: dropout factor
    Returns:
        output: 3-D tensor ([Batch, Time, Frequency])
""" | |||
if cache is not None: | |||
static_shape = cache['queries'].get_shape().as_list() | |||
depth = static_shape[-1] | |||
queries = tf.slice(cache['queries'], [0, 1, 0], [ | |||
tf.shape(cache['queries'])[0], | |||
tf.shape(cache['queries'])[1] - 1, depth | |||
]) | |||
queries = tf.concat([queries, inputs], axis=1) | |||
cache['queries'] = queries | |||
else: | |||
padding_length = filter_size - 1 | |||
queries = pad_in_time(inputs, [padding_length, 0]) | |||
queries = tf.expand_dims(queries, axis=1) # [Batch, 1, Time, Frequency] | |||
static_shape = queries.get_shape().as_list() | |||
depth = static_shape[-1] | |||
depthwise_filter = tf.get_variable( | |||
'depth_conv_w', | |||
shape=[1, filter_size, depth, 1], | |||
initializer=tf.glorot_uniform_initializer(), | |||
dtype=tf.float32) | |||
memory = tf.nn.depthwise_conv2d( | |||
input=queries, | |||
filter=depthwise_filter, | |||
strides=[1, 1, 1, 1], | |||
padding='VALID', | |||
rate=[1, 1], | |||
data_format='NHWC') | |||
memory = tf.reshape( | |||
memory, | |||
[tf.shape(memory)[0], tf.shape(memory)[2], depth]) | |||
memory = memory + inputs | |||
output = tf.layers.dropout(memory, rate=dropout, training=mode) | |||
if mask is not None: | |||
output = output * tf.expand_dims(mask, -1) | |||
return output |
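A streaming sketch for UniMemoryBlock, assuming TF1 graph mode: the cache keeps the last `filter_size` frames, so each call drops the oldest frame, appends the new one, and applies the causal depthwise convolution to that window.

```python
import tensorflow as tf

batch, depth, filter_size = 1, 512, 11
frame = tf.placeholder(tf.float32, [batch, 1, depth])       # one new frame per step
cache = {'queries': tf.zeros([batch, filter_size, depth])}  # warm-up state
with tf.variable_scope('uni_memory'):
    out = UniMemoryBlock(frame, filter_size, mode=False, cache=cache)
```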
@@ -0,0 +1,178 @@ | |||
import tensorflow as tf | |||
from . import fsmn | |||
class FsmnEncoder(): | |||
"""Encoder using Fsmn | |||
""" | |||
def __init__(self, | |||
filter_size, | |||
fsmn_num_layers, | |||
dnn_num_layers, | |||
num_memory_units=512, | |||
ffn_inner_dim=2048, | |||
dropout=0.0, | |||
position_encoder=None): | |||
"""Initializes the parameters of the encoder. | |||
Args: | |||
filter_size: the total order of memory block | |||
fsmn_num_layers: The number of fsmn layers. | |||
dnn_num_layers: The number of dnn layers | |||
          num_memory_units: The number of memory units.
ffn_inner_dim: The number of units of the inner linear transformation | |||
in the feed forward layer. | |||
dropout: The probability to drop units from the outputs. | |||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
apply on inputs or ``None``. | |||
""" | |||
super(FsmnEncoder, self).__init__() | |||
self.filter_size = filter_size | |||
self.fsmn_num_layers = fsmn_num_layers | |||
self.dnn_num_layers = dnn_num_layers | |||
self.num_memory_units = num_memory_units | |||
self.ffn_inner_dim = ffn_inner_dim | |||
self.dropout = dropout | |||
self.position_encoder = position_encoder | |||
def encode(self, inputs, sequence_length=None, mode=True): | |||
if self.position_encoder is not None: | |||
inputs = self.position_encoder(inputs) | |||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
mask = fsmn.build_sequence_mask( | |||
sequence_length, maximum_length=tf.shape(inputs)[1]) | |||
state = () | |||
for layer in range(self.fsmn_num_layers): | |||
with tf.variable_scope('fsmn_layer_{}'.format(layer)): | |||
with tf.variable_scope('ffn'): | |||
context = fsmn.feed_forward( | |||
inputs, | |||
self.ffn_inner_dim, | |||
self.num_memory_units, | |||
mode, | |||
dropout=self.dropout) | |||
with tf.variable_scope('memory'): | |||
memory = fsmn.MemoryBlock( | |||
context, | |||
self.filter_size, | |||
mode, | |||
mask=mask, | |||
dropout=self.dropout) | |||
memory = fsmn.drop_and_add( | |||
inputs, memory, mode, dropout=self.dropout) | |||
inputs = memory | |||
state += (tf.reduce_mean(inputs, axis=1), ) | |||
for layer in range(self.dnn_num_layers): | |||
with tf.variable_scope('dnn_layer_{}'.format(layer)): | |||
transformed = fsmn.feed_forward( | |||
inputs, | |||
self.ffn_inner_dim, | |||
self.num_memory_units, | |||
mode, | |||
dropout=self.dropout) | |||
inputs = transformed | |||
state += (tf.reduce_mean(inputs, axis=1), ) | |||
outputs = inputs | |||
return (outputs, state, sequence_length) | |||
class FsmnEncoderV2(): | |||
"""Encoder using Fsmn | |||
""" | |||
def __init__(self, | |||
filter_size, | |||
fsmn_num_layers, | |||
dnn_num_layers, | |||
num_memory_units=512, | |||
ffn_inner_dim=2048, | |||
dropout=0.0, | |||
shift=0, | |||
position_encoder=None): | |||
"""Initializes the parameters of the encoder. | |||
Args: | |||
filter_size: the total order of memory block | |||
fsmn_num_layers: The number of fsmn layers. | |||
dnn_num_layers: The number of dnn layers | |||
          num_memory_units: The number of memory units.
ffn_inner_dim: The number of units of the inner linear transformation | |||
in the feed forward layer. | |||
dropout: The probability to drop units from the outputs. | |||
shift: left padding, to control delay | |||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
apply on inputs or ``None``. | |||
""" | |||
super(FsmnEncoderV2, self).__init__() | |||
self.filter_size = filter_size | |||
self.fsmn_num_layers = fsmn_num_layers | |||
self.dnn_num_layers = dnn_num_layers | |||
self.num_memory_units = num_memory_units | |||
self.ffn_inner_dim = ffn_inner_dim | |||
self.dropout = dropout | |||
self.shift = shift | |||
if not isinstance(shift, list): | |||
self.shift = [shift for _ in range(self.fsmn_num_layers)] | |||
self.position_encoder = position_encoder | |||
def encode(self, inputs, sequence_length=None, mode=True): | |||
if self.position_encoder is not None: | |||
inputs = self.position_encoder(inputs) | |||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
mask = fsmn.build_sequence_mask( | |||
sequence_length, maximum_length=tf.shape(inputs)[1]) | |||
state = () | |||
for layer in range(self.fsmn_num_layers): | |||
with tf.variable_scope('fsmn_layer_{}'.format(layer)): | |||
with tf.variable_scope('ffn'): | |||
context = fsmn.feed_forward( | |||
inputs, | |||
self.ffn_inner_dim, | |||
self.num_memory_units, | |||
mode, | |||
dropout=self.dropout) | |||
with tf.variable_scope('memory'): | |||
memory = fsmn.MemoryBlockV2( | |||
context, | |||
self.filter_size, | |||
mode, | |||
shift=self.shift[layer], | |||
mask=mask, | |||
dropout=self.dropout) | |||
memory = fsmn.drop_and_add( | |||
inputs, memory, mode, dropout=self.dropout) | |||
inputs = memory | |||
state += (tf.reduce_mean(inputs, axis=1), ) | |||
for layer in range(self.dnn_num_layers): | |||
with tf.variable_scope('dnn_layer_{}'.format(layer)): | |||
transformed = fsmn.feed_forward( | |||
inputs, | |||
self.ffn_inner_dim, | |||
self.num_memory_units, | |||
mode, | |||
dropout=self.dropout) | |||
inputs = transformed | |||
state += (tf.reduce_mean(inputs, axis=1), ) | |||
outputs = inputs | |||
return (outputs, state, sequence_length) |
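A hypothetical graph construction with the encoder above (TF1 graph mode; `inputs`, `lengths` and `is_training` are assumed tensors/flags). `shift` trades quality for latency: a positive shift moves each memory block's receptive field that many frames toward the past.

```python
encoder = FsmnEncoderV2(filter_size=11, fsmn_num_layers=6, dnn_num_layers=1,
                        num_memory_units=512, ffn_inner_dim=2048,
                        dropout=0.1, shift=1)
outputs, state, out_lengths = encoder.encode(
    inputs, sequence_length=lengths, mode=is_training)  # inputs: [B, T, D]
```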
@@ -0,0 +1,160 @@ | |||
import numpy as np | |||
import tensorflow as tf | |||
from tensorflow.contrib.seq2seq import Helper | |||
class VarTestHelper(Helper): | |||
def __init__(self, batch_size, inputs, dim): | |||
with tf.name_scope('VarTestHelper'): | |||
self._batch_size = batch_size | |||
self._inputs = inputs | |||
self._dim = dim | |||
num_steps = tf.shape(self._inputs)[1] | |||
self._lengths = tf.tile([num_steps], [self._batch_size]) | |||
self._inputs = tf.roll(inputs, shift=-1, axis=1) | |||
self._init_inputs = inputs[:, 0, :] | |||
@property | |||
def batch_size(self): | |||
return self._batch_size | |||
@property | |||
def sample_ids_shape(self): | |||
return tf.TensorShape([]) | |||
@property | |||
def sample_ids_dtype(self): | |||
return np.int32 | |||
def initialize(self, name=None): | |||
return (tf.tile([False], [self._batch_size]), | |||
_go_frames(self._batch_size, self._dim, self._init_inputs)) | |||
def sample(self, time, outputs, state, name=None): | |||
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them | |||
def next_inputs(self, time, outputs, state, sample_ids, name=None): | |||
with tf.name_scope('VarTestHelper'): | |||
finished = (time + 1 >= self._lengths) | |||
next_inputs = tf.concat([outputs, self._inputs[:, time, :]], | |||
axis=-1) | |||
return (finished, next_inputs, state) | |||
class VarTrainingHelper(Helper): | |||
def __init__(self, targets, inputs, dim): | |||
with tf.name_scope('VarTrainingHelper'): | |||
self._targets = targets # [N, T_in, 1] | |||
self._batch_size = tf.shape(inputs)[0] # N | |||
self._inputs = inputs | |||
self._dim = dim | |||
num_steps = tf.shape(self._targets)[1] | |||
self._lengths = tf.tile([num_steps], [self._batch_size]) | |||
self._inputs = tf.roll(inputs, shift=-1, axis=1) | |||
self._init_inputs = inputs[:, 0, :] | |||
@property | |||
def batch_size(self): | |||
return self._batch_size | |||
@property | |||
def sample_ids_shape(self): | |||
return tf.TensorShape([]) | |||
@property | |||
def sample_ids_dtype(self): | |||
return np.int32 | |||
def initialize(self, name=None): | |||
return (tf.tile([False], [self._batch_size]), | |||
_go_frames(self._batch_size, self._dim, self._init_inputs)) | |||
def sample(self, time, outputs, state, name=None): | |||
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them | |||
def next_inputs(self, time, outputs, state, sample_ids, name=None): | |||
with tf.name_scope(name or 'VarTrainingHelper'): | |||
finished = (time + 1 >= self._lengths) | |||
next_inputs = tf.concat( | |||
[self._targets[:, time, :], self._inputs[:, time, :]], axis=-1) | |||
return (finished, next_inputs, state) | |||
class VarTrainingSSHelper(Helper): | |||
def __init__(self, targets, inputs, dim, global_step, schedule_begin, | |||
alpha, decay_steps): | |||
with tf.name_scope('VarTrainingSSHelper'): | |||
self._targets = targets # [N, T_in, 1] | |||
self._batch_size = tf.shape(inputs)[0] # N | |||
self._inputs = inputs | |||
self._dim = dim | |||
num_steps = tf.shape(self._targets)[1] | |||
self._lengths = tf.tile([num_steps], [self._batch_size]) | |||
self._inputs = tf.roll(inputs, shift=-1, axis=1) | |||
self._init_inputs = inputs[:, 0, :] | |||
# for schedule sampling | |||
self._global_step = global_step | |||
self._schedule_begin = schedule_begin | |||
self._alpha = alpha | |||
self._decay_steps = decay_steps | |||
@property | |||
def batch_size(self): | |||
return self._batch_size | |||
@property | |||
def sample_ids_shape(self): | |||
return tf.TensorShape([]) | |||
@property | |||
def sample_ids_dtype(self): | |||
return np.int32 | |||
def initialize(self, name=None): | |||
self._ratio = _tf_decay(self._global_step, self._schedule_begin, | |||
self._alpha, self._decay_steps) | |||
return (tf.tile([False], [self._batch_size]), | |||
_go_frames(self._batch_size, self._dim, self._init_inputs)) | |||
def sample(self, time, outputs, state, name=None): | |||
return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them | |||
def next_inputs(self, time, outputs, state, sample_ids, name=None): | |||
with tf.name_scope(name or 'VarTrainingHelper'): | |||
finished = (time + 1 >= self._lengths) | |||
next_inputs_tmp = tf.cond( | |||
tf.less( | |||
tf.random_uniform([], minval=0, maxval=1, | |||
dtype=tf.float32), self._ratio), | |||
lambda: self._targets[:, time, :], lambda: outputs) | |||
next_inputs = tf.concat( | |||
[next_inputs_tmp, self._inputs[:, time, :]], axis=-1) | |||
return (finished, next_inputs, state) | |||
def _go_frames(batch_size, dim, init_inputs): | |||
    '''Returns <GO> frames: zeros of width `dim` concatenated with the initial auxiliary inputs'''
return tf.concat([tf.tile([[0.0]], [batch_size, dim]), init_inputs], | |||
axis=-1) | |||
def _tf_decay(global_step, schedule_begin, alpha, decay_steps): | |||
tfr = tf.train.exponential_decay( | |||
1.0, | |||
global_step=global_step - schedule_begin, | |||
decay_steps=decay_steps, | |||
decay_rate=alpha, | |||
name='tfr_decay') | |||
final_tfr = tf.cond( | |||
tf.less(global_step, schedule_begin), lambda: 1.0, lambda: tfr) | |||
return final_tfr |
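The decay keeps the teacher-forcing ratio pinned at 1.0 until `schedule_begin`, then lets it decay as alpha ** ((step - schedule_begin) / decay_steps). A sketch with hypothetical settings:

```python
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
# ratio == 1.0 before step 10000; afterwards 0.95 ** ((step - 10000) / 5000)
ratio = _tf_decay(global_step, schedule_begin=10000, alpha=0.95, decay_steps=5000)
```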
@@ -0,0 +1,461 @@ | |||
import tensorflow as tf | |||
from tensorflow.contrib.cudnn_rnn import CudnnLSTM | |||
from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops | |||
from tensorflow.contrib.rnn import LSTMBlockCell | |||
def encoder_prenet(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
dense_units, | |||
is_training, | |||
mask=None, | |||
scope='encoder_prenet'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i in range(n_conv_layers): | |||
x = conv1d( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
x = tf.layers.dense( | |||
x, units=dense_units, activation=None, name='dense') | |||
return x | |||
def decoder_prenet(inputs, | |||
prenet_units, | |||
dense_units, | |||
is_training, | |||
scope='decoder_prenet'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i, units in enumerate(prenet_units): | |||
x = tf.layers.dense( | |||
x, | |||
units=units, | |||
activation=tf.nn.relu, | |||
name='dense_{}'.format(i)) | |||
x = tf.layers.dropout( | |||
x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) | |||
x = tf.layers.dense( | |||
x, units=dense_units, activation=None, name='dense') | |||
return x | |||
def encoder(inputs, | |||
input_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker, | |||
mask=None, | |||
scope='encoder'): | |||
with tf.variable_scope(scope): | |||
x = conv_and_lstm( | |||
inputs, | |||
input_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker, | |||
mask=mask) | |||
return x | |||
def prenet(inputs, prenet_units, is_training, scope='prenet'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i, units in enumerate(prenet_units): | |||
x = tf.layers.dense( | |||
x, | |||
units=units, | |||
activation=tf.nn.relu, | |||
name='dense_{}'.format(i)) | |||
x = tf.layers.dropout( | |||
x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) | |||
return x | |||
def postnet_residual_ulstm(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
output_units, | |||
is_training, | |||
scope='postnet_residual_ulstm'): | |||
with tf.variable_scope(scope): | |||
x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, | |||
lstm_units, is_training) | |||
x = conv1d( | |||
x, | |||
output_units, | |||
kernel_size, | |||
is_training, | |||
activation=None, | |||
dropout=False, | |||
scope='conv1d_{}'.format(n_conv_layers - 1)) | |||
return x | |||
def postnet_residual_lstm(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
output_units, | |||
is_training, | |||
scope='postnet_residual_lstm'): | |||
with tf.variable_scope(scope): | |||
x = conv_and_lstm(inputs, None, n_conv_layers, filters, kernel_size, | |||
lstm_units, is_training) | |||
x = conv1d( | |||
x, | |||
output_units, | |||
kernel_size, | |||
is_training, | |||
activation=None, | |||
dropout=False, | |||
scope='conv1d_{}'.format(n_conv_layers - 1)) | |||
return x | |||
def postnet_linear_ulstm(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
output_units, | |||
is_training, | |||
scope='postnet_linear'): | |||
with tf.variable_scope(scope): | |||
x = conv_and_ulstm(inputs, None, n_conv_layers, filters, kernel_size, | |||
lstm_units, is_training) | |||
x = tf.layers.dense(x, units=output_units) | |||
return x | |||
def postnet_linear_lstm(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
output_units, | |||
output_lengths, | |||
is_training, | |||
embedded_inputs_speaker2, | |||
mask=None, | |||
scope='postnet_linear'): | |||
with tf.variable_scope(scope): | |||
x = conv_and_lstm_dec( | |||
inputs, | |||
output_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker2, | |||
mask=mask) | |||
x = tf.layers.dense(x, units=output_units) | |||
return x | |||
def postnet_linear(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
output_units, | |||
output_lengths, | |||
is_training, | |||
embedded_inputs_speaker2, | |||
mask=None, | |||
scope='postnet_linear'): | |||
with tf.variable_scope(scope): | |||
x = conv_dec( | |||
inputs, | |||
output_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker2, | |||
mask=mask) | |||
return x | |||
def conv_and_lstm(inputs, | |||
sequence_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker, | |||
mask=None, | |||
scope='conv_and_lstm'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i in range(n_conv_layers): | |||
x = conv1d( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
x = tf.concat([x, embedded_inputs_speaker], axis=2) | |||
outputs, states = tf.nn.bidirectional_dynamic_rnn( | |||
LSTMBlockCell(lstm_units), | |||
LSTMBlockCell(lstm_units), | |||
x, | |||
sequence_length=sequence_lengths, | |||
dtype=tf.float32) | |||
x = tf.concat(outputs, axis=-1) | |||
return x | |||
def conv_and_lstm_dec(inputs, | |||
sequence_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker2, | |||
mask=None, | |||
scope='conv_and_lstm'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i in range(n_conv_layers): | |||
x = conv1d( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
x = tf.concat([x, embedded_inputs_speaker2], axis=2) | |||
outputs, states = tf.nn.bidirectional_dynamic_rnn( | |||
LSTMBlockCell(lstm_units), | |||
LSTMBlockCell(lstm_units), | |||
x, | |||
sequence_length=sequence_lengths, | |||
dtype=tf.float32) | |||
x = tf.concat(outputs, axis=-1) | |||
return x | |||
def conv_dec(inputs, | |||
sequence_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
embedded_inputs_speaker2, | |||
mask=None, | |||
scope='conv_and_lstm'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i in range(n_conv_layers): | |||
x = conv1d( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
x = tf.concat([x, embedded_inputs_speaker2], axis=2) | |||
return x | |||
def conv_and_ulstm(inputs, | |||
sequence_lengths, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
is_training, | |||
scope='conv_and_ulstm'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i in range(n_conv_layers): | |||
x = conv1d( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
scope='conv1d_{}'.format(i)) | |||
outputs, states = tf.nn.dynamic_rnn( | |||
LSTMBlockCell(lstm_units), | |||
x, | |||
sequence_length=sequence_lengths, | |||
dtype=tf.float32) | |||
return outputs | |||
def conv1d(inputs, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=None, | |||
dropout=False, | |||
mask=None, | |||
scope='conv1d'): | |||
with tf.variable_scope(scope): | |||
if mask is not None: | |||
inputs = inputs * tf.expand_dims(mask, -1) | |||
x = tf.layers.conv1d( | |||
inputs, filters=filters, kernel_size=kernel_size, padding='same') | |||
if mask is not None: | |||
x = x * tf.expand_dims(mask, -1) | |||
x = tf.layers.batch_normalization(x, training=is_training) | |||
if activation is not None: | |||
x = activation(x) | |||
if dropout: | |||
x = tf.layers.dropout(x, rate=0.5, training=is_training) | |||
return x | |||
def conv1d_dp(inputs, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=None, | |||
dropout=False, | |||
dropoutrate=0.5, | |||
mask=None, | |||
scope='conv1d'): | |||
with tf.variable_scope(scope): | |||
if mask is not None: | |||
inputs = inputs * tf.expand_dims(mask, -1) | |||
x = tf.layers.conv1d( | |||
inputs, filters=filters, kernel_size=kernel_size, padding='same') | |||
if mask is not None: | |||
x = x * tf.expand_dims(mask, -1) | |||
x = tf.contrib.layers.layer_norm(x) | |||
if activation is not None: | |||
x = activation(x) | |||
if dropout: | |||
x = tf.layers.dropout(x, rate=dropoutrate, training=is_training) | |||
return x | |||
def duration_predictor(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
lstm_units, | |||
input_lengths, | |||
is_training, | |||
embedded_inputs_speaker, | |||
mask=None, | |||
scope='duration_predictor'): | |||
with tf.variable_scope(scope): | |||
x = inputs | |||
for i in range(n_conv_layers): | |||
x = conv1d_dp( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
dropoutrate=0.1, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
x = tf.concat([x, embedded_inputs_speaker], axis=2) | |||
outputs, states = tf.nn.bidirectional_dynamic_rnn( | |||
LSTMBlockCell(lstm_units), | |||
LSTMBlockCell(lstm_units), | |||
x, | |||
sequence_length=input_lengths, | |||
dtype=tf.float32) | |||
x = tf.concat(outputs, axis=-1) | |||
x = tf.layers.dense(x, units=1) | |||
x = tf.nn.relu(x) | |||
return x | |||
def duration_predictor2(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
input_lengths, | |||
is_training, | |||
mask=None, | |||
scope='duration_predictor'): | |||
with tf.variable_scope(scope): | |||
x = inputs | |||
for i in range(n_conv_layers): | |||
x = conv1d_dp( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
dropoutrate=0.1, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
x = tf.layers.dense(x, units=1) | |||
x = tf.nn.relu(x) | |||
return x | |||
def conv_prenet(inputs, | |||
n_conv_layers, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
mask=None, | |||
scope='conv_prenet'): | |||
x = inputs | |||
with tf.variable_scope(scope): | |||
for i in range(n_conv_layers): | |||
x = conv1d( | |||
x, | |||
filters, | |||
kernel_size, | |||
is_training, | |||
activation=tf.nn.relu, | |||
dropout=True, | |||
mask=mask, | |||
scope='conv1d_{}'.format(i)) | |||
return x |
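A masked-convolution sketch (`x` and `input_lengths` are assumed tensors): the mask zeroes padded frames both before and after `tf.layers.conv1d`, so values in padding cannot bleed into neighboring valid frames through the kernel.

```python
mask = tf.sequence_mask(input_lengths, maxlen=tf.shape(x)[1], dtype=tf.float32)
y = conv1d(x, filters=512, kernel_size=5, is_training=True,
           activation=tf.nn.relu, dropout=True, mask=mask)  # [B, T, 512]
```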
@@ -0,0 +1,174 @@ | |||
"""Define position encoder classes.""" | |||
import abc | |||
import math | |||
import tensorflow as tf | |||
from .reducer import SumReducer | |||
class PositionEncoder(tf.keras.layers.Layer): | |||
"""Base class for position encoders.""" | |||
def __init__(self, reducer=None, **kwargs): | |||
"""Initializes the position encoder. | |||
Args: | |||
reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position | |||
encodings. Defaults to :class:`opennmt.layers.SumReducer`. | |||
**kwargs: Additional layer keyword arguments. | |||
""" | |||
super(PositionEncoder, self).__init__(**kwargs) | |||
if reducer is None: | |||
reducer = SumReducer(dtype=kwargs.get('dtype')) | |||
self.reducer = reducer | |||
def call(self, inputs, position=None): # pylint: disable=arguments-differ | |||
"""Add position encodings to :obj:`inputs`. | |||
Args: | |||
inputs: The inputs to encode. | |||
position: The single position to encode, to use when this layer is called | |||
step by step. | |||
Returns: | |||
A ``tf.Tensor`` whose shape depends on the configured ``reducer``. | |||
""" | |||
batch_size = tf.shape(inputs)[0] | |||
timesteps = tf.shape(inputs)[1] | |||
input_dim = inputs.shape[-1].value | |||
positions = tf.range(timesteps) + 1 if position is None else [position] | |||
position_encoding = self._encode([positions], input_dim) | |||
position_encoding = tf.tile(position_encoding, [batch_size, 1, 1]) | |||
return self.reducer([inputs, position_encoding]) | |||
@abc.abstractmethod | |||
def _encode(self, positions, depth): | |||
"""Creates position encodings. | |||
Args: | |||
positions: The positions to encode of shape :math:`[B, ...]`. | |||
depth: The encoding depth :math:`D`. | |||
Returns: | |||
A ``tf.Tensor`` of shape :math:`[B, ..., D]`. | |||
""" | |||
raise NotImplementedError() | |||
class PositionEmbedder(PositionEncoder): | |||
"""Encodes position with a lookup table.""" | |||
def __init__(self, maximum_position=128, reducer=None, **kwargs): | |||
"""Initializes the position encoder. | |||
Args: | |||
maximum_position: The maximum position to embed. Positions greater | |||
than this value will be set to :obj:`maximum_position`. | |||
reducer: A :class:`opennmt.layers.Reducer` to merge inputs and position | |||
encodings. Defaults to :class:`opennmt.layers.SumReducer`. | |||
**kwargs: Additional layer keyword arguments. | |||
""" | |||
super(PositionEmbedder, self).__init__(reducer=reducer, **kwargs) | |||
self.maximum_position = maximum_position | |||
self.embedding = None | |||
def build(self, input_shape): | |||
shape = [self.maximum_position + 1, input_shape[-1]] | |||
self.embedding = self.add_weight('position_embedding', shape) | |||
super(PositionEmbedder, self).build(input_shape) | |||
def _encode(self, positions, depth): | |||
positions = tf.minimum(positions, self.maximum_position) | |||
return tf.nn.embedding_lookup(self.embedding, positions) | |||
class SinusoidalPositionEncoder(PositionEncoder): | |||
"""Encodes positions with sine waves as described in | |||
https://arxiv.org/abs/1706.03762. | |||
""" | |||
def _encode(self, positions, depth): | |||
if depth % 2 != 0: | |||
raise ValueError( | |||
                'SinusoidalPositionEncoder expects the depth to be divisible '
'by 2 but got %d' % depth) | |||
batch_size = tf.shape(positions)[0] | |||
positions = tf.cast(positions, tf.float32) | |||
log_timescale_increment = math.log(10000) / (depth / 2 - 1) | |||
inv_timescales = tf.exp( | |||
tf.range(depth / 2, dtype=tf.float32) * -log_timescale_increment) | |||
inv_timescales = tf.reshape( | |||
tf.tile(inv_timescales, [batch_size]), [batch_size, depth // 2]) | |||
scaled_time = tf.expand_dims(positions, -1) * tf.expand_dims( | |||
inv_timescales, 1) | |||
encoding = tf.concat( | |||
[tf.sin(scaled_time), tf.cos(scaled_time)], axis=2) | |||
return tf.cast(encoding, self.dtype) | |||
class SinusodalPositionalEncoding(tf.keras.layers.Layer): | |||
def __init__(self, name='SinusodalPositionalEncoding'): | |||
super(SinusodalPositionalEncoding, self).__init__(name=name) | |||
@staticmethod | |||
def positional_encoding(len, dim, step=1.): | |||
""" | |||
:param len: int scalar | |||
:param dim: int scalar | |||
        :param step: scaling applied to the position indices (default 1.)
:return: position embedding | |||
""" | |||
pos_mat = tf.tile( | |||
tf.expand_dims( | |||
tf.range(0, tf.cast(len, dtype=tf.float32), dtype=tf.float32) | |||
* step, | |||
axis=-1), [1, dim]) | |||
dim_mat = tf.tile( | |||
tf.expand_dims( | |||
tf.range(0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), | |||
axis=0), [len, 1]) | |||
dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) | |||
pos_encoding = tf.where( # [time, dims] | |||
tf.math.equal(tf.math.mod(dim_mat_int, 2), 0), | |||
x=tf.math.sin( | |||
pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), | |||
y=tf.math.cos(pos_mat | |||
/ tf.pow(10000., | |||
(dim_mat - 1) / tf.cast(dim, tf.float32)))) | |||
return pos_encoding | |||
class BatchSinusodalPositionalEncoding(tf.keras.layers.Layer): | |||
def __init__(self, name='BatchSinusodalPositionalEncoding'): | |||
super(BatchSinusodalPositionalEncoding, self).__init__(name=name) | |||
@staticmethod | |||
def positional_encoding(batch_size, len, dim, pos_mat, step=1.): | |||
""" | |||
:param len: int scalar | |||
:param dim: int scalar | |||
        :param step: scaling applied to the position values (default 1.)
        :param pos_mat: [B, len] position values to encode
:return: position embedding | |||
""" | |||
pos_mat = tf.tile( | |||
tf.expand_dims(tf.cast(pos_mat, dtype=tf.float32) * step, axis=-1), | |||
[1, 1, dim]) # [B, len, dim] | |||
dim_mat = tf.tile( | |||
tf.expand_dims( | |||
tf.expand_dims( | |||
tf.range( | |||
0, tf.cast(dim, dtype=tf.float32), dtype=tf.float32), | |||
axis=0), | |||
axis=0), [batch_size, len, 1]) # [B, len, dim] | |||
dim_mat_int = tf.cast(dim_mat, dtype=tf.int32) | |||
pos_encoding = tf.where( # [B, time, dims] | |||
tf.math.equal(tf.mod(dim_mat_int, 2), 0), | |||
x=tf.math.sin( | |||
pos_mat / tf.pow(10000., dim_mat / tf.cast(dim, tf.float32))), | |||
y=tf.math.cos(pos_mat | |||
/ tf.pow(10000., | |||
(dim_mat - 1) / tf.cast(dim, tf.float32)))) | |||
return pos_encoding |
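A usage sketch, assuming TF1 graph mode and an even depth (required by the sin/cos split):

```python
# Static [time, dim] table from the layer's helper
table = SinusodalPositionalEncoding.positional_encoding(100, 256)  # [100, 256]

# Add encodings to [B, T, D] inputs via the default SumReducer
encoder = SinusoidalPositionEncoder()
encoded = encoder(inputs)  # inputs: assumed [B, T, D] tensor with D even
```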
@@ -0,0 +1,155 @@ | |||
"""Define reducers: objects that merge inputs.""" | |||
import abc | |||
import functools | |||
import tensorflow as tf | |||
def pad_in_time(x, padding_length): | |||
"""Helper function to pad a tensor in the time dimension and retain the static depth dimension.""" | |||
return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]]) | |||
def align_in_time(x, length): | |||
"""Aligns the time dimension of :obj:`x` with :obj:`length`.""" | |||
time_dim = tf.shape(x)[1] | |||
return tf.cond( | |||
tf.less(time_dim, length), | |||
true_fn=lambda: pad_in_time(x, length - time_dim), | |||
false_fn=lambda: x[:, :length]) | |||
def pad_with_identity(x, | |||
sequence_length, | |||
max_sequence_length, | |||
identity_values=0, | |||
maxlen=None): | |||
"""Pads a tensor with identity values up to :obj:`max_sequence_length`. | |||
Args: | |||
x: A ``tf.Tensor`` of shape ``[batch_size, time, depth]``. | |||
sequence_length: The true sequence length of :obj:`x`. | |||
        max_sequence_length: The sequence length up to which the tensor must contain
            :obj:`identity_values`.
        identity_values: The identity value.
        maxlen: Size of the output time dimension. Default is the maximum value in
            :obj:`max_sequence_length`.
Returns: | |||
A ``tf.Tensor`` of shape ``[batch_size, maxlen, depth]``. | |||
""" | |||
if maxlen is None: | |||
maxlen = tf.reduce_max(max_sequence_length) | |||
mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype) | |||
mask = tf.expand_dims(mask, axis=-1) | |||
mask_combined = tf.sequence_mask( | |||
max_sequence_length, maxlen=maxlen, dtype=x.dtype) | |||
mask_combined = tf.expand_dims(mask_combined, axis=-1) | |||
identity_mask = mask_combined * (1.0 - mask) | |||
x = pad_in_time(x, maxlen - tf.shape(x)[1]) | |||
x = x * mask + (identity_mask * identity_values) | |||
return x | |||
def pad_n_with_identity(inputs, sequence_lengths, identity_values=0): | |||
"""Pads each input tensors with identity values up to | |||
``max(sequence_lengths)`` for each batch. | |||
Args: | |||
inputs: A list of ``tf.Tensor``. | |||
sequence_lengths: A list of sequence length. | |||
identity_values: The identity value. | |||
Returns: | |||
A tuple ``(padded, max_sequence_length)`` which are respectively a list of | |||
``tf.Tensor`` where each tensor are padded with identity and the combined | |||
sequence length. | |||
""" | |||
max_sequence_length = tf.reduce_max(sequence_lengths, axis=0) | |||
maxlen = tf.reduce_max([tf.shape(x)[1] for x in inputs]) | |||
padded = [ | |||
pad_with_identity( | |||
x, | |||
length, | |||
max_sequence_length, | |||
identity_values=identity_values, | |||
maxlen=maxlen) for x, length in zip(inputs, sequence_lengths) | |||
] | |||
return padded, max_sequence_length | |||
class Reducer(tf.keras.layers.Layer): | |||
"""Base class for reducers.""" | |||
def zip_and_reduce(self, x, y): | |||
"""Zips the :obj:`x` with :obj:`y` structures together and reduces all | |||
elements. If the structures are nested, they will be flattened first. | |||
Args: | |||
x: The first structure. | |||
y: The second structure. | |||
Returns: | |||
The same structure as :obj:`x` and :obj:`y` where each element from | |||
        :obj:`x` is reduced with the corresponding element from :obj:`y`.
Raises: | |||
ValueError: if the two structures are not the same. | |||
""" | |||
tf.nest.assert_same_structure(x, y) | |||
x_flat = tf.nest.flatten(x) | |||
y_flat = tf.nest.flatten(y) | |||
reduced = list(map(self, zip(x_flat, y_flat))) | |||
return tf.nest.pack_sequence_as(x, reduced) | |||
def call(self, inputs, sequence_length=None): # pylint: disable=arguments-differ | |||
"""Reduces all input elements. | |||
Args: | |||
inputs: A list of ``tf.Tensor``. | |||
sequence_length: The length of each input, if reducing sequences. | |||
Returns: | |||
If :obj:`sequence_length` is set, a tuple | |||
``(reduced_input, reduced_length)``, otherwise a reduced ``tf.Tensor`` | |||
only. | |||
""" | |||
if sequence_length is None: | |||
return self.reduce(inputs) | |||
else: | |||
return self.reduce_sequence( | |||
inputs, sequence_lengths=sequence_length) | |||
@abc.abstractmethod | |||
def reduce(self, inputs): | |||
"""See :meth:`opennmt.layers.Reducer.__call__`.""" | |||
raise NotImplementedError() | |||
@abc.abstractmethod | |||
def reduce_sequence(self, inputs, sequence_lengths): | |||
"""See :meth:`opennmt.layers.Reducer.__call__`.""" | |||
raise NotImplementedError() | |||
class SumReducer(Reducer): | |||
"""A reducer that sums the inputs.""" | |||
def reduce(self, inputs): | |||
if len(inputs) == 1: | |||
return inputs[0] | |||
if len(inputs) == 2: | |||
return inputs[0] + inputs[1] | |||
return tf.add_n(inputs) | |||
def reduce_sequence(self, inputs, sequence_lengths): | |||
padded, combined_length = pad_n_with_identity( | |||
inputs, sequence_lengths, identity_values=0) | |||
return self.reduce(padded), combined_length | |||
class MultiplyReducer(Reducer): | |||
"""A reducer that multiplies the inputs.""" | |||
def reduce(self, inputs): | |||
return functools.reduce(lambda a, x: a * x, inputs) | |||
def reduce_sequence(self, inputs, sequence_lengths): | |||
padded, combined_length = pad_n_with_identity( | |||
inputs, sequence_lengths, identity_values=1) | |||
return self.reduce(padded), combined_length |
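A SumReducer sketch (tensor names are hypothetical): without lengths it is a plain elementwise sum; with lengths, each input is first padded with the sum identity (0) up to the elementwise maximum length.

```python
reducer = SumReducer()
merged = reducer([word_emb, pos_emb])      # same-shape tensors, elementwise sum
padded_sum, new_len = reducer([a, b], sequence_length=[len_a, len_b])
```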
@@ -0,0 +1,240 @@ | |||
import numpy as np | |||
import tensorflow as tf | |||
from tensorflow.contrib.rnn import RNNCell | |||
from tensorflow.contrib.seq2seq import AttentionWrapperState | |||
from tensorflow.python.ops import rnn_cell_impl | |||
from .modules import prenet | |||
class VarPredictorCell(RNNCell): | |||
    '''Prenet + RNN wrapper for variance prediction: prenet(previous output), concat with the encoder frame, wrapped cell, then a linear projection to `dim`.'''
def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | |||
super(VarPredictorCell, self).__init__() | |||
self._var_predictor_cell = var_predictor_cell | |||
self._is_training = is_training | |||
self._dim = dim | |||
self._prenet_units = prenet_units | |||
@property | |||
def state_size(self): | |||
return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
@property | |||
def output_size(self): | |||
return self._dim | |||
def zero_state(self, batch_size, dtype): | |||
return tuple([ | |||
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
dtype), | |||
self._var_predictor_cell.zero_state(batch_size, dtype) | |||
]) | |||
def call(self, inputs, state): | |||
        '''Run one step of the predictor cell.'''
super_cell_out, decoder_state = state | |||
# split | |||
prenet_input = inputs[:, 0:self._dim] | |||
encoder_output = inputs[:, self._dim:] | |||
# prenet and concat | |||
prenet_output = prenet( | |||
prenet_input, | |||
self._prenet_units, | |||
self._is_training, | |||
scope='var_prenet') | |||
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
# decoder LSTM/GRU | |||
new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
decoder_input, decoder_state) | |||
# projection | |||
new_super_cell_out = tf.layers.dense( | |||
new_super_cell_out, units=self._dim) | |||
new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
return new_super_cell_out, new_states | |||
class DurPredictorCell(RNNCell): | |||
    '''Like VarPredictorCell, but with a ReLU on the projection so predicted durations are non-negative.'''
def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | |||
super(DurPredictorCell, self).__init__() | |||
self._var_predictor_cell = var_predictor_cell | |||
self._is_training = is_training | |||
self._dim = dim | |||
self._prenet_units = prenet_units | |||
@property | |||
def state_size(self): | |||
return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
@property | |||
def output_size(self): | |||
return self._dim | |||
def zero_state(self, batch_size, dtype): | |||
return tuple([ | |||
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
dtype), | |||
self._var_predictor_cell.zero_state(batch_size, dtype) | |||
]) | |||
def call(self, inputs, state): | |||
        '''Run one step of the predictor cell.'''
super_cell_out, decoder_state = state | |||
# split | |||
prenet_input = inputs[:, 0:self._dim] | |||
encoder_output = inputs[:, self._dim:] | |||
# prenet and concat | |||
prenet_output = prenet( | |||
prenet_input, | |||
self._prenet_units, | |||
self._is_training, | |||
scope='dur_prenet') | |||
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
# decoder LSTM/GRU | |||
new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
decoder_input, decoder_state) | |||
# projection | |||
new_super_cell_out = tf.layers.dense( | |||
new_super_cell_out, units=self._dim) | |||
new_super_cell_out = tf.nn.relu(new_super_cell_out) | |||
# new_super_cell_out = tf.log(tf.cast(tf.round(tf.exp(new_super_cell_out) - 1), tf.float32) + 1) | |||
new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
return new_super_cell_out, new_states | |||
class DurPredictorCECell(RNNCell): | |||
    '''Duration prediction as classification: embeds the previous integer duration, runs the wrapped cell, and outputs a softmax over `max_dur` classes.'''
def __init__(self, var_predictor_cell, is_training, dim, prenet_units, | |||
max_dur, dur_embedding_dim): | |||
super(DurPredictorCECell, self).__init__() | |||
self._var_predictor_cell = var_predictor_cell | |||
self._is_training = is_training | |||
self._dim = dim | |||
self._prenet_units = prenet_units | |||
self._max_dur = max_dur | |||
self._dur_embedding_dim = dur_embedding_dim | |||
@property | |||
def state_size(self): | |||
return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
@property | |||
def output_size(self): | |||
return self._max_dur | |||
def zero_state(self, batch_size, dtype): | |||
return tuple([ | |||
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
dtype), | |||
self._var_predictor_cell.zero_state(batch_size, dtype) | |||
]) | |||
def call(self, inputs, state): | |||
        '''Run one step of the predictor cell.'''
super_cell_out, decoder_state = state | |||
# split | |||
prenet_input = tf.squeeze( | |||
tf.cast(inputs[:, 0:self._dim], tf.int32), axis=-1) # [N] | |||
prenet_input = tf.one_hot( | |||
prenet_input, self._max_dur, on_value=1.0, off_value=0.0, | |||
axis=-1) # [N, 120] | |||
prenet_input = tf.layers.dense( | |||
prenet_input, units=self._dur_embedding_dim) | |||
encoder_output = inputs[:, self._dim:] | |||
# prenet and concat | |||
prenet_output = prenet( | |||
prenet_input, | |||
self._prenet_units, | |||
self._is_training, | |||
scope='dur_prenet') | |||
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
# decoder LSTM/GRU | |||
new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
decoder_input, decoder_state) | |||
# projection | |||
new_super_cell_out = tf.layers.dense( | |||
new_super_cell_out, units=self._max_dur) # [N, 120] | |||
new_super_cell_out = tf.nn.softmax(new_super_cell_out) # [N, 120] | |||
new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
return new_super_cell_out, new_states | |||
class VarPredictorCell2(RNNCell): | |||
    '''Variant of VarPredictorCell that applies ReLU only to the first output dimension.'''
def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | |||
super(VarPredictorCell2, self).__init__() | |||
self._var_predictor_cell = var_predictor_cell | |||
self._is_training = is_training | |||
self._dim = dim | |||
self._prenet_units = prenet_units | |||
@property | |||
def state_size(self): | |||
return tuple([self.output_size, self._var_predictor_cell.state_size]) | |||
@property | |||
def output_size(self): | |||
return self._dim | |||
def zero_state(self, batch_size, dtype): | |||
return tuple([ | |||
rnn_cell_impl._zero_state_tensors(self.output_size, batch_size, | |||
dtype), | |||
self._var_predictor_cell.zero_state(batch_size, dtype) | |||
]) | |||
def call(self, inputs, state): | |||
        '''Run one step of the predictor cell.'''
super_cell_out, decoder_state = state | |||
# split | |||
prenet_input = inputs[:, 0:self._dim] | |||
encoder_output = inputs[:, self._dim:] | |||
# prenet and concat | |||
prenet_output = prenet( | |||
prenet_input, | |||
self._prenet_units, | |||
self._is_training, | |||
scope='var_prenet') | |||
decoder_input = tf.concat([prenet_output, encoder_output], axis=-1) | |||
# decoder LSTM/GRU | |||
new_super_cell_out, new_decoder_state = self._var_predictor_cell( | |||
decoder_input, decoder_state) | |||
# projection | |||
new_super_cell_out = tf.layers.dense( | |||
new_super_cell_out, units=self._dim) | |||
# split and relu | |||
new_super_cell_out = tf.concat([ | |||
tf.nn.relu(new_super_cell_out[:, 0:1]), new_super_cell_out[:, 1:] | |||
], axis=-1) # yapf:disable | |||
new_states = tuple([new_super_cell_out, new_decoder_state]) | |||
return new_super_cell_out, new_states |
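Hypothetical wiring for one of the wrappers above (TF1): the cell prenets the previous output, concatenates the current encoder frame, steps the wrapped RNN, and projects back to `dim` (DurPredictorCell adds a ReLU so durations stay non-negative).

```python
from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell

base_cell = MultiRNNCell([LSTMBlockCell(256) for _ in range(2)])
dur_cell = DurPredictorCell(base_cell, is_training=True, dim=1,
                            prenet_units=[256, 256])
```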
@@ -0,0 +1,760 @@ | |||
import tensorflow as tf | |||
from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell | |||
from tensorflow.contrib.seq2seq import BasicDecoder | |||
from tensorflow.python.ops.ragged.ragged_util import repeat | |||
from .fsmn_encoder import FsmnEncoderV2 | |||
from .helpers import VarTestHelper, VarTrainingHelper | |||
from .modules import conv_prenet, decoder_prenet, encoder_prenet | |||
from .position import (BatchSinusodalPositionalEncoding, | |||
SinusodalPositionalEncoding) | |||
from .rnn_wrappers import DurPredictorCell, VarPredictorCell | |||
from .self_attention_decoder import SelfAttentionDecoder | |||
from .self_attention_encoder import SelfAttentionEncoder | |||
class RobuTrans(): | |||
def __init__(self, hparams): | |||
self._hparams = hparams | |||
def initialize(self, | |||
inputs, | |||
inputs_emotion, | |||
inputs_speaker, | |||
input_lengths, | |||
output_lengths=None, | |||
mel_targets=None, | |||
durations=None, | |||
pitch_contours=None, | |||
uv_masks=None, | |||
pitch_scales=None, | |||
duration_scales=None, | |||
energy_contours=None, | |||
energy_scales=None): | |||
        '''Initializes the model for inference or training.
        Sets the "mel_outputs", "duration_outputs", "pitch_contour_outputs", and
        "energy_contour_outputs" fields; training mode is enabled when
        mel_targets is provided.
        Args:
            inputs: float32 Tensor with shape [N, T_in, D] where N is batch size, T_in is number
                of steps in the input time series, and D is the dimension of the input symbol
                features (projected to the embedding space below).
            inputs_speaker, inputs_emotion: speaker and emotion feature Tensors, each linearly
                embedded to 32 dimensions.
            input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
                lengths of each sequence in inputs.
            output_lengths: int32 Tensor with shape [N] where N is batch size and values are the
                lengths of each sequence in outputs.
            mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
                number of steps in the output time series, M is num_mels, and values are entries
                in the mel spectrogram. Only needed for training.
            durations: int32 Tensor with shape [N, T_in] giving the number of output frames per
                input symbol. Only needed for training.
            pitch_contours, energy_contours: float32 Tensors with shape [N, T_in] holding
                per-symbol contour targets. Only needed for training.
            pitch_scales, duration_scales, energy_scales: scaling factors applied to the
                predicted contours and durations at inference.
        '''
with tf.variable_scope('inference') as _: | |||
is_training = mel_targets is not None | |||
batch_size = tf.shape(inputs)[0] | |||
hp = self._hparams | |||
input_mask = None | |||
if input_lengths is not None and is_training: | |||
input_mask = tf.sequence_mask( | |||
input_lengths, tf.shape(inputs)[1], dtype=tf.float32) | |||
if input_mask is not None: | |||
inputs = inputs * tf.expand_dims(input_mask, -1) | |||
# speaker embedding | |||
embedded_inputs_speaker = tf.layers.dense( | |||
inputs_speaker, | |||
32, | |||
activation=None, | |||
use_bias=False, | |||
kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) | |||
# emotion embedding | |||
embedded_inputs_emotion = tf.layers.dense( | |||
inputs_emotion, | |||
32, | |||
activation=None, | |||
use_bias=False, | |||
kernel_initializer=tf.truncated_normal_initializer(stddev=0.5)) | |||
# symbol embedding | |||
with tf.variable_scope('Embedding'): | |||
embedded_inputs = tf.layers.dense( | |||
inputs, | |||
hp.embedding_dim, | |||
activation=None, | |||
use_bias=False, | |||
kernel_initializer=tf.truncated_normal_initializer( | |||
stddev=0.5)) | |||
# Encoder | |||
with tf.variable_scope('Encoder'): | |||
Encoder = SelfAttentionEncoder( | |||
num_layers=hp.encoder_num_layers, | |||
num_units=hp.encoder_num_units, | |||
num_heads=hp.encoder_num_heads, | |||
ffn_inner_dim=hp.encoder_ffn_inner_dim, | |||
dropout=hp.encoder_dropout, | |||
attention_dropout=hp.encoder_attention_dropout, | |||
relu_dropout=hp.encoder_relu_dropout) | |||
encoder_outputs, state_mo, sequence_length_mo, attns = Encoder.encode( | |||
embedded_inputs, | |||
sequence_length=input_lengths, | |||
mode=is_training) | |||
encoder_outputs = tf.layers.dense( | |||
encoder_outputs, | |||
hp.encoder_projection_units, | |||
activation=None, | |||
use_bias=False, | |||
kernel_initializer=tf.truncated_normal_initializer( | |||
stddev=0.5)) | |||
# pitch and energy | |||
var_inputs = tf.concat([ | |||
encoder_outputs, embedded_inputs_speaker, | |||
embedded_inputs_emotion | |||
], 2) | |||
if input_mask is not None: | |||
var_inputs = var_inputs * tf.expand_dims(input_mask, -1) | |||
with tf.variable_scope('Pitch_Predictor'): | |||
Pitch_Predictor_FSMN = FsmnEncoderV2( | |||
filter_size=hp.predictor_filter_size, | |||
fsmn_num_layers=hp.predictor_fsmn_num_layers, | |||
dnn_num_layers=hp.predictor_dnn_num_layers, | |||
num_memory_units=hp.predictor_num_memory_units, | |||
ffn_inner_dim=hp.predictor_ffn_inner_dim, | |||
dropout=hp.predictor_dropout, | |||
shift=hp.predictor_shift, | |||
position_encoder=None) | |||
pitch_contour_outputs, _, _ = Pitch_Predictor_FSMN.encode( | |||
tf.concat([ | |||
encoder_outputs, embedded_inputs_speaker, | |||
embedded_inputs_emotion | |||
], 2), | |||
sequence_length=input_lengths, | |||
mode=is_training) | |||
pitch_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( | |||
LSTMBlockCell(hp.predictor_lstm_units), | |||
LSTMBlockCell(hp.predictor_lstm_units), | |||
pitch_contour_outputs, | |||
sequence_length=input_lengths, | |||
dtype=tf.float32) | |||
pitch_contour_outputs = tf.concat( | |||
pitch_contour_outputs, axis=-1) | |||
pitch_contour_outputs = tf.layers.dense( | |||
pitch_contour_outputs, units=1) # [N, T_in, 1] | |||
pitch_contour_outputs = tf.squeeze( | |||
pitch_contour_outputs, axis=2) # [N, T_in] | |||
with tf.variable_scope('Energy_Predictor'): | |||
Energy_Predictor_FSMN = FsmnEncoderV2( | |||
filter_size=hp.predictor_filter_size, | |||
fsmn_num_layers=hp.predictor_fsmn_num_layers, | |||
dnn_num_layers=hp.predictor_dnn_num_layers, | |||
num_memory_units=hp.predictor_num_memory_units, | |||
ffn_inner_dim=hp.predictor_ffn_inner_dim, | |||
dropout=hp.predictor_dropout, | |||
shift=hp.predictor_shift, | |||
position_encoder=None) | |||
energy_contour_outputs, _, _ = Energy_Predictor_FSMN.encode( | |||
tf.concat([ | |||
encoder_outputs, embedded_inputs_speaker, | |||
embedded_inputs_emotion | |||
], 2), | |||
sequence_length=input_lengths, | |||
mode=is_training) | |||
energy_contour_outputs, _ = tf.nn.bidirectional_dynamic_rnn( | |||
LSTMBlockCell(hp.predictor_lstm_units), | |||
LSTMBlockCell(hp.predictor_lstm_units), | |||
energy_contour_outputs, | |||
sequence_length=input_lengths, | |||
dtype=tf.float32) | |||
energy_contour_outputs = tf.concat( | |||
energy_contour_outputs, axis=-1) | |||
energy_contour_outputs = tf.layers.dense( | |||
energy_contour_outputs, units=1) # [N, T_in, 1] | |||
energy_contour_outputs = tf.squeeze( | |||
energy_contour_outputs, axis=2) # [N, T_in] | |||
if is_training: | |||
pitch_embeddings = tf.expand_dims( | |||
pitch_contours, axis=2) # [N, T_in, 1] | |||
pitch_embeddings = tf.layers.conv1d( | |||
pitch_embeddings, | |||
filters=hp.encoder_projection_units, | |||
kernel_size=9, | |||
padding='same', | |||
name='pitch_embeddings') # [N, T_in, 32] | |||
energy_embeddings = tf.expand_dims( | |||
energy_contours, axis=2) # [N, T_in, 1] | |||
energy_embeddings = tf.layers.conv1d( | |||
energy_embeddings, | |||
filters=hp.encoder_projection_units, | |||
kernel_size=9, | |||
padding='same', | |||
name='energy_embeddings') # [N, T_in, 32] | |||
else: | |||
pitch_contour_outputs *= pitch_scales | |||
pitch_embeddings = tf.expand_dims( | |||
pitch_contour_outputs, axis=2) # [N, T_in, 1] | |||
pitch_embeddings = tf.layers.conv1d( | |||
pitch_embeddings, | |||
filters=hp.encoder_projection_units, | |||
kernel_size=9, | |||
padding='same', | |||
name='pitch_embeddings') # [N, T_in, 32] | |||
energy_contour_outputs *= energy_scales | |||
energy_embeddings = tf.expand_dims( | |||
energy_contour_outputs, axis=2) # [N, T_in, 1] | |||
energy_embeddings = tf.layers.conv1d( | |||
energy_embeddings, | |||
filters=hp.encoder_projection_units, | |||
kernel_size=9, | |||
padding='same', | |||
name='energy_embeddings') # [N, T_in, 32] | |||
encoder_outputs_ = encoder_outputs + pitch_embeddings + energy_embeddings | |||
# duration | |||
dur_inputs = tf.concat([ | |||
encoder_outputs_, embedded_inputs_speaker, | |||
embedded_inputs_emotion | |||
], 2) | |||
if input_mask is not None: | |||
dur_inputs = dur_inputs * tf.expand_dims(input_mask, -1) | |||
with tf.variable_scope('Duration_Predictor'): | |||
duration_predictor_cell = MultiRNNCell([ | |||
LSTMBlockCell(hp.predictor_lstm_units), | |||
LSTMBlockCell(hp.predictor_lstm_units) | |||
], state_is_tuple=True) # yapf:disable | |||
duration_output_cell = DurPredictorCell( | |||
duration_predictor_cell, is_training, 1, | |||
hp.predictor_prenet_units) | |||
duration_predictor_init_state = duration_output_cell.zero_state( | |||
batch_size=batch_size, dtype=tf.float32) | |||
if is_training: | |||
duration_helper = VarTrainingHelper( | |||
tf.expand_dims( | |||
tf.log(tf.cast(durations, tf.float32) + 1), | |||
axis=2), dur_inputs, 1) | |||
else: | |||
duration_helper = VarTestHelper(batch_size, dur_inputs, 1) | |||
( | |||
duration_outputs, _ | |||
), final_duration_predictor_state, _ = tf.contrib.seq2seq.dynamic_decode( | |||
BasicDecoder(duration_output_cell, duration_helper, | |||
duration_predictor_init_state), | |||
maximum_iterations=1000) | |||
duration_outputs = tf.squeeze( | |||
duration_outputs, axis=2) # [N, T_in] | |||
if input_mask is not None: | |||
duration_outputs = duration_outputs * input_mask | |||
duration_outputs_ = tf.exp(duration_outputs) - 1 | |||
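                # Durations are modeled in the log domain (the training helper
                # above feeds log(durations + 1), and the duration loss compares
                # in that domain), so exp(x) - 1 maps predictions back to frame
                # counts.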
# Length Regulator | |||
with tf.variable_scope('Length_Regulator'): | |||
if is_training: | |||
i = tf.constant(1) | |||
# position embedding | |||
j = tf.constant(1) | |||
dur_len = tf.shape(durations)[-1] | |||
embedded_position_i = tf.range(1, durations[0, 0] + 1) | |||
def condition_pos(j, e): | |||
return tf.less(j, dur_len) | |||
def loop_body_pos(j, embedded_position_i): | |||
embedded_position_i = tf.concat([ | |||
embedded_position_i, | |||
tf.range(1, durations[0, j] + 1) | |||
], axis=0) # yapf:disable | |||
return [j + 1, embedded_position_i] | |||
j, embedded_position_i = tf.while_loop( | |||
condition_pos, | |||
loop_body_pos, [j, embedded_position_i], | |||
shape_invariants=[ | |||
j.get_shape(), | |||
tf.TensorShape([None]) | |||
]) | |||
embedded_position = tf.reshape(embedded_position_i, | |||
(1, -1)) | |||
# others | |||
LR_outputs = repeat( | |||
encoder_outputs_[0:1, :, :], durations[0, :], axis=1) | |||
embedded_outputs_speaker = repeat( | |||
embedded_inputs_speaker[0:1, :, :], | |||
durations[0, :], | |||
axis=1) | |||
embedded_outputs_emotion = repeat( | |||
embedded_inputs_emotion[0:1, :, :], | |||
durations[0, :], | |||
axis=1) | |||
def condition(i, pos, layer, s, e): | |||
return tf.less(i, tf.shape(mel_targets)[0]) | |||
def loop_body(i, embedded_position, LR_outputs, | |||
embedded_outputs_speaker, | |||
embedded_outputs_emotion): | |||
# position embedding | |||
jj = tf.constant(1) | |||
embedded_position_i = tf.range(1, durations[i, 0] + 1) | |||
def condition_pos_i(j, e): | |||
return tf.less(j, dur_len) | |||
def loop_body_pos_i(j, embedded_position_i): | |||
embedded_position_i = tf.concat([ | |||
embedded_position_i, | |||
tf.range(1, durations[i, j] + 1) | |||
], axis=0) # yapf:disable | |||
return [j + 1, embedded_position_i] | |||
jj, embedded_position_i = tf.while_loop( | |||
condition_pos_i, | |||
loop_body_pos_i, [jj, embedded_position_i], | |||
shape_invariants=[ | |||
jj.get_shape(), | |||
tf.TensorShape([None]) | |||
]) | |||
embedded_position = tf.concat([ | |||
embedded_position, | |||
tf.reshape(embedded_position_i, (1, -1)) | |||
], 0) | |||
# others | |||
LR_outputs = tf.concat([ | |||
LR_outputs, | |||
repeat( | |||
encoder_outputs_[i:i + 1, :, :], | |||
durations[i, :], | |||
axis=1) | |||
], 0) | |||
embedded_outputs_speaker = tf.concat([ | |||
embedded_outputs_speaker, | |||
repeat( | |||
embedded_inputs_speaker[i:i + 1, :, :], | |||
durations[i, :], | |||
axis=1) | |||
], 0) | |||
embedded_outputs_emotion = tf.concat([ | |||
embedded_outputs_emotion, | |||
repeat( | |||
embedded_inputs_emotion[i:i + 1, :, :], | |||
durations[i, :], | |||
axis=1) | |||
], 0) | |||
return [ | |||
i + 1, embedded_position, LR_outputs, | |||
embedded_outputs_speaker, embedded_outputs_emotion | |||
] | |||
                (i, embedded_position, LR_outputs, embedded_outputs_speaker,
                 embedded_outputs_emotion) = tf.while_loop(
condition, | |||
loop_body, [ | |||
i, embedded_position, LR_outputs, | |||
embedded_outputs_speaker, embedded_outputs_emotion | |||
], | |||
shape_invariants=[ | |||
i.get_shape(), | |||
tf.TensorShape([None, None]), | |||
tf.TensorShape([None, None, None]), | |||
tf.TensorShape([None, None, None]), | |||
tf.TensorShape([None, None, None]) | |||
], | |||
parallel_iterations=hp.batch_size) | |||
ori_framenum = tf.shape(mel_targets)[1] | |||
else: | |||
# position | |||
j = tf.constant(1) | |||
dur_len = tf.shape(duration_outputs_)[-1] | |||
embedded_position_i = tf.range( | |||
1, | |||
tf.cast(tf.round(duration_outputs_)[0, 0], tf.int32) | |||
+ 1) | |||
def condition_pos(j, e): | |||
return tf.less(j, dur_len) | |||
def loop_body_pos(j, embedded_position_i): | |||
embedded_position_i = tf.concat([ | |||
embedded_position_i, | |||
tf.range( | |||
1, | |||
tf.cast( | |||
tf.round(duration_outputs_)[0, j], | |||
tf.int32) + 1) | |||
], axis=0) # yapf:disable | |||
return [j + 1, embedded_position_i] | |||
j, embedded_position_i = tf.while_loop( | |||
condition_pos, | |||
loop_body_pos, [j, embedded_position_i], | |||
shape_invariants=[ | |||
j.get_shape(), | |||
tf.TensorShape([None]) | |||
]) | |||
embedded_position = tf.reshape(embedded_position_i, | |||
(1, -1)) | |||
# others | |||
duration_outputs_ *= duration_scales | |||
LR_outputs = repeat( | |||
encoder_outputs_[0:1, :, :], | |||
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), | |||
axis=1) | |||
embedded_outputs_speaker = repeat( | |||
embedded_inputs_speaker[0:1, :, :], | |||
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), | |||
axis=1) | |||
embedded_outputs_emotion = repeat( | |||
embedded_inputs_emotion[0:1, :, :], | |||
tf.cast(tf.round(duration_outputs_)[0, :], tf.int32), | |||
axis=1) | |||
ori_framenum = tf.shape(LR_outputs)[1] | |||
left = hp.outputs_per_step - tf.mod( | |||
ori_framenum, hp.outputs_per_step) | |||
LR_outputs = tf.cond( | |||
tf.equal(left, | |||
hp.outputs_per_step), lambda: LR_outputs, | |||
lambda: tf.pad(LR_outputs, [[0, 0], [0, left], [0, 0]], | |||
'CONSTANT')) | |||
embedded_outputs_speaker = tf.cond( | |||
tf.equal(left, hp.outputs_per_step), | |||
lambda: embedded_outputs_speaker, lambda: tf.pad( | |||
embedded_outputs_speaker, [[0, 0], [0, left], | |||
[0, 0]], 'CONSTANT')) | |||
embedded_outputs_emotion = tf.cond( | |||
tf.equal(left, hp.outputs_per_step), | |||
lambda: embedded_outputs_emotion, lambda: tf.pad( | |||
embedded_outputs_emotion, [[0, 0], [0, left], | |||
[0, 0]], 'CONSTANT')) | |||
embedded_position = tf.cond( | |||
tf.equal(left, hp.outputs_per_step), | |||
lambda: embedded_position, | |||
lambda: tf.pad(embedded_position, [[0, 0], [0, left]], | |||
'CONSTANT')) | |||
# Pos_Embedding | |||
with tf.variable_scope('Position_Embedding'): | |||
Pos_Embedding = BatchSinusodalPositionalEncoding() | |||
position_embeddings = Pos_Embedding.positional_encoding( | |||
batch_size, | |||
tf.shape(LR_outputs)[1], hp.encoder_projection_units, | |||
embedded_position) | |||
LR_outputs += position_embeddings | |||
# multi-frame | |||
LR_outputs = tf.reshape(LR_outputs, [ | |||
batch_size, -1, | |||
hp.outputs_per_step * hp.encoder_projection_units | |||
]) | |||
embedded_outputs_speaker = tf.reshape( | |||
embedded_outputs_speaker, | |||
[batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] | |||
embedded_outputs_emotion = tf.reshape( | |||
embedded_outputs_emotion, | |||
[batch_size, -1, hp.outputs_per_step * 32])[:, :, :32] | |||
# [N, T_out, D_LR_outputs] (D_LR_outputs = hp.outputs_per_step * hp.encoder_projection_units + 64) | |||
LR_outputs = tf.concat([ | |||
LR_outputs, embedded_outputs_speaker, embedded_outputs_emotion | |||
], -1) | |||
# auto bandwidth | |||
if is_training: | |||
durations_mask = tf.cast(durations, | |||
tf.float32) * input_mask # [N, T_in] | |||
else: | |||
durations_mask = duration_outputs_ | |||
X_band_width = tf.cast( | |||
tf.round(tf.reduce_max(durations_mask) / hp.outputs_per_step), | |||
tf.int32) | |||
H_band_width = X_band_width | |||
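            # The band width is the longest symbol duration (ground truth in
            # training, predicted at inference) measured in reduced decoder
            # frames, so the decoder's banded attention masks span roughly one
            # symbol's worth of frames.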
with tf.variable_scope('Decoder'): | |||
Decoder = SelfAttentionDecoder( | |||
num_layers=hp.decoder_num_layers, | |||
num_units=hp.decoder_num_units, | |||
num_heads=hp.decoder_num_heads, | |||
ffn_inner_dim=hp.decoder_ffn_inner_dim, | |||
dropout=hp.decoder_dropout, | |||
attention_dropout=hp.decoder_attention_dropout, | |||
relu_dropout=hp.decoder_relu_dropout, | |||
prenet_units=hp.prenet_units, | |||
dense_units=hp.prenet_proj_units, | |||
num_mels=hp.num_mels, | |||
outputs_per_step=hp.outputs_per_step, | |||
X_band_width=X_band_width, | |||
H_band_width=H_band_width, | |||
position_encoder=None) | |||
if is_training: | |||
if hp.free_run: | |||
r = hp.outputs_per_step | |||
init_decoder_input = tf.expand_dims( | |||
tf.tile([[0.0]], [batch_size, hp.num_mels]), | |||
axis=1) # [N, 1, hp.num_mels] | |||
decoder_input_lengths = tf.cast( | |||
output_lengths / r, tf.int32) | |||
decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( | |||
init_decoder_input, | |||
maximum_iterations=tf.shape(LR_outputs)[1], | |||
mode=is_training, | |||
memory=LR_outputs, | |||
memory_sequence_length=decoder_input_lengths) | |||
else: | |||
r = hp.outputs_per_step | |||
                    decoder_input = mel_targets[:, r - 1::r, :]  # [N, T_out / r, hp.num_mels]
init_decoder_input = tf.expand_dims( | |||
tf.tile([[0.0]], [batch_size, hp.num_mels]), | |||
axis=1) # [N, 1, hp.num_mels] | |||
decoder_input = tf.concat( | |||
[init_decoder_input, decoder_input], | |||
axis=1) # [N, T_out / r + 1, hp.num_mels] | |||
                    decoder_input = decoder_input[:, :-1, :]  # [N, T_out / r, hp.num_mels]
decoder_input_lengths = tf.cast( | |||
output_lengths / r, tf.int32) | |||
decoder_outputs, attention_x, attention_h = Decoder.decode_from_inputs( | |||
decoder_input, | |||
decoder_input_lengths, | |||
mode=is_training, | |||
memory=LR_outputs, | |||
memory_sequence_length=decoder_input_lengths) | |||
else: | |||
init_decoder_input = tf.expand_dims( | |||
tf.tile([[0.0]], [batch_size, hp.num_mels]), | |||
axis=1) # [N, 1, hp.num_mels] | |||
decoder_outputs, attention_x, attention_h = Decoder.dynamic_decode_and_search( | |||
init_decoder_input, | |||
maximum_iterations=tf.shape(LR_outputs)[1], | |||
mode=is_training, | |||
memory=LR_outputs, | |||
memory_sequence_length=tf.expand_dims( | |||
tf.shape(LR_outputs)[1], axis=0)) | |||
if is_training: | |||
mel_outputs_ = tf.reshape(decoder_outputs, | |||
[batch_size, -1, hp.num_mels]) | |||
else: | |||
mel_outputs_ = tf.reshape( | |||
decoder_outputs, | |||
[batch_size, -1, hp.num_mels])[:, :ori_framenum, :] | |||
mel_outputs = mel_outputs_ | |||
with tf.variable_scope('Postnet'): | |||
Postnet_FSMN = FsmnEncoderV2( | |||
filter_size=hp.postnet_filter_size, | |||
fsmn_num_layers=hp.postnet_fsmn_num_layers, | |||
dnn_num_layers=hp.postnet_dnn_num_layers, | |||
num_memory_units=hp.postnet_num_memory_units, | |||
ffn_inner_dim=hp.postnet_ffn_inner_dim, | |||
dropout=hp.postnet_dropout, | |||
shift=hp.postnet_shift, | |||
position_encoder=None) | |||
if is_training: | |||
postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( | |||
mel_outputs, | |||
sequence_length=output_lengths, | |||
mode=is_training) | |||
hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( | |||
LSTMBlockCell(hp.postnet_lstm_units), | |||
postnet_fsmn_outputs, | |||
sequence_length=output_lengths, | |||
dtype=tf.float32) | |||
else: | |||
postnet_fsmn_outputs, _, _ = Postnet_FSMN.encode( | |||
mel_outputs, | |||
sequence_length=[tf.shape(mel_outputs_)[1]], | |||
mode=is_training) | |||
hidden_lstm_outputs, _ = tf.nn.dynamic_rnn( | |||
LSTMBlockCell(hp.postnet_lstm_units), | |||
postnet_fsmn_outputs, | |||
sequence_length=[tf.shape(mel_outputs_)[1]], | |||
dtype=tf.float32) | |||
mel_residual_outputs = tf.layers.dense( | |||
hidden_lstm_outputs, units=hp.num_mels) | |||
mel_outputs += mel_residual_outputs | |||
self.inputs = inputs | |||
self.inputs_speaker = inputs_speaker | |||
self.inputs_emotion = inputs_emotion | |||
self.input_lengths = input_lengths | |||
self.durations = durations | |||
self.output_lengths = output_lengths | |||
self.mel_outputs_ = mel_outputs_ | |||
self.mel_outputs = mel_outputs | |||
self.mel_targets = mel_targets | |||
self.duration_outputs = duration_outputs | |||
self.duration_outputs_ = duration_outputs_ | |||
self.duration_scales = duration_scales | |||
self.pitch_contour_outputs = pitch_contour_outputs | |||
self.pitch_contours = pitch_contours | |||
self.pitch_scales = pitch_scales | |||
self.energy_contour_outputs = energy_contour_outputs | |||
self.energy_contours = energy_contours | |||
self.energy_scales = energy_scales | |||
self.uv_masks_ = uv_masks | |||
self.embedded_inputs_emotion = embedded_inputs_emotion | |||
self.embedding_fsmn_outputs = embedded_inputs | |||
self.encoder_outputs = encoder_outputs | |||
self.encoder_outputs_ = encoder_outputs_ | |||
self.LR_outputs = LR_outputs | |||
self.postnet_fsmn_outputs = postnet_fsmn_outputs | |||
self.pitch_embeddings = pitch_embeddings | |||
self.energy_embeddings = energy_embeddings | |||
self.attns = attns | |||
self.attention_x = attention_x | |||
self.attention_h = attention_h | |||
self.X_band_width = X_band_width | |||
self.H_band_width = H_band_width | |||
def add_loss(self): | |||
'''Adds loss to the model. Sets "loss" field. initialize must have been called.''' | |||
with tf.variable_scope('loss') as _: | |||
hp = self._hparams | |||
mask = tf.sequence_mask( | |||
self.output_lengths, | |||
tf.shape(self.mel_targets)[1], | |||
dtype=tf.float32) | |||
valid_outputs = tf.reduce_sum(mask) | |||
mask_input = tf.sequence_mask( | |||
self.input_lengths, | |||
tf.shape(self.durations)[1], | |||
dtype=tf.float32) | |||
valid_inputs = tf.reduce_sum(mask_input) | |||
# mel loss | |||
if self.uv_masks_ is not None: | |||
valid_outputs_mask = tf.reduce_sum( | |||
tf.expand_dims(mask, -1) * self.uv_masks_) | |||
self.mel_loss_ = tf.reduce_sum( | |||
tf.abs(self.mel_targets - self.mel_outputs_) | |||
* tf.expand_dims(mask, -1) * self.uv_masks_) / ( | |||
valid_outputs_mask * hp.num_mels) | |||
self.mel_loss = tf.reduce_sum( | |||
tf.abs(self.mel_targets - self.mel_outputs) | |||
* tf.expand_dims(mask, -1) * self.uv_masks_) / ( | |||
valid_outputs_mask * hp.num_mels) | |||
else: | |||
self.mel_loss_ = tf.reduce_sum( | |||
tf.abs(self.mel_targets - self.mel_outputs_) | |||
* tf.expand_dims(mask, -1)) / ( | |||
valid_outputs * hp.num_mels) | |||
self.mel_loss = tf.reduce_sum( | |||
tf.abs(self.mel_targets - self.mel_outputs) | |||
* tf.expand_dims(mask, -1)) / ( | |||
valid_outputs * hp.num_mels) | |||
# duration loss | |||
self.duration_loss = tf.reduce_sum( | |||
tf.abs( | |||
tf.log(tf.cast(self.durations, tf.float32) + 1) | |||
- self.duration_outputs) * mask_input) / valid_inputs | |||
# pitch contour loss | |||
self.pitch_contour_loss = tf.reduce_sum( | |||
tf.abs(self.pitch_contours - self.pitch_contour_outputs) | |||
* mask_input) / valid_inputs | |||
# energy contour loss | |||
self.energy_contour_loss = tf.reduce_sum( | |||
tf.abs(self.energy_contours - self.energy_contour_outputs) | |||
* mask_input) / valid_inputs | |||
# final loss | |||
self.loss = self.mel_loss_ + self.mel_loss + self.duration_loss \ | |||
+ self.pitch_contour_loss + self.energy_contour_loss | |||
# guided attention loss | |||
self.guided_attention_loss = tf.constant(0.0) | |||
if hp.guided_attention: | |||
i0 = tf.constant(0) | |||
loss0 = tf.constant(0.0) | |||
            def c(i, _):
                return tf.less(i, tf.shape(self.mel_targets)[0])
def loop_body(i, loss): | |||
decoder_input_lengths = tf.cast( | |||
self.output_lengths / hp.outputs_per_step, tf.int32) | |||
input_len = decoder_input_lengths[i] | |||
output_len = decoder_input_lengths[i] | |||
input_w = tf.expand_dims( | |||
tf.range(tf.cast(input_len, dtype=tf.float32)), | |||
axis=1) / tf.cast( | |||
input_len, dtype=tf.float32) # [T_in, 1] | |||
output_w = tf.expand_dims( | |||
tf.range(tf.cast(output_len, dtype=tf.float32)), | |||
axis=0) / tf.cast( | |||
output_len, dtype=tf.float32) # [1, T_out] | |||
guided_attention_w = 1.0 - tf.exp( | |||
-(1 / hp.guided_attention_2g_squared) | |||
* tf.square(input_w - output_w)) # [T_in, T_out] | |||
guided_attention_w = tf.expand_dims( | |||
guided_attention_w, axis=0) # [1, T_in, T_out] | |||
# [hp.decoder_num_heads, T_in, T_out] | |||
guided_attention_w = tf.tile(guided_attention_w, | |||
[hp.decoder_num_heads, 1, 1]) | |||
loss_i = tf.constant(0.0) | |||
for j in range(hp.decoder_num_layers): | |||
loss_i += tf.reduce_mean( | |||
self.attention_h[j][i, :, :input_len, :output_len] | |||
* guided_attention_w) | |||
return [tf.add(i, 1), tf.add(loss, loss_i)] | |||
_, loss = tf.while_loop( | |||
c, | |||
loop_body, | |||
loop_vars=[i0, loss0], | |||
parallel_iterations=hp.batch_size) | |||
self.guided_attention_loss = loss / hp.batch_size | |||
self.loss += hp.guided_attention_loss_weight * self.guided_attention_loss | |||
def add_optimizer(self, global_step): | |||
'''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. | |||
Args: | |||
global_step: int32 scalar Tensor representing current global step in training | |||
''' | |||
with tf.variable_scope('optimizer') as _: | |||
hp = self._hparams | |||
if hp.decay_learning_rate: | |||
self.learning_rate = _learning_rate_decay( | |||
hp.initial_learning_rate, global_step) | |||
else: | |||
self.learning_rate = tf.convert_to_tensor( | |||
hp.initial_learning_rate) | |||
optimizer = tf.train.AdamOptimizer(self.learning_rate, | |||
hp.adam_beta1, hp.adam_beta2) | |||
gradients, variables = zip(*optimizer.compute_gradients(self.loss)) | |||
self.gradients = gradients | |||
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) | |||
# Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: | |||
# https://github.com/tensorflow/tensorflow/issues/1122 | |||
with tf.control_dependencies( | |||
tf.get_collection(tf.GraphKeys.UPDATE_OPS)): | |||
self.optimize = optimizer.apply_gradients( | |||
zip(clipped_gradients, variables), global_step=global_step) | |||
def _learning_rate_decay(init_lr, global_step): | |||
# Noam scheme from tensor2tensor: | |||
warmup_steps = 4000.0 | |||
step = tf.cast(global_step + 1, dtype=tf.float32) | |||
return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, | |||
step**-0.5) |
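# The schedule above is the Noam scheme from tensor2tensor: linear warmup for
# the first 4000 steps, then inverse-square-root decay; the peak value is
# exactly init_lr at step == warmup_steps. A minimal sketch of the same curve
# (illustrative only, not part of the original file):
#
#     def noam_lr(init_lr, global_step, warmup_steps=4000.0):
#         step = float(global_step + 1)
#         return init_lr * warmup_steps**0.5 * min(
#             step * warmup_steps**-1.5, step**-0.5)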
@@ -0,0 +1,817 @@ | |||
"""Define self-attention decoder.""" | |||
import sys | |||
import tensorflow as tf | |||
from . import compat, transformer | |||
from .modules import decoder_prenet | |||
from .position import SinusoidalPositionEncoder | |||
class SelfAttentionDecoder(): | |||
"""Decoder using self-attention as described in | |||
https://arxiv.org/abs/1706.03762. | |||
""" | |||
def __init__(self, | |||
num_layers, | |||
num_units=512, | |||
num_heads=8, | |||
ffn_inner_dim=2048, | |||
dropout=0.1, | |||
attention_dropout=0.1, | |||
relu_dropout=0.1, | |||
prenet_units=256, | |||
dense_units=128, | |||
num_mels=80, | |||
outputs_per_step=3, | |||
X_band_width=None, | |||
H_band_width=None, | |||
position_encoder=SinusoidalPositionEncoder(), | |||
self_attention_type='scaled_dot'): | |||
"""Initializes the parameters of the decoder. | |||
Args: | |||
num_layers: The number of layers. | |||
num_units: The number of hidden units. | |||
num_heads: The number of heads in the multi-head attention. | |||
ffn_inner_dim: The number of units of the inner linear transformation | |||
in the feed forward layer. | |||
dropout: The probability to drop units from the outputs. | |||
attention_dropout: The probability to drop units from the attention. | |||
relu_dropout: The probability to drop units from the ReLU activation in | |||
the feed forward layer. | |||
position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to | |||
apply on inputs or ``None``. | |||
self_attention_type: Type of self attention, "scaled_dot" or "average" (case | |||
insensitive). | |||
Raises: | |||
ValueError: if :obj:`self_attention_type` is invalid. | |||
""" | |||
super(SelfAttentionDecoder, self).__init__() | |||
self.num_layers = num_layers | |||
self.num_units = num_units | |||
self.num_heads = num_heads | |||
self.ffn_inner_dim = ffn_inner_dim | |||
self.dropout = dropout | |||
self.attention_dropout = attention_dropout | |||
self.relu_dropout = relu_dropout | |||
self.position_encoder = position_encoder | |||
self.self_attention_type = self_attention_type.lower() | |||
if self.self_attention_type not in ('scaled_dot', 'average'): | |||
raise ValueError('invalid attention type %s' | |||
% self.self_attention_type) | |||
if self.self_attention_type == 'average': | |||
tf.logging.warning( | |||
'Support for average attention network is experimental ' | |||
'and may change in future versions.') | |||
self.prenet_units = prenet_units | |||
self.dense_units = dense_units | |||
self.num_mels = num_mels | |||
self.outputs_per_step = outputs_per_step | |||
self.X_band_width = X_band_width | |||
self.H_band_width = H_band_width | |||
@property | |||
def output_size(self): | |||
"""Returns the decoder output size.""" | |||
return self.num_units | |||
@property | |||
def support_alignment_history(self): | |||
return True | |||
@property | |||
def support_multi_source(self): | |||
return True | |||
def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): | |||
cache = {} | |||
for layer in range(self.num_layers): | |||
proj_cache_shape = [ | |||
batch_size, self.num_heads, 0, self.num_units // self.num_heads | |||
] | |||
layer_cache = {} | |||
layer_cache['memory'] = [{ | |||
'memory_keys': | |||
tf.zeros(proj_cache_shape, dtype=dtype), | |||
'memory_values': | |||
tf.zeros(proj_cache_shape, dtype=dtype) | |||
} for _ in range(num_sources)] | |||
if self.self_attention_type == 'scaled_dot': | |||
layer_cache['self_keys'] = tf.zeros( | |||
proj_cache_shape, dtype=dtype) | |||
layer_cache['self_values'] = tf.zeros( | |||
proj_cache_shape, dtype=dtype) | |||
elif self.self_attention_type == 'average': | |||
layer_cache['prev_g'] = tf.zeros( | |||
[batch_size, 1, self.num_units], dtype=dtype) | |||
cache['layer_{}'.format(layer)] = layer_cache | |||
return cache | |||
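    # The cache starts with zero-length time axes ([N, num_heads, 0,
    # num_units // num_heads]); the attention ops in transformer.py are
    # expected to append each step's keys/values so the growing prefix is not
    # re-projected at every step.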
def _init_attn(self, dtype=tf.float32): | |||
attn = [] | |||
for layer in range(self.num_layers): | |||
attn.append(tf.TensorArray(tf.float32, size=0, dynamic_size=True)) | |||
return attn | |||
def _self_attention_stack(self, | |||
inputs, | |||
sequence_length=None, | |||
mode=True, | |||
cache=None, | |||
memory=None, | |||
memory_sequence_length=None, | |||
step=None): | |||
# [N, T_out, self.dense_units] or [N, 1, self.dense_units] | |||
prenet_outputs = decoder_prenet(inputs, self.prenet_units, | |||
self.dense_units, mode) | |||
if step is None: | |||
decoder_inputs = tf.concat( | |||
[memory, prenet_outputs], | |||
axis=-1) # [N, T_out, memory_size + self.dense_units] | |||
else: | |||
decoder_inputs = tf.concat( | |||
[memory[:, step:step + 1, :], prenet_outputs], | |||
axis=-1) # [N, 1, memory_size + self.dense_units] | |||
decoder_inputs = tf.layers.dense( | |||
decoder_inputs, units=self.dense_units) | |||
inputs = decoder_inputs | |||
inputs *= self.num_units**0.5 | |||
if self.position_encoder is not None: | |||
inputs = self.position_encoder( | |||
inputs, position=step + 1 if step is not None else None) | |||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
decoder_mask = None | |||
memory_mask = None | |||
# last_attention = None | |||
X_band_width_tmp = -1 | |||
H_band_width_tmp = -1 | |||
if self.X_band_width is not None: | |||
X_band_width_tmp = tf.cast( | |||
tf.cond( | |||
tf.less(tf.shape(memory)[1], self.X_band_width), | |||
lambda: -1, lambda: self.X_band_width), | |||
dtype=tf.int64) | |||
if self.H_band_width is not None: | |||
H_band_width_tmp = tf.cast( | |||
tf.cond( | |||
tf.less(tf.shape(memory)[1], self.H_band_width), | |||
lambda: -1, lambda: self.H_band_width), | |||
dtype=tf.int64) | |||
if self.self_attention_type == 'scaled_dot': | |||
if sequence_length is not None: | |||
decoder_mask = transformer.build_future_mask( | |||
sequence_length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(inputs)[1], | |||
band=X_band_width_tmp) # [N, 1, T_out, T_out] | |||
elif self.self_attention_type == 'average': | |||
if cache is None: | |||
if sequence_length is None: | |||
sequence_length = tf.fill([tf.shape(inputs)[0]], | |||
tf.shape(inputs)[1]) | |||
decoder_mask = transformer.cumulative_average_mask( | |||
sequence_length, | |||
maximum_length=tf.shape(inputs)[1], | |||
dtype=inputs.dtype) | |||
if memory is not None and not tf.contrib.framework.nest.is_sequence( | |||
memory): | |||
memory = (memory, ) | |||
if memory_sequence_length is not None: | |||
if not tf.contrib.framework.nest.is_sequence( | |||
memory_sequence_length): | |||
memory_sequence_length = (memory_sequence_length, ) | |||
if step is None: | |||
memory_mask = [ | |||
transformer.build_history_mask( | |||
length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(m)[1], | |||
band=H_band_width_tmp) | |||
for m, length in zip(memory, memory_sequence_length) | |||
] | |||
else: | |||
memory_mask = [ | |||
transformer.build_history_mask( | |||
length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(m)[1], | |||
band=H_band_width_tmp)[:, :, step:step + 1, :] | |||
for m, length in zip(memory, memory_sequence_length) | |||
] | |||
attns_x = [] | |||
attns_h = [] | |||
for layer in range(self.num_layers): | |||
layer_name = 'layer_{}'.format(layer) | |||
layer_cache = cache[layer_name] if cache is not None else None | |||
with tf.variable_scope(layer_name): | |||
if memory is not None: | |||
for i, (mem, mask) in enumerate(zip(memory, memory_mask)): | |||
memory_cache = None | |||
if layer_cache is not None: | |||
memory_cache = layer_cache['memory'][i] | |||
scope_name = 'multi_head_{}'.format(i) | |||
if i == 0: | |||
scope_name = 'multi_head' | |||
with tf.variable_scope(scope_name): | |||
encoded, attn_x, attn_h = transformer.multi_head_attention_PNCA( | |||
self.num_heads, | |||
transformer.norm(inputs), | |||
mem, | |||
mode, | |||
num_units=self.num_units, | |||
mask=decoder_mask, | |||
mask_h=mask, | |||
cache=layer_cache, | |||
cache_h=memory_cache, | |||
dropout=self.attention_dropout, | |||
return_attention=True, | |||
layer_name=layer_name, | |||
X_band_width=self.X_band_width) | |||
attns_x.append(attn_x) | |||
attns_h.append(attn_h) | |||
context = transformer.drop_and_add( | |||
inputs, encoded, mode, dropout=self.dropout) | |||
with tf.variable_scope('ffn'): | |||
transformed = transformer.feed_forward_ori( | |||
transformer.norm(context), | |||
self.ffn_inner_dim, | |||
mode, | |||
dropout=self.relu_dropout) | |||
transformed = transformer.drop_and_add( | |||
context, transformed, mode, dropout=self.dropout) | |||
inputs = transformed | |||
outputs = transformer.norm(inputs) | |||
outputs = tf.layers.dense( | |||
outputs, units=self.num_mels * self.outputs_per_step) | |||
return outputs, attns_x, attns_h | |||
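    # Note: each call to _self_attention_stack emits outputs_per_step mel
    # frames at once ([N, T, num_mels * outputs_per_step]) along with the
    # per-layer self-attention (attns_x) and memory-attention (attns_h)
    # weights.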
def decode_from_inputs(self, | |||
inputs, | |||
sequence_length, | |||
initial_state=None, | |||
mode=True, | |||
memory=None, | |||
memory_sequence_length=None): | |||
outputs, attention_x, attention_h = self._self_attention_stack( | |||
inputs, | |||
sequence_length=sequence_length, | |||
mode=mode, | |||
memory=memory, | |||
memory_sequence_length=memory_sequence_length) | |||
return outputs, attention_x, attention_h | |||
def step_fn(self, | |||
mode, | |||
batch_size, | |||
initial_state=None, | |||
memory=None, | |||
memory_sequence_length=None, | |||
dtype=tf.float32): | |||
if memory is None: | |||
num_sources = 0 | |||
elif tf.contrib.framework.nest.is_sequence(memory): | |||
num_sources = len(memory) | |||
else: | |||
num_sources = 1 | |||
cache = self._init_cache( | |||
batch_size, dtype=dtype, num_sources=num_sources) | |||
attention_x = self._init_attn(dtype=dtype) | |||
attention_h = self._init_attn(dtype=dtype) | |||
def _fn(step, inputs, cache): | |||
outputs, attention_x, attention_h = self._self_attention_stack( | |||
inputs, | |||
mode=mode, | |||
cache=cache, | |||
memory=memory, | |||
memory_sequence_length=memory_sequence_length, | |||
step=step) | |||
attention_x_tmp = [] | |||
for layer in range(len(attention_h)): | |||
attention_x_tmp_l = tf.zeros_like(attention_h[layer]) | |||
if self.X_band_width is not None: | |||
pred = tf.less(step, self.X_band_width + 1) | |||
attention_x_tmp_l_1 = tf.cond(pred, # yapf:disable | |||
lambda: attention_x_tmp_l[:, :, :, :step + 1] + attention_x[layer], | |||
lambda: tf.concat([ | |||
attention_x_tmp_l[:, :, :, | |||
:step - self.X_band_width], | |||
attention_x_tmp_l[:, :, :, | |||
step - self.X_band_width:step + 1] | |||
+ attention_x[layer]], | |||
axis=-1)) # yapf:disable | |||
attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] | |||
attention_x_tmp.append( | |||
tf.concat([attention_x_tmp_l_1, attention_x_tmp_l_2], | |||
axis=-1)) | |||
else: | |||
attention_x_tmp_l_1 = attention_x_tmp_l[:, :, :, :step + 1] | |||
attention_x_tmp_l_2 = attention_x_tmp_l[:, :, :, step + 1:] | |||
attention_x_tmp.append( | |||
tf.concat([ | |||
attention_x_tmp_l_1 + attention_x[layer], | |||
attention_x_tmp_l_2 | |||
], axis=-1)) # yapf:disable | |||
attention_x = attention_x_tmp | |||
return outputs, cache, attention_x, attention_h | |||
return _fn, cache, attention_x, attention_h | |||
def dynamic_decode_and_search(self, init_decoder_input, maximum_iterations, | |||
mode, memory, memory_sequence_length): | |||
batch_size = tf.shape(init_decoder_input)[0] | |||
step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( | |||
mode, | |||
batch_size, | |||
memory=memory, | |||
memory_sequence_length=memory_sequence_length) | |||
outputs, attention_x, attention_h, cache = self.dynamic_decode( | |||
step_fn, | |||
init_decoder_input, | |||
init_cache=init_cache, | |||
init_attn_x=init_attn_x, | |||
init_attn_h=init_attn_h, | |||
maximum_iterations=maximum_iterations, | |||
batch_size=batch_size) | |||
return outputs, attention_x, attention_h | |||
def dynamic_decode_and_search_teacher_forcing(self, decoder_input, | |||
maximum_iterations, mode, | |||
memory, | |||
memory_sequence_length): | |||
batch_size = tf.shape(decoder_input)[0] | |||
step_fn, init_cache, init_attn_x, init_attn_h = self.step_fn( | |||
mode, | |||
batch_size, | |||
memory=memory, | |||
memory_sequence_length=memory_sequence_length) | |||
outputs, attention_x, attention_h, cache = self.dynamic_decode_teacher_forcing( | |||
step_fn, | |||
decoder_input, | |||
init_cache=init_cache, | |||
init_attn_x=init_attn_x, | |||
init_attn_h=init_attn_h, | |||
maximum_iterations=maximum_iterations, | |||
batch_size=batch_size) | |||
return outputs, attention_x, attention_h | |||
def dynamic_decode(self, | |||
step_fn, | |||
init_decoder_input, | |||
init_cache=None, | |||
init_attn_x=None, | |||
init_attn_h=None, | |||
maximum_iterations=None, | |||
batch_size=None): | |||
def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument | |||
return tf.less(step, maximum_iterations) | |||
def _body(step, cache, inputs, outputs, attention_x, attention_h): | |||
# output: [1, 1, num_mels * r] | |||
# attn: [1, 1, T_out] | |||
output, cache, attn_x, attn_h = step_fn( | |||
step, inputs, cache) # outputs, cache, attention, attns | |||
for layer in range(len(attention_x)): | |||
attention_x[layer] = attention_x[layer].write( | |||
step, tf.cast(attn_x[layer], tf.float32)) | |||
for layer in range(len(attention_h)): | |||
attention_h[layer] = attention_h[layer].write( | |||
step, tf.cast(attn_h[layer], tf.float32)) | |||
outputs = outputs.write(step, tf.cast(output, tf.float32)) | |||
            return (step + 1, cache, output[:, :, -self.num_mels:], outputs,
                    attention_x, attention_h)
step = tf.constant(0, dtype=tf.int32) | |||
outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) | |||
_, cache, _, outputs, attention_x, attention_h = tf.while_loop( | |||
_cond, | |||
_body, | |||
loop_vars=(step, init_cache, init_decoder_input, outputs, | |||
init_attn_x, init_attn_h), | |||
shape_invariants=(step.shape, | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, init_cache), | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, | |||
init_decoder_input), tf.TensorShape(None), | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, init_attn_x), | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, init_attn_h)), | |||
parallel_iterations=1, | |||
back_prop=False, | |||
maximum_iterations=maximum_iterations) | |||
# element of outputs: [N, 1, num_mels * r] | |||
outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] | |||
outputs_stack = tf.transpose( | |||
outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] | |||
outputs_stack = tf.squeeze( | |||
outputs_stack, axis=0) # [N, T_out, num_mels * r] | |||
attention_x_stack = [] | |||
for layer in range(len(attention_x)): | |||
attention_x_stack_tmp = attention_x[layer].stack( | |||
) # [T_out, N, H, 1, T_out] | |||
attention_x_stack_tmp = tf.transpose( | |||
attention_x_stack_tmp, perm=[3, 1, 2, 0, | |||
4]) # [1, N, H, T_out, T_out] | |||
attention_x_stack_tmp = tf.squeeze( | |||
attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
attention_x_stack.append(attention_x_stack_tmp) | |||
attention_h_stack = [] | |||
for layer in range(len(attention_h)): | |||
attention_h_stack_tmp = attention_h[layer].stack( | |||
) # [T_out, N, H, 1, T_out] | |||
attention_h_stack_tmp = tf.transpose( | |||
attention_h_stack_tmp, perm=[3, 1, 2, 0, | |||
4]) # [1, N, H, T_out, T_out] | |||
attention_h_stack_tmp = tf.squeeze( | |||
attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
attention_h_stack.append(attention_h_stack_tmp) | |||
return outputs_stack, attention_x_stack, attention_h_stack, cache | |||
def dynamic_decode_teacher_forcing(self, | |||
step_fn, | |||
decoder_input, | |||
init_cache=None, | |||
init_attn_x=None, | |||
init_attn_h=None, | |||
maximum_iterations=None, | |||
batch_size=None): | |||
def _cond(step, cache, inputs, outputs, attention_x, attention_h): # pylint: disable=unused-argument | |||
return tf.less(step, maximum_iterations) | |||
def _body(step, cache, inputs, outputs, attention_x, attention_h): | |||
# output: [1, 1, num_mels * r] | |||
# attn: [1, 1, T_out] | |||
output, cache, attn_x, attn_h = step_fn( | |||
step, inputs[:, step:step + 1, :], | |||
cache) # outputs, cache, attention, attns | |||
for layer in range(len(attention_x)): | |||
attention_x[layer] = attention_x[layer].write( | |||
step, tf.cast(attn_x[layer], tf.float32)) | |||
for layer in range(len(attention_h)): | |||
attention_h[layer] = attention_h[layer].write( | |||
step, tf.cast(attn_h[layer], tf.float32)) | |||
outputs = outputs.write(step, tf.cast(output, tf.float32)) | |||
return step + 1, cache, inputs, outputs, attention_x, attention_h | |||
step = tf.constant(0, dtype=tf.int32) | |||
outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True) | |||
_, cache, _, outputs, attention_x, attention_h = tf.while_loop( | |||
_cond, | |||
_body, | |||
loop_vars=(step, init_cache, decoder_input, outputs, init_attn_x, | |||
init_attn_h), | |||
shape_invariants=(step.shape, | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, | |||
init_cache), decoder_input.shape, | |||
tf.TensorShape(None), | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, init_attn_x), | |||
compat.nest.map_structure( | |||
self._get_shape_invariants, init_attn_h)), | |||
parallel_iterations=1, | |||
back_prop=False, | |||
maximum_iterations=maximum_iterations) | |||
# element of outputs: [N, 1, num_mels * r] | |||
outputs_stack = outputs.stack() # [T_out, N, 1, num_mels * r] | |||
outputs_stack = tf.transpose( | |||
outputs_stack, perm=[2, 1, 0, 3]) # [1, N, T_out, num_mels * r] | |||
outputs_stack = tf.squeeze( | |||
outputs_stack, axis=0) # [N, T_out, num_mels * r] | |||
attention_x_stack = [] | |||
for layer in range(len(attention_x)): | |||
attention_x_stack_tmp = attention_x[layer].stack( | |||
) # [T_out, N, H, 1, T_out] | |||
attention_x_stack_tmp = tf.transpose( | |||
attention_x_stack_tmp, perm=[3, 1, 2, 0, | |||
4]) # [1, N, H, T_out, T_out] | |||
attention_x_stack_tmp = tf.squeeze( | |||
attention_x_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
attention_x_stack.append(attention_x_stack_tmp) | |||
attention_h_stack = [] | |||
for layer in range(len(attention_h)): | |||
attention_h_stack_tmp = attention_h[layer].stack( | |||
) # [T_out, N, H, 1, T_out] | |||
attention_h_stack_tmp = tf.transpose( | |||
attention_h_stack_tmp, perm=[3, 1, 2, 0, | |||
4]) # [1, N, H, T_out, T_out] | |||
attention_h_stack_tmp = tf.squeeze( | |||
attention_h_stack_tmp, axis=0) # [N, H, T_out, T_out] | |||
attention_h_stack.append(attention_h_stack_tmp) | |||
return outputs_stack, attention_x_stack, attention_h_stack, cache | |||
def _get_shape_invariants(self, tensor): | |||
"""Returns the shape of the tensor but sets middle dims to None.""" | |||
if isinstance(tensor, tf.TensorArray): | |||
shape = None | |||
else: | |||
shape = tensor.shape.as_list() | |||
for i in range(1, len(shape) - 1): | |||
shape[i] = None | |||
return tf.TensorShape(shape) | |||
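# Illustrative sketch (not part of the original file): at inference, RobuTrans
# drives dynamic_decode_and_search from a single all-zero "go" frame, with the
# length-regulated sequence as memory:
#
#     go_frame = tf.expand_dims(
#         tf.tile([[0.0]], [batch_size, hp.num_mels]), axis=1)
#     outputs, attn_x, attn_h = decoder.dynamic_decode_and_search(
#         go_frame, maximum_iterations=tf.shape(memory)[1], mode=False,
#         memory=memory,
#         memory_sequence_length=tf.expand_dims(tf.shape(memory)[1], axis=0))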
class SelfAttentionDecoderOri(): | |||
"""Decoder using self-attention as described in | |||
https://arxiv.org/abs/1706.03762. | |||
""" | |||
def __init__(self, | |||
num_layers, | |||
num_units=512, | |||
num_heads=8, | |||
ffn_inner_dim=2048, | |||
dropout=0.1, | |||
attention_dropout=0.1, | |||
relu_dropout=0.1, | |||
position_encoder=SinusoidalPositionEncoder(), | |||
self_attention_type='scaled_dot'): | |||
"""Initializes the parameters of the decoder. | |||
Args: | |||
num_layers: The number of layers. | |||
num_units: The number of hidden units. | |||
num_heads: The number of heads in the multi-head attention. | |||
ffn_inner_dim: The number of units of the inner linear transformation | |||
in the feed forward layer. | |||
dropout: The probability to drop units from the outputs. | |||
attention_dropout: The probability to drop units from the attention. | |||
relu_dropout: The probability to drop units from the ReLU activation in | |||
the feed forward layer. | |||
position_encoder: A :class:`opennmt.layers.position.PositionEncoder` to | |||
apply on inputs or ``None``. | |||
self_attention_type: Type of self attention, "scaled_dot" or "average" (case | |||
insensitive). | |||
Raises: | |||
ValueError: if :obj:`self_attention_type` is invalid. | |||
""" | |||
super(SelfAttentionDecoderOri, self).__init__() | |||
self.num_layers = num_layers | |||
self.num_units = num_units | |||
self.num_heads = num_heads | |||
self.ffn_inner_dim = ffn_inner_dim | |||
self.dropout = dropout | |||
self.attention_dropout = attention_dropout | |||
self.relu_dropout = relu_dropout | |||
self.position_encoder = position_encoder | |||
self.self_attention_type = self_attention_type.lower() | |||
if self.self_attention_type not in ('scaled_dot', 'average'): | |||
raise ValueError('invalid attention type %s' | |||
% self.self_attention_type) | |||
if self.self_attention_type == 'average': | |||
tf.logging.warning( | |||
'Support for average attention network is experimental ' | |||
'and may change in future versions.') | |||
@property | |||
def output_size(self): | |||
"""Returns the decoder output size.""" | |||
return self.num_units | |||
@property | |||
def support_alignment_history(self): | |||
return True | |||
@property | |||
def support_multi_source(self): | |||
return True | |||
def _init_cache(self, batch_size, dtype=tf.float32, num_sources=1): | |||
cache = {} | |||
for layer in range(self.num_layers): | |||
proj_cache_shape = [ | |||
batch_size, self.num_heads, 0, self.num_units // self.num_heads | |||
] | |||
layer_cache = {} | |||
layer_cache['memory'] = [{ | |||
'memory_keys': | |||
tf.zeros(proj_cache_shape, dtype=dtype), | |||
'memory_values': | |||
tf.zeros(proj_cache_shape, dtype=dtype) | |||
} for _ in range(num_sources)] | |||
if self.self_attention_type == 'scaled_dot': | |||
layer_cache['self_keys'] = tf.zeros( | |||
proj_cache_shape, dtype=dtype) | |||
layer_cache['self_values'] = tf.zeros( | |||
proj_cache_shape, dtype=dtype) | |||
elif self.self_attention_type == 'average': | |||
layer_cache['prev_g'] = tf.zeros( | |||
[batch_size, 1, self.num_units], dtype=dtype) | |||
cache['layer_{}'.format(layer)] = layer_cache | |||
return cache | |||
def _self_attention_stack(self, | |||
inputs, | |||
sequence_length=None, | |||
mode=True, | |||
cache=None, | |||
memory=None, | |||
memory_sequence_length=None, | |||
step=None): | |||
inputs *= self.num_units**0.5 | |||
if self.position_encoder is not None: | |||
inputs = self.position_encoder( | |||
inputs, position=step + 1 if step is not None else None) | |||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
decoder_mask = None | |||
memory_mask = None | |||
last_attention = None | |||
if self.self_attention_type == 'scaled_dot': | |||
if sequence_length is not None: | |||
decoder_mask = transformer.build_future_mask( | |||
sequence_length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(inputs)[1]) | |||
elif self.self_attention_type == 'average': | |||
if cache is None: | |||
if sequence_length is None: | |||
sequence_length = tf.fill([tf.shape(inputs)[0]], | |||
tf.shape(inputs)[1]) | |||
decoder_mask = transformer.cumulative_average_mask( | |||
sequence_length, | |||
maximum_length=tf.shape(inputs)[1], | |||
dtype=inputs.dtype) | |||
if memory is not None and not tf.contrib.framework.nest.is_sequence( | |||
memory): | |||
memory = (memory, ) | |||
if memory_sequence_length is not None: | |||
if not tf.contrib.framework.nest.is_sequence( | |||
memory_sequence_length): | |||
memory_sequence_length = (memory_sequence_length, ) | |||
memory_mask = [ | |||
transformer.build_sequence_mask( | |||
length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(m)[1]) | |||
for m, length in zip(memory, memory_sequence_length) | |||
] | |||
for layer in range(self.num_layers): | |||
layer_name = 'layer_{}'.format(layer) | |||
layer_cache = cache[layer_name] if cache is not None else None | |||
with tf.variable_scope(layer_name): | |||
if self.self_attention_type == 'scaled_dot': | |||
with tf.variable_scope('masked_multi_head'): | |||
encoded = transformer.multi_head_attention( | |||
self.num_heads, | |||
transformer.norm(inputs), | |||
None, | |||
mode, | |||
num_units=self.num_units, | |||
mask=decoder_mask, | |||
cache=layer_cache, | |||
dropout=self.attention_dropout) | |||
last_context = transformer.drop_and_add( | |||
inputs, encoded, mode, dropout=self.dropout) | |||
elif self.self_attention_type == 'average': | |||
with tf.variable_scope('average_attention'): | |||
# Cumulative average. | |||
x = transformer.norm(inputs) | |||
y = transformer.cumulative_average( | |||
x, | |||
decoder_mask if cache is None else step, | |||
cache=layer_cache) | |||
# FFN. | |||
y = transformer.feed_forward( | |||
y, | |||
self.ffn_inner_dim, | |||
mode, | |||
dropout=self.relu_dropout) | |||
# Gating layer. | |||
z = tf.layers.dense( | |||
tf.concat([x, y], -1), self.num_units * 2) | |||
i, f = tf.split(z, 2, axis=-1) | |||
y = tf.sigmoid(i) * x + tf.sigmoid(f) * y | |||
last_context = transformer.drop_and_add( | |||
inputs, y, mode, dropout=self.dropout) | |||
if memory is not None: | |||
for i, (mem, mask) in enumerate(zip(memory, memory_mask)): | |||
memory_cache = layer_cache['memory'][i] if layer_cache is not None else None # yapf:disable | |||
with tf.variable_scope('multi_head' if i | |||
== 0 else 'multi_head_%d' % i): # yapf:disable | |||
context, last_attention = transformer.multi_head_attention( | |||
self.num_heads, | |||
transformer.norm(last_context), | |||
mem, | |||
mode, | |||
mask=mask, | |||
cache=memory_cache, | |||
dropout=self.attention_dropout, | |||
return_attention=True) | |||
last_context = transformer.drop_and_add( | |||
last_context, | |||
context, | |||
mode, | |||
dropout=self.dropout) | |||
if i > 0: # Do not return attention in case of multi source. | |||
last_attention = None | |||
with tf.variable_scope('ffn'): | |||
transformed = transformer.feed_forward_ori( | |||
transformer.norm(last_context), | |||
self.ffn_inner_dim, | |||
mode, | |||
dropout=self.relu_dropout) | |||
transformed = transformer.drop_and_add( | |||
last_context, transformed, mode, dropout=self.dropout) | |||
inputs = transformed | |||
if last_attention is not None: | |||
# The first head of the last layer is returned. | |||
first_head_attention = last_attention[:, 0] | |||
else: | |||
first_head_attention = None | |||
outputs = transformer.norm(inputs) | |||
return outputs, first_head_attention | |||
def decode_from_inputs(self, | |||
inputs, | |||
sequence_length, | |||
initial_state=None, | |||
mode=True, | |||
memory=None, | |||
memory_sequence_length=None): | |||
outputs, attention = self._self_attention_stack( | |||
inputs, | |||
sequence_length=sequence_length, | |||
mode=mode, | |||
memory=memory, | |||
memory_sequence_length=memory_sequence_length) | |||
return outputs, None, attention | |||
def step_fn(self, | |||
mode, | |||
batch_size, | |||
initial_state=None, | |||
memory=None, | |||
memory_sequence_length=None, | |||
dtype=tf.float32): | |||
if memory is None: | |||
num_sources = 0 | |||
elif tf.contrib.framework.nest.is_sequence(memory): | |||
num_sources = len(memory) | |||
else: | |||
num_sources = 1 | |||
cache = self._init_cache( | |||
batch_size, dtype=dtype, num_sources=num_sources) | |||
def _fn(step, inputs, cache, mode): | |||
inputs = tf.expand_dims(inputs, 1) | |||
outputs, attention = self._self_attention_stack( | |||
inputs, | |||
mode=mode, | |||
cache=cache, | |||
memory=memory, | |||
memory_sequence_length=memory_sequence_length, | |||
step=step) | |||
outputs = tf.squeeze(outputs, axis=1) | |||
if attention is not None: | |||
attention = tf.squeeze(attention, axis=1) | |||
return outputs, cache, attention | |||
return _fn, cache |
@@ -0,0 +1,182 @@ | |||
"""Define the self-attention encoder.""" | |||
import tensorflow as tf | |||
from . import transformer | |||
from .position import SinusoidalPositionEncoder | |||
class SelfAttentionEncoder(): | |||
"""Encoder using self-attention as described in | |||
https://arxiv.org/abs/1706.03762. | |||
""" | |||
def __init__(self, | |||
num_layers, | |||
num_units=512, | |||
num_heads=8, | |||
ffn_inner_dim=2048, | |||
dropout=0.1, | |||
attention_dropout=0.1, | |||
relu_dropout=0.1, | |||
position_encoder=SinusoidalPositionEncoder()): | |||
"""Initializes the parameters of the encoder. | |||
Args: | |||
num_layers: The number of layers. | |||
num_units: The number of hidden units. | |||
num_heads: The number of heads in the multi-head attention. | |||
ffn_inner_dim: The number of units of the inner linear transformation | |||
in the feed forward layer. | |||
dropout: The probability to drop units from the outputs. | |||
attention_dropout: The probability to drop units from the attention. | |||
relu_dropout: The probability to drop units from the ReLU activation in | |||
the feed forward layer. | |||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
apply on inputs or ``None``. | |||
""" | |||
super(SelfAttentionEncoder, self).__init__() | |||
self.num_layers = num_layers | |||
self.num_units = num_units | |||
self.num_heads = num_heads | |||
self.ffn_inner_dim = ffn_inner_dim | |||
self.dropout = dropout | |||
self.attention_dropout = attention_dropout | |||
self.relu_dropout = relu_dropout | |||
self.position_encoder = position_encoder | |||
def encode(self, inputs, sequence_length=None, mode=True): | |||
inputs *= self.num_units**0.5 | |||
if self.position_encoder is not None: | |||
inputs = self.position_encoder(inputs) | |||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
mask = transformer.build_sequence_mask( | |||
sequence_length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(inputs)[1]) | |||
mask_FF = tf.squeeze( | |||
transformer.build_sequence_mask( | |||
sequence_length, maximum_length=tf.shape(inputs)[1]), | |||
axis=1) | |||
state = () | |||
attns = [] | |||
for layer in range(self.num_layers): | |||
with tf.variable_scope('layer_{}'.format(layer)): | |||
with tf.variable_scope('multi_head'): | |||
context, attn = transformer.multi_head_attention( | |||
self.num_heads, | |||
transformer.norm(inputs), | |||
None, | |||
mode, | |||
num_units=self.num_units, | |||
mask=mask, | |||
dropout=self.attention_dropout, | |||
return_attention=True) | |||
attns.append(attn) | |||
context = transformer.drop_and_add( | |||
inputs, context, mode, dropout=self.dropout) | |||
with tf.variable_scope('ffn'): | |||
transformed = transformer.feed_forward( | |||
transformer.norm(context), | |||
self.ffn_inner_dim, | |||
mode, | |||
dropout=self.relu_dropout, | |||
mask=mask_FF) | |||
transformed = transformer.drop_and_add( | |||
context, transformed, mode, dropout=self.dropout) | |||
inputs = transformed | |||
state += (tf.reduce_mean(inputs, axis=1), ) | |||
outputs = transformer.norm(inputs) | |||
return (outputs, state, sequence_length, attns) | |||
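
# A minimal usage sketch (assumes TF 1.x graph mode; the placeholder shapes
# are hypothetical):
#
#   encoder = SelfAttentionEncoder(num_layers=6)
#   inputs = tf.placeholder(tf.float32, [None, None, 512])   # (batch, time, depth)
#   lengths = tf.placeholder(tf.int32, [None])                # (batch,)
#   outputs, state, lengths, attns = encoder.encode(
#       inputs, sequence_length=lengths, mode=False)          # mode=False: inference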
class SelfAttentionEncoderOri(): | |||
"""Encoder using self-attention as described in | |||
https://arxiv.org/abs/1706.03762. | |||
""" | |||
def __init__(self, | |||
num_layers, | |||
num_units=512, | |||
num_heads=8, | |||
ffn_inner_dim=2048, | |||
dropout=0.1, | |||
attention_dropout=0.1, | |||
relu_dropout=0.1, | |||
position_encoder=SinusoidalPositionEncoder()): | |||
"""Initializes the parameters of the encoder. | |||
Args: | |||
num_layers: The number of layers. | |||
num_units: The number of hidden units. | |||
num_heads: The number of heads in the multi-head attention. | |||
ffn_inner_dim: The number of units of the inner linear transformation | |||
in the feed forward layer. | |||
dropout: The probability to drop units from the outputs. | |||
attention_dropout: The probability to drop units from the attention. | |||
relu_dropout: The probability to drop units from the ReLU activation in | |||
the feed forward layer. | |||
position_encoder: The :class:`opennmt.layers.position.PositionEncoder` to | |||
apply on inputs or ``None``. | |||
""" | |||
super(SelfAttentionEncoderOri, self).__init__() | |||
self.num_layers = num_layers | |||
self.num_units = num_units | |||
self.num_heads = num_heads | |||
self.ffn_inner_dim = ffn_inner_dim | |||
self.dropout = dropout | |||
self.attention_dropout = attention_dropout | |||
self.relu_dropout = relu_dropout | |||
self.position_encoder = position_encoder | |||
def encode(self, inputs, sequence_length=None, mode=True): | |||
inputs *= self.num_units**0.5 | |||
if self.position_encoder is not None: | |||
inputs = self.position_encoder(inputs) | |||
inputs = tf.layers.dropout(inputs, rate=self.dropout, training=mode) | |||
mask = transformer.build_sequence_mask( | |||
sequence_length, | |||
num_heads=self.num_heads, | |||
maximum_length=tf.shape(inputs)[1]) # [N, 1, 1, T_out] | |||
state = () | |||
attns = [] | |||
for layer in range(self.num_layers): | |||
with tf.variable_scope('layer_{}'.format(layer)): | |||
with tf.variable_scope('multi_head'): | |||
context, attn = transformer.multi_head_attention( | |||
self.num_heads, | |||
transformer.norm(inputs), | |||
None, | |||
mode, | |||
num_units=self.num_units, | |||
mask=mask, | |||
dropout=self.attention_dropout, | |||
return_attention=True) | |||
attns.append(attn) | |||
context = transformer.drop_and_add( | |||
inputs, context, mode, dropout=self.dropout) | |||
with tf.variable_scope('ffn'): | |||
transformed = transformer.feed_forward_ori( | |||
transformer.norm(context), | |||
self.ffn_inner_dim, | |||
mode, | |||
dropout=self.relu_dropout) | |||
transformed = transformer.drop_and_add( | |||
context, transformed, mode, dropout=self.dropout) | |||
inputs = transformed | |||
state += (tf.reduce_mean(inputs, axis=1), ) | |||
outputs = transformer.norm(inputs) | |||
return (outputs, state, sequence_length, attns) |
@@ -0,0 +1,255 @@ | |||
import io | |||
import os | |||
from typing import Any, Dict, Optional, Union | |||
import numpy as np | |||
import tensorflow as tf | |||
from sklearn.preprocessing import MultiLabelBinarizer | |||
from modelscope.models.base import Model | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from .models import create_model | |||
from .text.symbols import load_symbols | |||
from .text.symbols_dict import SymbolsDict | |||
__all__ = ['SambertNetHifi16k'] | |||
def multi_label_symbol_to_sequence(my_classes, my_symbol): | |||
one_hot = MultiLabelBinarizer(my_classes) | |||
tokens = my_symbol.strip().split(' ') | |||
sequences = [] | |||
for token in tokens: | |||
sequences.append(tuple(token.split('&'))) | |||
# sequences.append(tuple(['~'])) # sequence length minus 1 to ignore EOS ~ | |||
return one_hot.fit_transform(sequences) | |||
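
# For example (hypothetical labels), with my_classes=['a', 'b', 'c'],
# multi_label_symbol_to_sequence(my_classes, 'a&b c') returns the multi-hot
# matrix [[1, 1, 0], [0, 0, 1]]: one row per space-separated token, with '&'
# combining several labels inside a single token.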
@MODELS.register_module(Tasks.text_to_speech, module_name=r'sambert_hifi_16k') | |||
class SambertNetHifi16k(Model): | |||
def __init__(self, | |||
model_dir, | |||
pitch_control_str='', | |||
duration_control_str='', | |||
energy_control_str='', | |||
*args, | |||
**kwargs): | |||
tf.reset_default_graph() | |||
local_ckpt_path = os.path.join(ModelFile.TF_CHECKPOINT_FOLDER, 'ckpt') | |||
self._ckpt_path = os.path.join(model_dir, local_ckpt_path) | |||
self._dict_path = os.path.join(model_dir, 'dicts') | |||
self._hparams = tf.contrib.training.HParams(**kwargs) | |||
values = self._hparams.values() | |||
hp = [' {}:{}'.format(name, values[name]) for name in sorted(values)] | |||
print('Hyperparameters:\n' + '\n'.join(hp)) | |||
super().__init__(self._ckpt_path, *args, **kwargs) | |||
model_name = 'robutrans' | |||
self._lfeat_type_list = self._hparams.lfeat_type_list.strip().split( | |||
',') | |||
sy, tone, syllable_flag, word_segment, emo_category, speaker = load_symbols( | |||
self._dict_path) | |||
self._sy = sy | |||
self._tone = tone | |||
self._syllable_flag = syllable_flag | |||
self._word_segment = word_segment | |||
self._emo_category = emo_category | |||
self._speaker = speaker | |||
self._inputs_dim = dict() | |||
for lfeat_type in self._lfeat_type_list: | |||
if lfeat_type == 'sy': | |||
self._inputs_dim[lfeat_type] = len(sy) | |||
elif lfeat_type == 'tone': | |||
self._inputs_dim[lfeat_type] = len(tone) | |||
elif lfeat_type == 'syllable_flag': | |||
self._inputs_dim[lfeat_type] = len(syllable_flag) | |||
elif lfeat_type == 'word_segment': | |||
self._inputs_dim[lfeat_type] = len(word_segment) | |||
elif lfeat_type == 'emo_category': | |||
self._inputs_dim[lfeat_type] = len(emo_category) | |||
elif lfeat_type == 'speaker': | |||
self._inputs_dim[lfeat_type] = len(speaker) | |||
self._symbols_dict = SymbolsDict(sy, tone, syllable_flag, word_segment, | |||
emo_category, speaker, | |||
self._inputs_dim, | |||
self._lfeat_type_list) | |||
dim_inputs = sum(self._inputs_dim.values( | |||
)) - self._inputs_dim['speaker'] - self._inputs_dim['emo_category'] | |||
inputs = tf.placeholder(tf.float32, [1, None, dim_inputs], 'inputs') | |||
inputs_emotion = tf.placeholder( | |||
tf.float32, [1, None, self._inputs_dim['emo_category']], | |||
'inputs_emotion') | |||
inputs_speaker = tf.placeholder(tf.float32, | |||
[1, None, self._inputs_dim['speaker']], | |||
'inputs_speaker') | |||
input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') | |||
pitch_contours_scale = tf.placeholder(tf.float32, [1, None], | |||
'pitch_contours_scale') | |||
energy_contours_scale = tf.placeholder(tf.float32, [1, None], | |||
'energy_contours_scale') | |||
duration_scale = tf.placeholder(tf.float32, [1, None], | |||
'duration_scale') | |||
with tf.variable_scope('model') as _: | |||
self._model = create_model(model_name, self._hparams) | |||
self._model.initialize( | |||
inputs, | |||
inputs_emotion, | |||
inputs_speaker, | |||
input_lengths, | |||
duration_scales=duration_scale, | |||
pitch_scales=pitch_contours_scale, | |||
energy_scales=energy_contours_scale) | |||
self._mel_spec = self._model.mel_outputs[0] | |||
self._duration_outputs = self._model.duration_outputs[0] | |||
self._duration_outputs_ = self._model.duration_outputs_[0] | |||
self._pitch_contour_outputs = self._model.pitch_contour_outputs[0] | |||
self._energy_contour_outputs = self._model.energy_contour_outputs[ | |||
0] | |||
self._embedded_inputs_emotion = self._model.embedded_inputs_emotion[ | |||
0] | |||
self._embedding_fsmn_outputs = self._model.embedding_fsmn_outputs[ | |||
0] | |||
self._encoder_outputs = self._model.encoder_outputs[0] | |||
self._pitch_embeddings = self._model.pitch_embeddings[0] | |||
self._energy_embeddings = self._model.energy_embeddings[0] | |||
self._LR_outputs = self._model.LR_outputs[0] | |||
self._postnet_fsmn_outputs = self._model.postnet_fsmn_outputs[0] | |||
self._attention_h = self._model.attention_h | |||
self._attention_x = self._model.attention_x | |||
print('Loading checkpoint: %s' % self._ckpt_path) | |||
config = tf.ConfigProto() | |||
config.gpu_options.allow_growth = True | |||
self._session = tf.Session(config=config) | |||
self._session.run(tf.global_variables_initializer()) | |||
saver = tf.train.Saver() | |||
saver.restore(self._session, self._ckpt_path) | |||
duration_cfg_lst = [] | |||
if len(duration_control_str) != 0: | |||
for item in duration_control_str.strip().split('|'): | |||
percent, scale = item.lstrip('(').rstrip(')').split(',') | |||
duration_cfg_lst.append((float(percent), float(scale))) | |||
self._duration_cfg_lst = duration_cfg_lst | |||
pitch_contours_cfg_lst = [] | |||
if len(pitch_control_str) != 0: | |||
for item in pitch_control_str.strip().split('|'): | |||
percent, scale = item.lstrip('(').rstrip(')').split(',') | |||
pitch_contours_cfg_lst.append( | |||
(float(percent), float(scale))) | |||
self._pitch_contours_cfg_lst = pitch_contours_cfg_lst | |||
energy_contours_cfg_lst = [] | |||
if len(energy_control_str) != 0: | |||
for item in energy_control_str.strip().split('|'): | |||
percent, scale = item.lstrip('(').rstrip(')').split(',') | |||
energy_contours_cfg_lst.append( | |||
(float(percent), float(scale))) | |||
self._energy_contours_cfg_lst = energy_contours_cfg_lst | |||
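
    # The three control strings above share the format
    # '(percent,scale)|(percent,scale)|...'. For example (hypothetical
    # values), duration_control_str='(0.5,1.2)|(0.5,0.8)' stretches durations
    # over the first half of the input sequence by 1.2x and compresses the
    # second half to 0.8x; pitch and energy controls behave analogously.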
def forward(self, text): | |||
cleaner_names = [x.strip() for x in self._hparams.cleaners.split(',')] | |||
lfeat_symbol = text.strip().split(' ') | |||
lfeat_symbol_separate = [''] * int(len(self._lfeat_type_list)) | |||
for this_lfeat_symbol in lfeat_symbol: | |||
this_lfeat_symbol = this_lfeat_symbol.strip('{').strip('}').split( | |||
'$') | |||
if len(this_lfeat_symbol) != len(self._lfeat_type_list): | |||
raise Exception( | |||
'Length of this_lfeat_symbol in training data' | |||
+ ' is not equal to the length of lfeat_type_list, ' | |||
+ str(len(this_lfeat_symbol)) + ' VS. ' | |||
+ str(len(self._lfeat_type_list))) | |||
index = 0 | |||
while index < len(lfeat_symbol_separate): | |||
lfeat_symbol_separate[index] = lfeat_symbol_separate[ | |||
index] + this_lfeat_symbol[index] + ' ' | |||
index = index + 1 | |||
index = 0 | |||
lfeat_type = self._lfeat_type_list[index] | |||
sequence = self._symbols_dict.symbol_to_sequence( | |||
lfeat_symbol_separate[index].strip(), lfeat_type, cleaner_names) | |||
sequence_array = np.asarray( | |||
sequence[:-1], | |||
dtype=np.int32) # sequence length minus 1 to ignore EOS ~ | |||
inputs = np.eye( | |||
self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array] | |||
index = index + 1 | |||
while index < len(self._lfeat_type_list) - 2: | |||
lfeat_type = self._lfeat_type_list[index] | |||
sequence = self._symbols_dict.symbol_to_sequence( | |||
lfeat_symbol_separate[index].strip(), lfeat_type, | |||
cleaner_names) | |||
sequence_array = np.asarray( | |||
sequence[:-1], | |||
dtype=np.int32) # sequence length minus 1 to ignore EOS ~ | |||
inputs_temp = np.eye( | |||
self._inputs_dim[lfeat_type], dtype=np.float32)[sequence_array] | |||
inputs = np.concatenate((inputs, inputs_temp), axis=1) | |||
index = index + 1 | |||
seq = inputs | |||
lfeat_type = 'emo_category' | |||
inputs_emotion = multi_label_symbol_to_sequence( | |||
self._emo_category, lfeat_symbol_separate[index].strip()) | |||
# inputs_emotion = inputs_emotion * 1.5 | |||
index = index + 1 | |||
lfeat_type = 'speaker' | |||
inputs_speaker = multi_label_symbol_to_sequence( | |||
self._speaker, lfeat_symbol_separate[index].strip()) | |||
duration_scale = np.ones((len(seq), ), dtype=np.float32) | |||
start_idx = 0 | |||
for (percent, scale) in self._duration_cfg_lst: | |||
duration_scale[start_idx:start_idx | |||
+ int(percent * len(seq))] = scale | |||
start_idx += int(percent * len(seq)) | |||
pitch_contours_scale = np.ones((len(seq), ), dtype=np.float32) | |||
start_idx = 0 | |||
for (percent, scale) in self._pitch_contours_cfg_lst: | |||
pitch_contours_scale[start_idx:start_idx | |||
+ int(percent * len(seq))] = scale | |||
start_idx += int(percent * len(seq)) | |||
energy_contours_scale = np.ones((len(seq), ), dtype=np.float32) | |||
start_idx = 0 | |||
for (percent, scale) in self._energy_contours_cfg_lst: | |||
energy_contours_scale[start_idx:start_idx | |||
+ int(percent * len(seq))] = scale | |||
start_idx += int(percent * len(seq)) | |||
feed_dict = { | |||
self._model.inputs: [np.asarray(seq, dtype=np.float32)], | |||
self._model.inputs_emotion: | |||
[np.asarray(inputs_emotion, dtype=np.float32)], | |||
self._model.inputs_speaker: | |||
[np.asarray(inputs_speaker, dtype=np.float32)], | |||
self._model.input_lengths: | |||
np.asarray([len(seq)], dtype=np.int32), | |||
self._model.duration_scales: [duration_scale], | |||
self._model.pitch_scales: [pitch_contours_scale], | |||
self._model.energy_scales: [energy_contours_scale] | |||
} | |||
result = self._session.run([ | |||
self._mel_spec, self._duration_outputs, self._duration_outputs_, | |||
self._pitch_contour_outputs, self._embedded_inputs_emotion, | |||
self._embedding_fsmn_outputs, self._encoder_outputs, | |||
self._pitch_embeddings, self._LR_outputs, | |||
self._postnet_fsmn_outputs, self._energy_contour_outputs, | |||
self._energy_embeddings, self._attention_x, self._attention_h | |||
], feed_dict=feed_dict) # yapf:disable | |||
return result[0] |
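
# Hypothetical usage sketch (the model directory, hparams and the symbol
# string are assumptions; the input is a sequence of
# '{sy$tone$syllable_flag$word_segment$emo_category$speaker}' tokens such as
# the ones produced by the TTS frontend):
#
#   model = SambertNetHifi16k('/path/to/model', **hparams)
#   mel = model.forward('{l$3$s_begin$word_middle$emotion_neutral$F7} ...')
#   # `mel` is the predicted mel spectrogram, ready for a vocoder.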
@@ -0,0 +1,89 @@ | |||
''' | |||
Cleaners are transformations that run over the input text at both training and eval time. | |||
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" | |||
hyperparameter. Some cleaners are English-specific. You'll typically want to use: | |||
1. "english_cleaners" for English text | |||
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using | |||
the Unidecode library (https://pypi.python.org/pypi/Unidecode) | |||
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update | |||
the symbols in symbols.py to match your data). | |||
''' | |||
import re | |||
from unidecode import unidecode | |||
from .numbers import normalize_numbers | |||
# Regular expression matching whitespace: | |||
_whitespace_re = re.compile(r'\s+') | |||
# List of (regular expression, replacement) pairs for abbreviations: | |||
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) | |||
for x in [ | |||
('mrs', 'misess'), | |||
('mr', 'mister'), | |||
('dr', 'doctor'), | |||
('st', 'saint'), | |||
('co', 'company'), | |||
('jr', 'junior'), | |||
('maj', 'major'), | |||
('gen', 'general'), | |||
('drs', 'doctors'), | |||
('rev', 'reverend'), | |||
('lt', 'lieutenant'), | |||
('hon', 'honorable'), | |||
('sgt', 'sergeant'), | |||
('capt', 'captain'), | |||
('esq', 'esquire'), | |||
('ltd', 'limited'), | |||
('col', 'colonel'), | |||
('ft', 'fort'), ]] # yapf:disable | |||
def expand_abbreviations(text): | |||
for regex, replacement in _abbreviations: | |||
text = re.sub(regex, replacement, text) | |||
return text | |||
def expand_numbers(text): | |||
return normalize_numbers(text) | |||
def lowercase(text): | |||
return text.lower() | |||
def collapse_whitespace(text): | |||
return re.sub(_whitespace_re, ' ', text) | |||
def convert_to_ascii(text): | |||
return unidecode(text) | |||
def basic_cleaners(text): | |||
'''Basic pipeline that lowercases and collapses whitespace without transliteration.''' | |||
text = lowercase(text) | |||
text = collapse_whitespace(text) | |||
return text | |||
def transliteration_cleaners(text): | |||
'''Pipeline for non-English text that transliterates to ASCII.''' | |||
text = convert_to_ascii(text) | |||
text = lowercase(text) | |||
text = collapse_whitespace(text) | |||
return text | |||
def english_cleaners(text): | |||
'''Pipeline for English text, including number and abbreviation expansion.''' | |||
text = convert_to_ascii(text) | |||
text = lowercase(text) | |||
text = expand_numbers(text) | |||
text = expand_abbreviations(text) | |||
text = collapse_whitespace(text) | |||
return text |
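
# A quick way to sanity-check the English pipeline; the sentence below is a
# hypothetical example, not project test data.
if __name__ == '__main__':
    print(english_cleaners('Dr. Smith paid $3.50 on Jan 1st, 2022.'))
    # Expected output along the lines of:
    # 'doctor smith paid 3 dollars, 50 cents on jan first, twenty twenty-two.'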
@@ -0,0 +1,64 @@ | |||
import re | |||
valid_symbols = [ | |||
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', | |||
'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', | |||
'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', | |||
'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', | |||
'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', | |||
'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', | |||
'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', | |||
'Y', 'Z', 'ZH' | |||
] | |||
_valid_symbol_set = set(valid_symbols) | |||
class CMUDict: | |||
'''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' | |||
def __init__(self, file_or_path, keep_ambiguous=True): | |||
if isinstance(file_or_path, str): | |||
with open(file_or_path, encoding='latin-1') as f: | |||
entries = _parse_cmudict(f) | |||
else: | |||
entries = _parse_cmudict(file_or_path) | |||
if not keep_ambiguous: | |||
entries = { | |||
word: pron | |||
for word, pron in entries.items() if len(pron) == 1 | |||
} | |||
self._entries = entries | |||
def __len__(self): | |||
return len(self._entries) | |||
def lookup(self, word): | |||
'''Returns list of ARPAbet pronunciations of the given word.''' | |||
return self._entries.get(word.upper()) | |||
_alt_re = re.compile(r'\([0-9]+\)') | |||
def _parse_cmudict(file): | |||
cmudict = {} | |||
for line in file: | |||
if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): | |||
            parts = line.split('  ')  # word and pronunciation are separated by two spaces
word = re.sub(_alt_re, '', parts[0]) | |||
pronunciation = _get_pronunciation(parts[1]) | |||
if pronunciation: | |||
if word in cmudict: | |||
cmudict[word].append(pronunciation) | |||
else: | |||
cmudict[word] = [pronunciation] | |||
return cmudict | |||
def _get_pronunciation(s): | |||
parts = s.strip().split(' ') | |||
for part in parts: | |||
if part not in _valid_symbol_set: | |||
return None | |||
return ' '.join(parts) |
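
# Usage sketch (the dictionary file path is an assumption):
#
#   cmu = CMUDict('/path/to/cmudict-0.7b')
#   cmu.lookup('hello')  # -> list of ARPAbet strings, e.g. ['HH AH0 L OW1']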
@@ -0,0 +1,70 @@ | |||
import re | |||
import inflect | |||
_inflect = inflect.engine() | |||
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') | |||
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') | |||
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') | |||
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') | |||
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') | |||
_number_re = re.compile(r'[0-9]+') | |||
def _remove_commas(m): | |||
return m.group(1).replace(',', '') | |||
def _expand_decimal_point(m): | |||
return m.group(1).replace('.', ' point ') | |||
def _expand_dollars(m): | |||
match = m.group(1) | |||
parts = match.split('.') | |||
if len(parts) > 2: | |||
return match + ' dollars' # Unexpected format | |||
dollars = int(parts[0]) if parts[0] else 0 | |||
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 | |||
if dollars and cents: | |||
dollar_unit = 'dollar' if dollars == 1 else 'dollars' | |||
cent_unit = 'cent' if cents == 1 else 'cents' | |||
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) | |||
elif dollars: | |||
dollar_unit = 'dollar' if dollars == 1 else 'dollars' | |||
return '%s %s' % (dollars, dollar_unit) | |||
elif cents: | |||
cent_unit = 'cent' if cents == 1 else 'cents' | |||
return '%s %s' % (cents, cent_unit) | |||
else: | |||
return 'zero dollars' | |||
def _expand_ordinal(m): | |||
return _inflect.number_to_words(m.group(0)) | |||
def _expand_number(m): | |||
num = int(m.group(0)) | |||
if num > 1000 and num < 3000: | |||
if num == 2000: | |||
return 'two thousand' | |||
elif num > 2000 and num < 2010: | |||
return 'two thousand ' + _inflect.number_to_words(num % 100) | |||
elif num % 100 == 0: | |||
return _inflect.number_to_words(num // 100) + ' hundred' | |||
else: | |||
return _inflect.number_to_words( | |||
num, andword='', zero='oh', group=2).replace(', ', ' ') | |||
else: | |||
return _inflect.number_to_words(num, andword='') | |||
def normalize_numbers(text): | |||
text = re.sub(_comma_number_re, _remove_commas, text) | |||
text = re.sub(_pounds_re, r'\1 pounds', text) | |||
text = re.sub(_dollars_re, _expand_dollars, text) | |||
text = re.sub(_decimal_number_re, _expand_decimal_point, text) | |||
text = re.sub(_ordinal_re, _expand_ordinal, text) | |||
text = re.sub(_number_re, _expand_number, text) | |||
return text |
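
# A quick smoke test of the rewrite rules above; the sentence is a
# hypothetical example.
if __name__ == '__main__':
    print(normalize_numbers('I paid $3.50 on the 3rd of May, 2022.'))
    # Expected: 'I paid 3 dollars, 50 cents on the third of May, twenty twenty-two.'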
@@ -0,0 +1,95 @@ | |||
'''
Defines the set of symbols used in text input to the model.
The symbol inventories are not hard-coded here; they are loaded from
per-model dictionary files by load_symbols below.
'''
import codecs | |||
import os | |||
_pad = '_' | |||
_eos = '~' | |||
_mask = '@[MASK]' | |||
def _load_symbol_list(dict_path, file_name):
    """Reads one symbol per line from `dict_path/file_name`."""
    symbols = []
    with codecs.open(os.path.join(dict_path, file_name), 'r') as f:
        for line in f:
            symbols.append(line.strip('\r\n'))
    return symbols


def load_symbols(dict_path):
    """Loads the symbol inventories used by the model.

    Each *_dict.txt file lists one symbol per line; every inventory is
    extended with the padding, EOS and mask symbols.
    """
    # Phone symbols are prefixed with '@' to mark them as ARPAbet-style units:
    sy = ['@' + s for s in _load_symbol_list(dict_path, 'sy_dict.txt')]
    sy += [_pad, _eos, _mask]
    tone = _load_symbol_list(dict_path, 'tone_dict.txt') + [_pad, _eos, _mask]
    syllable_flag = _load_symbol_list(
        dict_path, 'syllable_flag_dict.txt') + [_pad, _eos, _mask]
    word_segment = _load_symbol_list(
        dict_path, 'word_segment_dict.txt') + [_pad, _eos, _mask]
    emo_category = _load_symbol_list(
        dict_path, 'emo_category_dict.txt') + [_pad, _eos, _mask]
    speaker = _load_symbol_list(
        dict_path, 'speaker_dict.txt') + [_pad, _eos, _mask]
    return sy, tone, syllable_flag, word_segment, emo_category, speaker
@@ -0,0 +1,200 @@ | |||
import re | |||
import sys | |||
from .cleaners import (basic_cleaners, english_cleaners, | |||
transliteration_cleaners) | |||
class SymbolsDict: | |||
def __init__(self, sy, tone, syllable_flag, word_segment, emo_category, | |||
speaker, inputs_dim, lfeat_type_list): | |||
self._inputs_dim = inputs_dim | |||
self._lfeat_type_list = lfeat_type_list | |||
self._sy_to_id = {s: i for i, s in enumerate(sy)} | |||
self._id_to_sy = {i: s for i, s in enumerate(sy)} | |||
self._tone_to_id = {s: i for i, s in enumerate(tone)} | |||
self._id_to_tone = {i: s for i, s in enumerate(tone)} | |||
self._syllable_flag_to_id = {s: i for i, s in enumerate(syllable_flag)} | |||
self._id_to_syllable_flag = {i: s for i, s in enumerate(syllable_flag)} | |||
self._word_segment_to_id = {s: i for i, s in enumerate(word_segment)} | |||
self._id_to_word_segment = {i: s for i, s in enumerate(word_segment)} | |||
self._emo_category_to_id = {s: i for i, s in enumerate(emo_category)} | |||
self._id_to_emo_category = {i: s for i, s in enumerate(emo_category)} | |||
self._speaker_to_id = {s: i for i, s in enumerate(speaker)} | |||
self._id_to_speaker = {i: s for i, s in enumerate(speaker)} | |||
print('_sy_to_id: ') | |||
print(self._sy_to_id) | |||
print('_tone_to_id: ') | |||
print(self._tone_to_id) | |||
print('_syllable_flag_to_id: ') | |||
print(self._syllable_flag_to_id) | |||
print('_word_segment_to_id: ') | |||
print(self._word_segment_to_id) | |||
print('_emo_category_to_id: ') | |||
print(self._emo_category_to_id) | |||
print('_speaker_to_id: ') | |||
print(self._speaker_to_id) | |||
self._curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') | |||
self._cleaners = { | |||
basic_cleaners.__name__: basic_cleaners, | |||
transliteration_cleaners.__name__: transliteration_cleaners, | |||
english_cleaners.__name__: english_cleaners | |||
} | |||
def _clean_text(self, text, cleaner_names): | |||
for name in cleaner_names: | |||
cleaner = self._cleaners.get(name) | |||
if not cleaner: | |||
raise Exception('Unknown cleaner: %s' % name) | |||
text = cleaner(text) | |||
return text | |||
def _sy_to_sequence(self, sy): | |||
return [self._sy_to_id[s] for s in sy if self._should_keep_sy(s)] | |||
def _arpabet_to_sequence(self, text): | |||
return self._sy_to_sequence(['@' + s for s in text.split()]) | |||
def _should_keep_sy(self, s): | |||
return s in self._sy_to_id and s != '_' and s != '~' | |||
def symbol_to_sequence(self, this_lfeat_symbol, lfeat_type, cleaner_names): | |||
sequence = [] | |||
if lfeat_type == 'sy': | |||
this_lfeat_symbol = this_lfeat_symbol.strip().split(' ') | |||
this_lfeat_symbol_format = '' | |||
index = 0 | |||
while index < len(this_lfeat_symbol): | |||
this_lfeat_symbol_format = this_lfeat_symbol_format + '{' + this_lfeat_symbol[ | |||
index] + '}' + ' ' | |||
index = index + 1 | |||
sequence = self.text_to_sequence(this_lfeat_symbol_format, | |||
cleaner_names) | |||
elif lfeat_type == 'tone': | |||
sequence = self.tone_to_sequence(this_lfeat_symbol) | |||
elif lfeat_type == 'syllable_flag': | |||
sequence = self.syllable_flag_to_sequence(this_lfeat_symbol) | |||
elif lfeat_type == 'word_segment': | |||
sequence = self.word_segment_to_sequence(this_lfeat_symbol) | |||
elif lfeat_type == 'emo_category': | |||
sequence = self.emo_category_to_sequence(this_lfeat_symbol) | |||
elif lfeat_type == 'speaker': | |||
sequence = self.speaker_to_sequence(this_lfeat_symbol) | |||
else: | |||
raise Exception('Unknown lfeat type: %s' % lfeat_type) | |||
return sequence | |||
def text_to_sequence(self, text, cleaner_names): | |||
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. | |||
The text can optionally have ARPAbet sequences enclosed in curly braces embedded | |||
in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." | |||
Args: | |||
text: string to convert to a sequence | |||
cleaner_names: names of the cleaner functions to run the text through | |||
Returns: | |||
List of integers corresponding to the symbols in the text | |||
''' | |||
sequence = [] | |||
# Check for curly braces and treat their contents as ARPAbet: | |||
while len(text): | |||
m = self._curly_re.match(text) | |||
if not m: | |||
sequence += self._sy_to_sequence( | |||
self._clean_text(text, cleaner_names)) | |||
break | |||
sequence += self._sy_to_sequence( | |||
self._clean_text(m.group(1), cleaner_names)) | |||
sequence += self._arpabet_to_sequence(m.group(2)) | |||
text = m.group(3) | |||
# Append EOS token | |||
sequence.append(self._sy_to_id['~']) | |||
return sequence | |||
def tone_to_sequence(self, tone): | |||
tones = tone.strip().split(' ') | |||
sequence = [] | |||
for this_tone in tones: | |||
sequence.append(self._tone_to_id[this_tone]) | |||
sequence.append(self._tone_to_id['~']) | |||
return sequence | |||
def syllable_flag_to_sequence(self, syllable_flag): | |||
syllable_flags = syllable_flag.strip().split(' ') | |||
sequence = [] | |||
for this_syllable_flag in syllable_flags: | |||
sequence.append(self._syllable_flag_to_id[this_syllable_flag]) | |||
sequence.append(self._syllable_flag_to_id['~']) | |||
return sequence | |||
def word_segment_to_sequence(self, word_segment): | |||
word_segments = word_segment.strip().split(' ') | |||
sequence = [] | |||
for this_word_segment in word_segments: | |||
sequence.append(self._word_segment_to_id[this_word_segment]) | |||
sequence.append(self._word_segment_to_id['~']) | |||
return sequence | |||
def emo_category_to_sequence(self, emo_type): | |||
emo_categories = emo_type.strip().split(' ') | |||
sequence = [] | |||
for this_category in emo_categories: | |||
sequence.append(self._emo_category_to_id[this_category]) | |||
sequence.append(self._emo_category_to_id['~']) | |||
return sequence | |||
def speaker_to_sequence(self, speaker): | |||
speakers = speaker.strip().split(' ') | |||
sequence = [] | |||
for this_speaker in speakers: | |||
sequence.append(self._speaker_to_id[this_speaker]) | |||
sequence.append(self._speaker_to_id['~']) | |||
return sequence | |||
def sequence_to_symbol(self, sequence): | |||
result = '' | |||
pre_lfeat_dim = 0 | |||
for lfeat_type in self._lfeat_type_list: | |||
current_one_hot_sequence = sequence[:, pre_lfeat_dim:pre_lfeat_dim | |||
+ self._inputs_dim[lfeat_type]] | |||
current_sequence = current_one_hot_sequence.argmax(1) | |||
length = current_sequence.shape[0] | |||
index = 0 | |||
while index < length: | |||
this_sequence = current_sequence[index] | |||
s = '' | |||
if lfeat_type == 'sy': | |||
s = self._id_to_sy[this_sequence] | |||
if len(s) > 1 and s[0] == '@': | |||
s = s[1:] | |||
elif lfeat_type == 'tone': | |||
s = self._id_to_tone[this_sequence] | |||
elif lfeat_type == 'syllable_flag': | |||
s = self._id_to_syllable_flag[this_sequence] | |||
elif lfeat_type == 'word_segment': | |||
s = self._id_to_word_segment[this_sequence] | |||
elif lfeat_type == 'emo_category': | |||
s = self._id_to_emo_category[this_sequence] | |||
elif lfeat_type == 'speaker': | |||
s = self._id_to_speaker[this_sequence] | |||
else: | |||
raise Exception('Unknown lfeat type: %s' % lfeat_type) | |||
if index == 0: | |||
result = result + lfeat_type + ': ' | |||
result = result + '{' + s + '}' | |||
if index == length - 1: | |||
result = result + '; ' | |||
index = index + 1 | |||
pre_lfeat_dim = pre_lfeat_dim + self._inputs_dim[lfeat_type] | |||
return result |
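
# Hypothetical usage sketch (the inventories come from load_symbols and the
# feature dimensions/list are model-specific assumptions):
#
#   sd = SymbolsDict(sy, tone, syllable_flag, word_segment, emo_category,
#                    speaker, inputs_dim, lfeat_type_list)
#   ids = sd.text_to_sequence('{HH AH0 L OW1}', ['english_cleaners'])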
@@ -0,0 +1 @@ | |||
from .generic_text_to_speech_frontend import * # noqa F403 |
@@ -0,0 +1,39 @@ | |||
import os | |||
import zipfile | |||
from typing import Any, Dict, List | |||
from modelscope.models.base import Model | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.audio.tts_exceptions import ( | |||
TtsFrontendInitializeFailedException, | |||
TtsFrontendLanguageTypeInvalidException) | |||
from modelscope.utils.constant import Tasks | |||
__all__ = ['GenericTtsFrontend'] | |||
@MODELS.register_module( | |||
Tasks.text_to_speech, module_name=r'generic_tts_frontend') | |||
class GenericTtsFrontend(Model): | |||
def __init__(self, model_dir='.', lang_type='pinyin', *args, **kwargs): | |||
super().__init__(model_dir, *args, **kwargs) | |||
import ttsfrd | |||
frontend = ttsfrd.TtsFrontendEngine() | |||
zip_file = os.path.join(model_dir, 'resource.zip') | |||
self._res_path = os.path.join(model_dir, 'resource') | |||
with zipfile.ZipFile(zip_file, 'r') as zip_ref: | |||
zip_ref.extractall(model_dir) | |||
if not frontend.initialize(self._res_path): | |||
raise TtsFrontendInitializeFailedException( | |||
'resource invalid: {}'.format(self._res_path)) | |||
if not frontend.set_lang_type(lang_type): | |||
raise TtsFrontendLanguageTypeInvalidException( | |||
                'invalid language type: {}, valid types are pinyin and chenmix'
                .format(lang_type))
self._frontend = frontend | |||
def forward(self, data: str) -> Dict[str, List]: | |||
result = self._frontend.gen_tacotron_symbols(data) | |||
return {'texts': [s for s in result.splitlines() if s != '']} |
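
# Hypothetical usage sketch (the model directory must contain resource.zip):
#
#   frontend = GenericTtsFrontend('/path/to/frontend_model', lang_type='pinyin')
#   symbols = frontend.forward('你好')['texts']  # one symbol line per sentence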
@@ -0,0 +1 @@ | |||
from .hifigan16k import * # noqa F403 |
@@ -0,0 +1,73 @@ | |||
from __future__ import (absolute_import, division, print_function, | |||
unicode_literals) | |||
import argparse | |||
import glob | |||
import os | |||
import time | |||
import json | |||
import numpy as np | |||
import torch | |||
from scipy.io.wavfile import write | |||
from modelscope.models.base import Model | |||
from modelscope.models.builder import MODELS | |||
from modelscope.utils.audio.tts_exceptions import \ | |||
TtsVocoderMelspecShapeMismatchException | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from .models import Generator | |||
__all__ = ['Hifigan16k', 'AttrDict'] | |||
MAX_WAV_VALUE = 32768.0 | |||
def load_checkpoint(filepath, device): | |||
assert os.path.isfile(filepath) | |||
print("Loading '{}'".format(filepath)) | |||
checkpoint_dict = torch.load(filepath, map_location=device) | |||
print('Complete.') | |||
return checkpoint_dict | |||
class AttrDict(dict): | |||
def __init__(self, *args, **kwargs): | |||
super(AttrDict, self).__init__(*args, **kwargs) | |||
self.__dict__ = self | |||
@MODELS.register_module(Tasks.text_to_speech, module_name=r'hifigan16k') | |||
class Hifigan16k(Model): | |||
def __init__(self, model_dir, *args, **kwargs): | |||
self._ckpt_path = os.path.join(model_dir, | |||
ModelFile.TORCH_MODEL_BIN_FILE) | |||
self._config = AttrDict(**kwargs) | |||
super().__init__(self._ckpt_path, *args, **kwargs) | |||
if torch.cuda.is_available(): | |||
torch.manual_seed(self._config.seed) | |||
self._device = torch.device('cuda') | |||
else: | |||
self._device = torch.device('cpu') | |||
self._generator = Generator(self._config).to(self._device) | |||
state_dict_g = load_checkpoint(self._ckpt_path, self._device) | |||
self._generator.load_state_dict(state_dict_g['generator']) | |||
self._generator.eval() | |||
self._generator.remove_weight_norm() | |||
    def forward(self, melspec):
        mel_dim = list(melspec.shape)[-1]
        if mel_dim != 80:
            raise TtsVocoderMelspecShapeMismatchException(
                'melspec last dimension must be 80, but got {}'.format(mel_dim))
with torch.no_grad(): | |||
x = melspec.T | |||
x = torch.FloatTensor(x).to(self._device) | |||
if len(x.shape) == 2: | |||
x = x.unsqueeze(0) | |||
y_g_hat = self._generator(x) | |||
audio = y_g_hat.squeeze() | |||
audio = audio * MAX_WAV_VALUE | |||
audio = audio.cpu().numpy().astype('int16') | |||
return audio |
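
# Hypothetical usage sketch (paths and config are assumptions; `mel` is a
# (T, 80) numpy array such as the output of SambertNetHifi16k.forward):
#
#   vocoder = Hifigan16k('/path/to/vocoder_model', **config)
#   audio = vocoder.forward(mel)  # int16 waveform at 16 kHz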
@@ -0,0 +1 @@ | |||
from .models import Generator |
@@ -0,0 +1,516 @@ | |||
from distutils.version import LooseVersion | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d | |||
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm | |||
from .utils import get_padding, init_weights | |||
is_pytorch_17plus = LooseVersion(torch.__version__) >= LooseVersion('1.7') | |||
def stft(x, fft_size, hop_size, win_length, window): | |||
"""Perform STFT and convert to magnitude spectrogram. | |||
Args: | |||
x (Tensor): Input signal tensor (B, T). | |||
fft_size (int): FFT size. | |||
hop_size (int): Hop size. | |||
win_length (int): Window length. | |||
window (str): Window function type. | |||
Returns: | |||
Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). | |||
""" | |||
if is_pytorch_17plus: | |||
x_stft = torch.stft( | |||
x, fft_size, hop_size, win_length, window, return_complex=False) | |||
else: | |||
x_stft = torch.stft(x, fft_size, hop_size, win_length, window) | |||
real = x_stft[..., 0] | |||
imag = x_stft[..., 1] | |||
# NOTE(kan-bayashi): clamp is needed to avoid nan or inf | |||
return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1) | |||
LRELU_SLOPE = 0.1 | |||
def get_padding_casual(kernel_size, dilation=1): | |||
return int(kernel_size * dilation - dilation) | |||
class Conv1dCasual(torch.nn.Module): | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
kernel_size, | |||
stride=1, | |||
padding=0, | |||
dilation=1, | |||
groups=1, | |||
bias=True, | |||
padding_mode='zeros'): | |||
super(Conv1dCasual, self).__init__() | |||
self.pad = padding | |||
self.conv1d = weight_norm( | |||
Conv1d( | |||
in_channels, | |||
out_channels, | |||
kernel_size, | |||
stride, | |||
padding=0, | |||
dilation=dilation, | |||
groups=groups, | |||
bias=bias, | |||
padding_mode=padding_mode)) | |||
self.conv1d.apply(init_weights) | |||
    def forward(self, x):  # x: (B, D, T)
        # F.pad lists paddings starting from the last dimension, so
        # (self.pad, 0) left-pads the time axis only, which keeps the
        # convolution causal.
        x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), 'constant')
x = self.conv1d(x) | |||
return x | |||
def remove_weight_norm(self): | |||
remove_weight_norm(self.conv1d) | |||
class ConvTranspose1dCausal(torch.nn.Module): | |||
"""CausalConvTranspose1d module with customized initialization.""" | |||
def __init__(self, | |||
in_channels, | |||
out_channels, | |||
kernel_size, | |||
stride, | |||
padding=0): | |||
"""Initialize CausalConvTranspose1d module.""" | |||
super(ConvTranspose1dCausal, self).__init__() | |||
self.deconv = weight_norm( | |||
ConvTranspose1d(in_channels, out_channels, kernel_size, stride)) | |||
self.stride = stride | |||
self.deconv.apply(init_weights) | |||
self.pad = kernel_size - stride | |||
def forward(self, x): | |||
"""Calculate forward propagation. | |||
Args: | |||
x (Tensor): Input tensor (B, in_channels, T_in). | |||
Returns: | |||
Tensor: Output tensor (B, out_channels, T_out). | |||
""" | |||
# x = F.pad(x, (self.pad, 0, 0, 0, 0, 0), "constant") | |||
return self.deconv(x)[:, :, :-self.pad] | |||
def remove_weight_norm(self): | |||
remove_weight_norm(self.deconv) | |||
class ResBlock1(torch.nn.Module): | |||
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): | |||
super(ResBlock1, self).__init__() | |||
self.h = h | |||
self.convs1 = nn.ModuleList([ | |||
Conv1dCasual( | |||
channels, | |||
channels, | |||
kernel_size, | |||
1, | |||
dilation=dilation[i], | |||
padding=get_padding_casual(kernel_size, dilation[i])) | |||
for i in range(len(dilation)) | |||
]) | |||
self.convs2 = nn.ModuleList([ | |||
Conv1dCasual( | |||
channels, | |||
channels, | |||
kernel_size, | |||
1, | |||
dilation=1, | |||
padding=get_padding_casual(kernel_size, 1)) | |||
for i in range(len(dilation)) | |||
]) | |||
def forward(self, x): | |||
for c1, c2 in zip(self.convs1, self.convs2): | |||
xt = F.leaky_relu(x, LRELU_SLOPE) | |||
xt = c1(xt) | |||
xt = F.leaky_relu(xt, LRELU_SLOPE) | |||
xt = c2(xt) | |||
x = xt + x | |||
return x | |||
def remove_weight_norm(self): | |||
for layer in self.convs1: | |||
layer.remove_weight_norm() | |||
for layer in self.convs2: | |||
layer.remove_weight_norm() | |||
class Generator(torch.nn.Module): | |||
def __init__(self, h): | |||
super(Generator, self).__init__() | |||
self.h = h | |||
self.num_kernels = len(h.resblock_kernel_sizes) | |||
self.num_upsamples = len(h.upsample_rates) | |||
print('num_kernels={}, num_upsamples={}'.format( | |||
self.num_kernels, self.num_upsamples)) | |||
self.conv_pre = Conv1dCasual( | |||
80, h.upsample_initial_channel, 7, 1, padding=7 - 1) | |||
        # NOTE: only ResBlock1 is defined in this file, so h.resblock must be
        # '1'; any other value would raise a NameError here.
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2
self.ups = nn.ModuleList() | |||
self.repeat_ups = nn.ModuleList() | |||
for i, (u, k) in enumerate( | |||
zip(h.upsample_rates, h.upsample_kernel_sizes)): | |||
upsample = nn.Sequential( | |||
nn.Upsample(mode='nearest', scale_factor=u), | |||
nn.LeakyReLU(LRELU_SLOPE), | |||
Conv1dCasual( | |||
h.upsample_initial_channel // (2**i), | |||
h.upsample_initial_channel // (2**(i + 1)), | |||
kernel_size=7, | |||
stride=1, | |||
padding=7 - 1)) | |||
self.repeat_ups.append(upsample) | |||
self.ups.append( | |||
ConvTranspose1dCausal( | |||
h.upsample_initial_channel // (2**i), | |||
h.upsample_initial_channel // (2**(i + 1)), | |||
k, | |||
u, | |||
padding=(k - u) // 2)) | |||
self.resblocks = nn.ModuleList() | |||
for i in range(len(self.ups)): | |||
ch = h.upsample_initial_channel // (2**(i + 1)) | |||
for j, (k, d) in enumerate( | |||
zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): | |||
self.resblocks.append(resblock(h, ch, k, d)) | |||
self.conv_post = Conv1dCasual(ch, 1, 7, 1, padding=7 - 1) | |||
def forward(self, x): | |||
x = self.conv_pre(x) | |||
for i in range(self.num_upsamples): | |||
            x = torch.sin(x) + x  # periodic residual activation: x + sin(x)
# transconv | |||
x1 = F.leaky_relu(x, LRELU_SLOPE) | |||
x1 = self.ups[i](x1) | |||
# repeat | |||
x2 = self.repeat_ups[i](x) | |||
x = x1 + x2 | |||
xs = None | |||
for j in range(self.num_kernels): | |||
if xs is None: | |||
xs = self.resblocks[i * self.num_kernels + j](x) | |||
else: | |||
xs += self.resblocks[i * self.num_kernels + j](x) | |||
x = xs / self.num_kernels | |||
x = F.leaky_relu(x) | |||
x = self.conv_post(x) | |||
x = torch.tanh(x) | |||
return x | |||
def remove_weight_norm(self): | |||
print('Removing weight norm...') | |||
for layer in self.ups: | |||
layer.remove_weight_norm() | |||
for layer in self.repeat_ups: | |||
layer[-1].remove_weight_norm() | |||
for layer in self.resblocks: | |||
layer.remove_weight_norm() | |||
self.conv_pre.remove_weight_norm() | |||
self.conv_post.remove_weight_norm() | |||
class DiscriminatorP(torch.nn.Module): | |||
def __init__(self, | |||
period, | |||
kernel_size=5, | |||
stride=3, | |||
use_spectral_norm=False): | |||
super(DiscriminatorP, self).__init__() | |||
self.period = period | |||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm | |||
self.convs = nn.ModuleList([ | |||
norm_f( | |||
Conv2d( | |||
1, | |||
32, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(5, 1), 0))), | |||
norm_f( | |||
Conv2d( | |||
32, | |||
128, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(5, 1), 0))), | |||
norm_f( | |||
Conv2d( | |||
128, | |||
512, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(5, 1), 0))), | |||
norm_f( | |||
Conv2d( | |||
512, | |||
1024, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(5, 1), 0))), | |||
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), | |||
]) | |||
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) | |||
def forward(self, x): | |||
fmap = [] | |||
# 1d to 2d | |||
b, c, t = x.shape | |||
if t % self.period != 0: # pad first | |||
n_pad = self.period - (t % self.period) | |||
x = F.pad(x, (0, n_pad), 'reflect') | |||
t = t + n_pad | |||
x = x.view(b, c, t // self.period, self.period) | |||
for layer in self.convs: | |||
x = layer(x) | |||
x = F.leaky_relu(x, LRELU_SLOPE) | |||
fmap.append(x) | |||
x = self.conv_post(x) | |||
fmap.append(x) | |||
x = torch.flatten(x, 1, -1) | |||
return x, fmap | |||
class MultiPeriodDiscriminator(torch.nn.Module): | |||
def __init__(self): | |||
super(MultiPeriodDiscriminator, self).__init__() | |||
self.discriminators = nn.ModuleList([ | |||
DiscriminatorP(2), | |||
DiscriminatorP(3), | |||
DiscriminatorP(5), | |||
DiscriminatorP(7), | |||
DiscriminatorP(11), | |||
]) | |||
def forward(self, y, y_hat): | |||
y_d_rs = [] | |||
y_d_gs = [] | |||
fmap_rs = [] | |||
fmap_gs = [] | |||
for i, d in enumerate(self.discriminators): | |||
y_d_r, fmap_r = d(y) | |||
y_d_g, fmap_g = d(y_hat) | |||
y_d_rs.append(y_d_r) | |||
fmap_rs.append(fmap_r) | |||
y_d_gs.append(y_d_g) | |||
fmap_gs.append(fmap_g) | |||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs | |||
class DiscriminatorS(torch.nn.Module): | |||
def __init__(self, use_spectral_norm=False): | |||
super(DiscriminatorS, self).__init__() | |||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm | |||
self.convs = nn.ModuleList([ | |||
norm_f(Conv1d(1, 128, 15, 1, padding=7)), | |||
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), | |||
norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), | |||
norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), | |||
norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), | |||
norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), | |||
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), | |||
]) | |||
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) | |||
def forward(self, x): | |||
fmap = [] | |||
for layer in self.convs: | |||
x = layer(x) | |||
x = F.leaky_relu(x, LRELU_SLOPE) | |||
fmap.append(x) | |||
x = self.conv_post(x) | |||
fmap.append(x) | |||
x = torch.flatten(x, 1, -1) | |||
return x, fmap | |||
class MultiScaleDiscriminator(torch.nn.Module): | |||
def __init__(self): | |||
super(MultiScaleDiscriminator, self).__init__() | |||
self.discriminators = nn.ModuleList([ | |||
DiscriminatorS(use_spectral_norm=True), | |||
DiscriminatorS(), | |||
DiscriminatorS(), | |||
]) | |||
from pytorch_wavelets import DWT1DForward | |||
self.meanpools = nn.ModuleList( | |||
[DWT1DForward(wave='db3', J=1), | |||
DWT1DForward(wave='db3', J=1)]) | |||
self.convs = nn.ModuleList([ | |||
weight_norm(Conv1d(2, 1, 15, 1, padding=7)), | |||
weight_norm(Conv1d(2, 1, 15, 1, padding=7)) | |||
]) | |||
def forward(self, y, y_hat): | |||
y_d_rs = [] | |||
y_d_gs = [] | |||
fmap_rs = [] | |||
fmap_gs = [] | |||
for i, d in enumerate(self.discriminators): | |||
if i != 0: | |||
yl, yh = self.meanpools[i - 1](y) | |||
y = torch.cat([yl, yh[0]], dim=1) | |||
y = self.convs[i - 1](y) | |||
y = F.leaky_relu(y, LRELU_SLOPE) | |||
yl_hat, yh_hat = self.meanpools[i - 1](y_hat) | |||
y_hat = torch.cat([yl_hat, yh_hat[0]], dim=1) | |||
y_hat = self.convs[i - 1](y_hat) | |||
y_hat = F.leaky_relu(y_hat, LRELU_SLOPE) | |||
y_d_r, fmap_r = d(y) | |||
y_d_g, fmap_g = d(y_hat) | |||
y_d_rs.append(y_d_r) | |||
fmap_rs.append(fmap_r) | |||
y_d_gs.append(y_d_g) | |||
fmap_gs.append(fmap_g) | |||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs | |||
class DiscriminatorSTFT(torch.nn.Module): | |||
def __init__(self, | |||
kernel_size=11, | |||
stride=2, | |||
use_spectral_norm=False, | |||
fft_size=1024, | |||
shift_size=120, | |||
win_length=600, | |||
window='hann_window'): | |||
super(DiscriminatorSTFT, self).__init__() | |||
self.fft_size = fft_size | |||
self.shift_size = shift_size | |||
self.win_length = win_length | |||
norm_f = weight_norm if use_spectral_norm is False else spectral_norm | |||
self.convs = nn.ModuleList([ | |||
norm_f( | |||
Conv2d( | |||
fft_size // 2 + 1, | |||
32, (15, 1), (1, 1), | |||
padding=(get_padding(15, 1), 0))), | |||
norm_f( | |||
Conv2d( | |||
32, | |||
32, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(9, 1), 0))), | |||
norm_f( | |||
Conv2d( | |||
32, | |||
32, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(9, 1), 0))), | |||
norm_f( | |||
Conv2d( | |||
32, | |||
32, (kernel_size, 1), (stride, 1), | |||
padding=(get_padding(9, 1), 0))), | |||
norm_f(Conv2d(32, 32, (5, 1), (1, 1), padding=(2, 0))), | |||
]) | |||
self.conv_post = norm_f(Conv2d(32, 1, (3, 1), (1, 1), padding=(1, 0))) | |||
self.register_buffer('window', getattr(torch, window)(win_length)) | |||
def forward(self, wav): | |||
wav = torch.squeeze(wav, 1) | |||
x_mag = stft(wav, self.fft_size, self.shift_size, self.win_length, | |||
self.window) | |||
x = torch.transpose(x_mag, 2, 1).unsqueeze(-1) | |||
fmap = [] | |||
for layer in self.convs: | |||
x = layer(x) | |||
x = F.leaky_relu(x, LRELU_SLOPE) | |||
fmap.append(x) | |||
x = self.conv_post(x) | |||
fmap.append(x) | |||
x = x.squeeze(-1) | |||
return x, fmap | |||
class MultiSTFTDiscriminator(torch.nn.Module): | |||
def __init__( | |||
self, | |||
fft_sizes=[1024, 2048, 512], | |||
hop_sizes=[120, 240, 50], | |||
win_lengths=[600, 1200, 240], | |||
window='hann_window', | |||
): | |||
super(MultiSTFTDiscriminator, self).__init__() | |||
self.discriminators = nn.ModuleList() | |||
for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): | |||
self.discriminators += [ | |||
DiscriminatorSTFT(fft_size=fs, shift_size=ss, win_length=wl) | |||
] | |||
def forward(self, y, y_hat): | |||
y_d_rs = [] | |||
y_d_gs = [] | |||
fmap_rs = [] | |||
fmap_gs = [] | |||
for i, d in enumerate(self.discriminators): | |||
y_d_r, fmap_r = d(y) | |||
y_d_g, fmap_g = d(y_hat) | |||
y_d_rs.append(y_d_r) | |||
fmap_rs.append(fmap_r) | |||
y_d_gs.append(y_d_g) | |||
fmap_gs.append(fmap_g) | |||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs | |||
def feature_loss(fmap_r, fmap_g): | |||
loss = 0 | |||
for dr, dg in zip(fmap_r, fmap_g): | |||
for rl, gl in zip(dr, dg): | |||
loss += torch.mean(torch.abs(rl - gl)) | |||
return loss * 2 | |||
def discriminator_loss(disc_real_outputs, disc_generated_outputs): | |||
loss = 0 | |||
r_losses = [] | |||
g_losses = [] | |||
for dr, dg in zip(disc_real_outputs, disc_generated_outputs): | |||
r_loss = torch.mean((1 - dr)**2) | |||
g_loss = torch.mean(dg**2) | |||
loss += (r_loss + g_loss) | |||
r_losses.append(r_loss.item()) | |||
g_losses.append(g_loss.item()) | |||
return loss, r_losses, g_losses | |||
def generator_loss(disc_outputs): | |||
loss = 0 | |||
gen_losses = [] | |||
for dg in disc_outputs: | |||
temp_loss = torch.mean((1 - dg)**2) | |||
gen_losses.append(temp_loss) | |||
loss += temp_loss | |||
return loss, gen_losses |
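
# Sketch of how these losses are typically combined in HiFi-GAN style
# training (tensor names are hypothetical; `y` is real audio and `y_g_hat`
# is the Generator output):
#
#   y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(y, y_g_hat)
#   loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)  # discriminator update
#   loss_gen, _ = generator_loss(y_d_gs)                  # generator adversarial term
#   loss_fm = feature_loss(fmap_rs, fmap_gs)              # feature-matching term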
@@ -0,0 +1,59 @@ | |||
import glob | |||
import os | |||
import matplotlib | |||
import matplotlib.pylab as plt | |||
import torch | |||
from torch.nn.utils import weight_norm | |||
matplotlib.use('Agg') | |||
def plot_spectrogram(spectrogram): | |||
fig, ax = plt.subplots(figsize=(10, 2)) | |||
im = ax.imshow( | |||
spectrogram, aspect='auto', origin='lower', interpolation='none') | |||
plt.colorbar(im, ax=ax) | |||
fig.canvas.draw() | |||
plt.close() | |||
return fig | |||
def init_weights(m, mean=0.0, std=0.01): | |||
classname = m.__class__.__name__ | |||
if classname.find('Conv') != -1: | |||
m.weight.data.normal_(mean, std) | |||
def apply_weight_norm(m): | |||
classname = m.__class__.__name__ | |||
if classname.find('Conv') != -1: | |||
weight_norm(m) | |||
def get_padding(kernel_size, dilation=1): | |||
return int((kernel_size * dilation - dilation) / 2) | |||
def load_checkpoint(filepath, device): | |||
assert os.path.isfile(filepath) | |||
print("Loading '{}'".format(filepath)) | |||
checkpoint_dict = torch.load(filepath, map_location=device) | |||
print('Complete.') | |||
return checkpoint_dict | |||
def save_checkpoint(filepath, obj): | |||
print('Saving checkpoint to {}'.format(filepath)) | |||
torch.save(obj, filepath) | |||
print('Complete.') | |||
def scan_checkpoint(cp_dir, prefix): | |||
pattern = os.path.join(cp_dir, prefix + '????????') | |||
cp_list = glob.glob(pattern) | |||
if len(cp_list) == 0: | |||
return None | |||
return sorted(cp_list)[-1] |
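
# For example, scan_checkpoint('ckpts', 'g_') returns the newest file matching
# 'ckpts/g_????????' (e.g. 'ckpts/g_00050000'), or None if no checkpoint
# exists yet.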
@@ -62,4 +62,6 @@ class Model(ABC): | |||
if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): | |||
model_cfg.type = model_cfg.model_type | |||
model_cfg.model_dir = local_model_dir | |||
        for k, v in kwargs.items():
            setattr(model_cfg, k, v)
return build_model(model_cfg, task_name) |
@@ -0,0 +1 @@ | |||
from .image_captioning_model import OfaForImageCaptioning |
@@ -0,0 +1,80 @@ | |||
import os.path as osp | |||
from typing import Any, Dict | |||
from PIL import Image | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from ..base import Model | |||
from ..builder import MODELS | |||
__all__ = ['OfaForImageCaptioning'] | |||
@MODELS.register_module( | |||
Tasks.image_captioning, module_name=r'ofa-image-captioning') | |||
class OfaForImageCaptioning(Model): | |||
def __init__(self, model_dir, *args, **kwargs): | |||
super().__init__(model_dir=model_dir, *args, **kwargs) | |||
ckpt_name = ModelFile.TORCH_MODEL_FILE | |||
local_model = osp.join(model_dir, ckpt_name) | |||
bpe_dir = model_dir | |||
# turn on cuda if GPU is available | |||
from fairseq import checkpoint_utils, tasks, utils | |||
from ofa.tasks.mm_tasks import CaptionTask | |||
from ofa.utils.eval_utils import eval_caption | |||
self.eval_caption = eval_caption | |||
tasks.register_task('caption', CaptionTask) | |||
        use_cuda = kwargs.get('use_cuda', False)
        use_fp16 = kwargs.get('use_fp16', False) and use_cuda
overrides = { | |||
'bpe_dir': bpe_dir, | |||
'eval_cider': False, | |||
'beam': 5, | |||
'max_len_b': 16, | |||
'no_repeat_ngram_size': 3, | |||
'seed': 7 | |||
} | |||
models, cfg, task = checkpoint_utils.load_model_ensemble_and_task( | |||
utils.split_paths(local_model), arg_overrides=overrides) | |||
# Move models to GPU | |||
for model in models: | |||
model.eval() | |||
if use_cuda: | |||
model.cuda() | |||
if use_fp16: | |||
model.half() | |||
model.prepare_for_inference_(cfg) | |||
self.models = models | |||
# Initialize generator | |||
self.generator = task.build_generator(models, cfg.generation) | |||
# Initialize transform | |||
from torchvision import transforms | |||
mean = [0.5, 0.5, 0.5] | |||
std = [0.5, 0.5, 0.5] | |||
self.patch_resize_transform = transforms.Compose([ | |||
lambda image: image.convert('RGB'), | |||
transforms.Resize( | |||
(cfg.task.patch_image_size, cfg.task.patch_image_size), | |||
interpolation=Image.BICUBIC), | |||
transforms.ToTensor(), | |||
transforms.Normalize(mean=mean, std=std), | |||
]) | |||
self.task = task | |||
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
results, _ = self.eval_caption(self.task, self.generator, self.models, | |||
input) | |||
return { | |||
'image_id': results[0]['image_id'], | |||
'caption': results[0]['caption'] | |||
} | |||
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
        # No additional post-processing is needed; forward already returns
        # the decoded caption.
return inputs |
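
# Hypothetical usage sketch (the sample layout follows fairseq conventions
# and is an assumption; `img` is a PIL image):
#
#   model = OfaForImageCaptioning('/path/to/ofa_model')
#   patch = model.patch_resize_transform(img).unsqueeze(0)
#   # forward() expects a preprocessed fairseq-style sample dict (patch
#   # images, masks, ids); see eval_caption for the exact format.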
@@ -1,4 +1,5 @@ | |||
from .masked_language_model import * # noqa F403 | |||
from .sentence_similarity_model import * # noqa F403 | |||
from .sequence_classification_model import * # noqa F403 | |||
from .text_generation_model import * # noqa F403 | |||
from .bert_for_sequence_classification import * # noqa F403 | |||
from .palm_for_text_generation import * # noqa F403 | |||
from .sbert_for_sentence_similarity import * # noqa F403 | |||
from .sbert_for_token_classification import * # noqa F403 |
@@ -0,0 +1,43 @@ | |||
from typing import Dict | |||
from modelscope.utils.constant import Tasks | |||
from ..base import Model, Tensor | |||
from ..builder import MODELS | |||
__all__ = ['PalmForTextGeneration'] | |||
@MODELS.register_module(Tasks.text_generation, module_name=r'palm2.0') | |||
class PalmForTextGeneration(Model): | |||
def __init__(self, model_dir: str, *args, **kwargs): | |||
"""initialize the text generation model from the `model_dir` path. | |||
Args: | |||
model_dir (str): the model path. | |||
model_cls (Optional[Any], optional): model loader, if None, use the | |||
default loader to load model weights, by default None. | |||
""" | |||
super().__init__(model_dir, *args, **kwargs) | |||
self.model_dir = model_dir | |||
from sofa.models.palm_v2 import PalmForConditionalGeneration, Translator | |||
model = PalmForConditionalGeneration.from_pretrained(model_dir) | |||
self.tokenizer = model.tokenizer | |||
self.generator = Translator(model) | |||
def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: | |||
"""return the result by the model | |||
Args: | |||
input (Dict[str, Tensor]): the preprocessed data | |||
Returns: | |||
Dict[str, Tensor]: results | |||
Example: | |||
{ | |||
                'predictions': Tensor([[1377, 4959, 2785, 6392, ...]]),  # token ids to be decoded by the tokenizer
} | |||
""" | |||
return self.generator(**input) |
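
# Hypothetical usage sketch (`inputs` is the dict produced by the matching
# text-generation preprocessor; decode() is assumed to follow the usual
# tokenizer API):
#
#   model = PalmForTextGeneration('/path/to/palm_model')
#   pred = model.forward(inputs)['predictions']
#   text = model.tokenizer.decode(pred[0])  # token ids -> string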
@@ -0,0 +1,56 @@ | |||
from typing import Any, Dict, Union | |||
import numpy as np | |||
import torch | |||
from sofa import SbertConfig, SbertForTokenClassification | |||
from modelscope.utils.constant import Tasks | |||
from ..base import Model, Tensor | |||
from ..builder import MODELS | |||
__all__ = ['StructBertForTokenClassification'] | |||
@MODELS.register_module( | |||
Tasks.word_segmentation, | |||
module_name=r'structbert-chinese-word-segmentation') | |||
class StructBertForTokenClassification(Model): | |||
def __init__(self, model_dir: str, *args, **kwargs): | |||
"""initialize the word segmentation model from the `model_dir` path. | |||
Args: | |||
model_dir (str): the model path. | |||
model_cls (Optional[Any], optional): model loader, if None, use the | |||
default loader to load model weights, by default None. | |||
""" | |||
super().__init__(model_dir, *args, **kwargs) | |||
self.model_dir = model_dir | |||
self.model = SbertForTokenClassification.from_pretrained( | |||
self.model_dir) | |||
self.config = SbertConfig.from_pretrained(self.model_dir) | |||
def forward(self, input: Dict[str, | |||
Any]) -> Dict[str, Union[str, np.ndarray]]: | |||
"""return the result by the model | |||
Args: | |||
input (Dict[str, Any]): the preprocessed data | |||
Returns: | |||
Dict[str, Union[str,np.ndarray]]: results | |||
Example: | |||
{ | |||
                    'predictions': array([1, 4]), # one label id per token
                    'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32), # raw per-token scores
                    'text': '今天',
} | |||
""" | |||
input_ids = torch.tensor(input['input_ids']).unsqueeze(0) | |||
output = self.model(input_ids) | |||
logits = output.logits | |||
pred = torch.argmax(logits[0], dim=-1) | |||
pred = pred.numpy() | |||
rst = {'predictions': pred, 'logits': logits, 'text': input['text']} | |||
return rst |
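A quick smoke test of the forward contract above; the model path and token ids here are hypothetical placeholders.

```python
import torch

model = StructBertForTokenClassification('/path/to/model_dir')  # hypothetical path
inputs = {'input_ids': [101, 791, 1921, 102], 'text': '今天'}  # made-up token ids
with torch.no_grad():
    outputs = model(inputs)
print(outputs['predictions'])  # one label id per input token
```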
@@ -1,52 +0,0 @@ | |||
from typing import Any, Dict | |||
from modelscope.utils.constant import Tasks | |||
from ..base import Model, Tensor | |||
from ..builder import MODELS | |||
__all__ = ['PalmForTextGenerationModel'] | |||
@MODELS.register_module(Tasks.text_generation, module_name=r'palm') | |||
class PalmForTextGenerationModel(Model): | |||
def __init__(self, model_dir: str, *args, **kwargs): | |||
"""initialize the text generation model from the `model_dir` path. | |||
Args: | |||
model_dir (str): the model path. | |||
model_cls (Optional[Any], optional): model loader, if None, use the | |||
default loader to load model weights, by default None. | |||
""" | |||
from sofa import PalmTokenizer | |||
super().__init__(model_dir, *args, **kwargs) | |||
self.model_dir = model_dir | |||
from sofa.models.palm import PalmForConditionalGeneration, TextGenerator | |||
tokenizer = kwargs.pop('tokenizer', | |||
PalmTokenizer.from_pretrained(model_dir)) | |||
model = PalmForConditionalGeneration.from_pretrained(model_dir) | |||
self.generator = TextGenerator(model, tokenizer) | |||
def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: | |||
"""return the result by the model | |||
Args: | |||
input (Dict[str, Any]): the preprocessed data | |||
Returns: | |||
Dict[str, np.ndarray]: results | |||
Example: | |||
{ | |||
                    'predictions': array([1]), # label 0-negative 1-positive
'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32), | |||
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value | |||
} | |||
""" | |||
encoder_inputs = [ | |||
input['input_ids'], input['token_type_ids'], | |||
input['attention_mask'] | |||
] | |||
return self.generator(encoder_inputs) |
@@ -1,4 +1,4 @@ | |||
from .audio import * # noqa F403 | |||
from .audio import LinearAECPipeline | |||
from .base import Pipeline | |||
from .builder import pipeline | |||
from .cv import * # noqa F403 | |||
@@ -0,0 +1,2 @@ | |||
from .linear_aec_pipeline import LinearAECPipeline | |||
from .text_to_speech_pipeline import * # noqa F403 |
@@ -0,0 +1,160 @@ | |||
import importlib | |||
import os | |||
from typing import Any, Dict | |||
import numpy as np | |||
import scipy.io.wavfile as wav | |||
import torch | |||
import yaml | |||
from modelscope.preprocessors.audio import LinearAECAndFbank | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from ..base import Pipeline | |||
from ..builder import PIPELINES | |||
FEATURE_MVN = 'feature.DEY.mvn.txt' | |||
CONFIG_YAML = 'dey_mini.yaml' | |||
def initialize_config(module_cfg): | |||
r"""According to config items, load specific module dynamically with params. | |||
1. Load the module corresponding to the "module" param. | |||
2. Call function (or instantiate class) corresponding to the "main" param. | |||
3. Send the param (in "args") into the function (or class) when calling ( or instantiating). | |||
Args: | |||
module_cfg (dict): config items, eg: | |||
{ | |||
"module": "models.model", | |||
"main": "Model", | |||
"args": {...} | |||
} | |||
Returns: | |||
the module loaded. | |||
""" | |||
module = importlib.import_module(module_cfg['module']) | |||
return getattr(module, module_cfg['main'])(**module_cfg['args']) | |||
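# Example with made-up names:
#   initialize_config({'module': 'torch.nn', 'main': 'Linear',
#                      'args': {'in_features': 8, 'out_features': 2}})
# returns the instance torch.nn.Linear(8, 2).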
@PIPELINES.register_module( | |||
Tasks.speech_signal_process, module_name=r'speech_dfsmn_aec_psm_16k') | |||
class LinearAECPipeline(Pipeline): | |||
r"""AEC Inference Pipeline only support 16000 sample rate. | |||
When invoke the class with pipeline.__call__(), you should provide two params: | |||
Dict[str, Any] | |||
the path of wav files,eg:{ | |||
"nearend_mic": "/your/data/near_end_mic_audio.wav", | |||
"farend_speech": "/your/data/far_end_speech_audio.wav"} | |||
output_path (str, optional): "/your/output/audio_after_aec.wav" | |||
the file path to write generate audio. | |||
""" | |||
def __init__(self, model): | |||
r""" | |||
Args: | |||
model: model id on modelscope hub. | |||
""" | |||
super().__init__(model=model) | |||
self.use_cuda = torch.cuda.is_available() | |||
with open( | |||
os.path.join(self.model, CONFIG_YAML), encoding='utf-8') as f: | |||
self.config = yaml.full_load(f.read()) | |||
self.config['io']['mvn'] = os.path.join(self.model, FEATURE_MVN) | |||
self._init_model() | |||
self.preprocessor = LinearAECAndFbank(self.config['io']) | |||
n_fft = self.config['loss']['args']['n_fft'] | |||
hop_length = self.config['loss']['args']['hop_length'] | |||
winlen = n_fft | |||
window = torch.hamming_window(winlen, periodic=False) | |||
def stft(x): | |||
return torch.stft( | |||
x, | |||
n_fft, | |||
hop_length, | |||
winlen, | |||
center=False, | |||
window=window.to(x.device), | |||
return_complex=False) | |||
def istft(x, slen): | |||
return torch.istft( | |||
x, | |||
n_fft, | |||
hop_length, | |||
winlen, | |||
window=window.to(x.device), | |||
center=False, | |||
length=slen) | |||
self.stft = stft | |||
self.istft = istft | |||
def _init_model(self): | |||
checkpoint = torch.load( | |||
os.path.join(self.model, ModelFile.TORCH_MODEL_BIN_FILE), | |||
map_location='cpu') | |||
self.model = initialize_config(self.config['nnet']) | |||
if self.use_cuda: | |||
self.model = self.model.cuda() | |||
self.model.load_state_dict(checkpoint) | |||
def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
r"""The AEC process. | |||
Args: | |||
inputs: dict={'feature': Tensor, 'base': Tensor} | |||
'feature' feature of input audio. | |||
'base' the base audio to mask. | |||
Returns: | |||
dict: | |||
{ | |||
'output_pcm': generated audio array | |||
} | |||
""" | |||
output_data = self._process(inputs['feature'], inputs['base']) | |||
return {'output_pcm': output_data} | |||
def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: | |||
r"""The post process. Will save audio to file, if the output_path is given. | |||
Args: | |||
inputs: dict: | |||
{ | |||
'output_pcm': generated audio array | |||
} | |||
            kwargs: accepts 'output_path', the file path to write the generated audio to
Returns: | |||
dict: | |||
{ | |||
'output_pcm': generated audio array | |||
} | |||
""" | |||
if 'output_path' in kwargs.keys(): | |||
wav.write(kwargs['output_path'], self.preprocessor.SAMPLE_RATE, | |||
inputs['output_pcm'].astype(np.int16)) | |||
inputs['output_pcm'] = inputs['output_pcm'] / 32768.0 | |||
return inputs | |||
def _process(self, fbanks, mixture): | |||
if self.use_cuda: | |||
fbanks = fbanks.cuda() | |||
mixture = mixture.cuda() | |||
if self.model.vad: | |||
with torch.no_grad(): | |||
masks, vad = self.model(fbanks.unsqueeze(0)) | |||
masks = masks.permute([2, 1, 0]) | |||
else: | |||
with torch.no_grad(): | |||
masks = self.model(fbanks.unsqueeze(0)) | |||
masks = masks.permute([2, 1, 0]) | |||
spectrum = self.stft(mixture) | |||
masked_spec = spectrum * masks | |||
masked_sig = self.istft(masked_spec, len(mixture)).cpu().numpy() | |||
return masked_sig |
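End to end, the pipeline above can be driven as below. The paths are placeholders and the hub model id is assumed to match the `speech_dfsmn_aec_psm_16k` module name registered above; both wavs must be 16 kHz as noted in the class docstring.

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

aec = pipeline(
    Tasks.speech_signal_process,
    model='damo/speech_dfsmn_aec_psm_16k')  # assumed hub id for this module
result = aec(
    {
        'nearend_mic': '/your/data/near_end_mic_audio.wav',
        'farend_speech': '/your/data/far_end_speech_audio.wav'
    },
    output_path='/your/output/audio_after_aec.wav')
print(result['output_pcm'].shape)  # float32 samples scaled to [-1, 1]
```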
@@ -0,0 +1,46 @@ | |||
import time | |||
from typing import Any, Dict, List | |||
import numpy as np | |||
from modelscope.models import Model | |||
from modelscope.models.audio.tts.am import SambertNetHifi16k | |||
from modelscope.models.audio.tts.vocoder import Hifigan16k | |||
from modelscope.pipelines.base import Pipeline | |||
from modelscope.pipelines.builder import PIPELINES | |||
from modelscope.preprocessors import TextToTacotronSymbols, build_preprocessor | |||
from modelscope.utils.constant import Fields, Tasks | |||
__all__ = ['TextToSpeechSambertHifigan16kPipeline'] | |||
@PIPELINES.register_module( | |||
Tasks.text_to_speech, module_name=r'tts-sambert-hifigan-16k') | |||
class TextToSpeechSambertHifigan16kPipeline(Pipeline): | |||
def __init__(self, | |||
config_file: str = None, | |||
model: List[Model] = None, | |||
preprocessor: TextToTacotronSymbols = None, | |||
**kwargs): | |||
super().__init__( | |||
config_file=config_file, | |||
model=model, | |||
preprocessor=preprocessor, | |||
**kwargs) | |||
        assert len(model) == 2, 'the TTS pipeline expects two models: [am, vocoder]'
self._am = model[0] | |||
self._vocoder = model[1] | |||
self._preprocessor = preprocessor | |||
def forward(self, inputs: Dict[str, Any]) -> Dict[str, np.ndarray]: | |||
texts = inputs['texts'] | |||
audio_total = np.empty((0), dtype='int16') | |||
for line in texts: | |||
line = line.strip().split('\t') | |||
audio = self._vocoder.forward(self._am.forward(line[1])) | |||
audio_total = np.append(audio_total, audio, axis=0) | |||
return {'output': audio_total} | |||
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
return inputs |
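A sketch of this pipeline's contract, assuming `am`, `vocoder` and `symbols_preprocessor` have been built elsewhere (e.g. via `Model.from_pretrained` and `build_preprocessor`); those names are placeholders, not part of this change.

```python
import scipy.io.wavfile as wav

# am, vocoder and symbols_preprocessor: assumed to be constructed elsewhere.
# The preprocessor emits tab-separated symbol lines; each line is synthesized
# by the acoustic model, rendered by the vocoder, and the int16 PCM chunks
# are concatenated into one array.
tts = TextToSpeechSambertHifigan16kPipeline(
    model=[am, vocoder], preprocessor=symbols_preprocessor)
audio = tts('今天天气不错')['output']
wav.write('tts_out.wav', 16000, audio)  # 16 kHz mono output
```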
@@ -13,18 +13,23 @@ PIPELINES = Registry('pipelines') | |||
DEFAULT_MODEL_FOR_PIPELINE = { | |||
# TaskName: (pipeline_module_name, model_repo) | |||
Tasks.word_segmentation: | |||
('structbert-chinese-word-segmentation', | |||
'damo/nlp_structbert_word-segmentation_chinese-base'), | |||
Tasks.sentence_similarity: | |||
('sbert-base-chinese-sentence-similarity', | |||
'damo/nlp_structbert_sentence-similarity_chinese-base'), | |||
Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting_damo'), | |||
Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting'), | |||
Tasks.text_classification: | |||
('bert-sentiment-analysis', 'damo/bert-base-sst2'), | |||
Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'), | |||
Tasks.image_captioning: ('ofa', None), | |||
Tasks.text_generation: ('palm2.0', | |||
'damo/nlp_palm2.0_text-generation_chinese-base'), | |||
Tasks.image_captioning: ('ofa', 'damo/ofa_image-caption_coco_large_en'), | |||
Tasks.image_generation: | |||
('person-image-cartoon', | |||
'damo/cv_unet_person-image-cartoon_compound-models'), | |||
Tasks.fill_mask: ('sbert', 'damo/nlp_structbert_fill-mask_chinese-large'), | |||
Tasks.ocr_detection: ('ocr-detection', | |||
'damo/cv_resnet18_ocr-detection-line-level_damo'), | |||
Tasks.fill_mask: ('veco', 'damo/nlp_veco_fill-mask_large') | |||
} | |||
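With this table in place, a pipeline can be built from the task alone; a minimal sketch, using the word segmentation fallback registered above (the expected output follows the TASK_OUTPUTS example later in this change):

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# No model given: the builder falls back to DEFAULT_MODEL_FOR_PIPELINE,
# here 'damo/nlp_structbert_word-segmentation_chinese-base'.
seg = pipeline(Tasks.word_segmentation)
print(seg('今天天气不错,适合出去游玩')['output'])
# -> '今天 天气 不错 , 适合 出去 游玩'
```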
@@ -1,2 +1,3 @@ | |||
from .image_cartoon_pipeline import ImageCartoonPipeline | |||
from .image_matting_pipeline import ImageMattingPipeline | |||
from .ocr_detection_pipeline import OCRDetectionPipeline |
@@ -0,0 +1,167 @@ | |||
import math | |||
import os | |||
import os.path as osp | |||
import sys | |||
from typing import Any, Dict, List, Tuple, Union | |||
import cv2 | |||
import numpy as np | |||
import PIL | |||
import tensorflow as tf | |||
import tf_slim as slim | |||
from modelscope.pipelines.base import Input | |||
from modelscope.preprocessors import load_image | |||
from modelscope.utils.constant import ModelFile, Tasks | |||
from modelscope.utils.logger import get_logger | |||
from ..base import Pipeline | |||
from ..builder import PIPELINES | |||
from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils | |||
if tf.__version__ >= '2.0': | |||
tf = tf.compat.v1 | |||
tf.compat.v1.disable_eager_execution() | |||
logger = get_logger() | |||
# constant | |||
RBOX_DIM = 5 | |||
OFFSET_DIM = 6 | |||
WORD_POLYGON_DIM = 8 | |||
OFFSET_VARIANCE = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1] | |||
FLAGS = tf.app.flags.FLAGS | |||
tf.app.flags.DEFINE_float('node_threshold', 0.4, | |||
'Confidence threshold for nodes') | |||
tf.app.flags.DEFINE_float('link_threshold', 0.6, | |||
'Confidence threshold for links') | |||
@PIPELINES.register_module( | |||
Tasks.ocr_detection, module_name=Tasks.ocr_detection) | |||
class OCRDetectionPipeline(Pipeline): | |||
def __init__(self, model: str): | |||
super().__init__(model=model) | |||
model_path = osp.join( | |||
osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER), | |||
'checkpoint-80000') | |||
config = tf.ConfigProto(allow_soft_placement=True) | |||
config.gpu_options.allow_growth = True | |||
self._session = tf.Session(config=config) | |||
global_step = tf.get_variable( | |||
'global_step', [], | |||
initializer=tf.constant_initializer(0), | |||
dtype=tf.int64, | |||
trainable=False) | |||
variable_averages = tf.train.ExponentialMovingAverage( | |||
0.997, global_step) | |||
self.input_images = tf.placeholder( | |||
tf.float32, shape=[1, 1024, 1024, 3], name='input_images') | |||
self.output = {} | |||
# detector | |||
detector = model_resnet_mutex_v4_linewithchar.SegLinkDetector() | |||
all_maps = detector.build_model(self.input_images, is_training=False) | |||
# decode local predictions | |||
all_nodes, all_links, all_reg = [], [], [] | |||
for i, maps in enumerate(all_maps): | |||
cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[2] | |||
reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE) | |||
cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2])) | |||
lnk_prob_pos = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, :2]) | |||
lnk_prob_mut = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, 2:]) | |||
lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], axis=1) | |||
all_nodes.append(cls_prob) | |||
all_links.append(lnk_prob) | |||
all_reg.append(reg_maps) | |||
# decode segments and links | |||
image_size = tf.shape(self.input_images)[1:3] | |||
segments, group_indices, segment_counts, _ = ops.decode_segments_links_python( | |||
image_size, | |||
all_nodes, | |||
all_links, | |||
all_reg, | |||
anchor_sizes=list(detector.anchor_sizes)) | |||
# combine segments | |||
combined_rboxes, combined_counts = ops.combine_segments_python( | |||
segments, group_indices, segment_counts) | |||
self.output['combined_rboxes'] = combined_rboxes | |||
self.output['combined_counts'] = combined_counts | |||
with self._session.as_default() as sess: | |||
logger.info(f'loading model from {model_path}') | |||
# load model | |||
model_loader = tf.train.Saver( | |||
variable_averages.variables_to_restore()) | |||
model_loader.restore(sess, model_path) | |||
def preprocess(self, input: Input) -> Dict[str, Any]: | |||
if isinstance(input, str): | |||
img = np.array(load_image(input)) | |||
elif isinstance(input, PIL.Image.Image): | |||
img = np.array(input.convert('RGB')) | |||
        elif isinstance(input, np.ndarray):
            if len(input.shape) == 2:
                input = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
            img = input[:, :, ::-1]  # convert BGR to RGB order
        else:
            raise TypeError(f'input should be either str, PIL.Image or'
                            f' np.ndarray, but got {type(input)}')
h, w, c = img.shape | |||
img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32) | |||
img_pad[:h, :w, :] = img | |||
resize_size = 1024 | |||
img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size)) | |||
img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR) | |||
img_pad_resize = img_pad_resize - np.array([123.68, 116.78, 103.94], | |||
dtype=np.float32) | |||
resize_size = tf.stack([resize_size, resize_size]) | |||
orig_size = tf.stack([max(h, w), max(h, w)]) | |||
self.output['orig_size'] = orig_size | |||
self.output['resize_size'] = resize_size | |||
result = {'img': np.expand_dims(img_pad_resize, axis=0)} | |||
return result | |||
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
with self._session.as_default(): | |||
feed_dict = {self.input_images: input['img']} | |||
sess_outputs = self._session.run(self.output, feed_dict=feed_dict) | |||
return sess_outputs | |||
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
rboxes = inputs['combined_rboxes'][0] | |||
count = inputs['combined_counts'][0] | |||
rboxes = rboxes[:count, :] | |||
# convert rboxes to polygons and find its coordinates on the original image | |||
orig_h, orig_w = inputs['orig_size'] | |||
resize_h, resize_w = inputs['resize_size'] | |||
polygons = utils.rboxes_to_polygons(rboxes) | |||
scale_y = float(orig_h) / float(resize_h) | |||
scale_x = float(orig_w) / float(resize_w) | |||
# confine polygons inside image | |||
polygons[:, ::2] = np.maximum( | |||
0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1)) | |||
polygons[:, 1::2] = np.maximum( | |||
0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1)) | |||
polygons = np.round(polygons).astype(np.int32) | |||
# nms | |||
dt_n9 = [o + [utils.cal_width(o)] for o in polygons.tolist()] | |||
dt_nms = utils.nms_python(dt_n9) | |||
dt_polygons = np.array([o[:8] for o in dt_nms]) | |||
result = {'det_polygons': dt_polygons} | |||
return result |
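Usage sketch for the OCR detector; the image path is a placeholder, and the model id matches the default registered in the builder above.

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

ocr = pipeline(
    Tasks.ocr_detection,
    model='damo/cv_resnet18_ocr-detection-line-level_damo')
result = ocr('data/test/images/ocr_detection.jpg')  # placeholder path
print(result['det_polygons'])  # [num_text, 8] corner coordinates after NMS
```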
@@ -0,0 +1,158 @@ | |||
import tensorflow as tf | |||
import tf_slim as slim | |||
from . import ops, resnet18_v1, resnet_utils | |||
if tf.__version__ >= '2.0': | |||
tf = tf.compat.v1 | |||
# constants | |||
OFFSET_DIM = 6 | |||
N_LOCAL_LINKS = 8 | |||
N_CROSS_LINKS = 4 | |||
N_SEG_CLASSES = 2 | |||
N_LNK_CLASSES = 4 | |||
POS_LABEL = 1 | |||
NEG_LABEL = 0 | |||
class SegLinkDetector(): | |||
def __init__(self): | |||
self.anchor_sizes = [6., 11.84210526, 23.68421053, 45., 90., 150.] | |||
def _detection_classifier(self, | |||
maps, | |||
ksize, | |||
weight_decay, | |||
cross_links=False, | |||
scope=None): | |||
with tf.variable_scope(scope): | |||
seg_depth = N_SEG_CLASSES | |||
if cross_links: | |||
lnk_depth = N_LNK_CLASSES * (N_LOCAL_LINKS + N_CROSS_LINKS) | |||
else: | |||
lnk_depth = N_LNK_CLASSES * N_LOCAL_LINKS | |||
reg_depth = OFFSET_DIM | |||
map_depth = maps.get_shape()[3] | |||
inter_maps, inter_relu = ops.conv2d( | |||
maps, map_depth, 256, 1, 1, 'SAME', scope='conv_inter') | |||
dir_maps, dir_relu = ops.conv2d( | |||
inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_dir') | |||
cen_maps, cen_relu = ops.conv2d( | |||
inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_cen') | |||
pol_maps, pol_relu = ops.conv2d( | |||
inter_relu, 256, 8, ksize, 1, 'SAME', scope='conv_pol') | |||
concat_relu = tf.concat([dir_relu, cen_relu, pol_relu], axis=-1) | |||
_, lnk_embedding = ops.conv_relu( | |||
concat_relu, 12, 256, 1, 1, scope='lnk_embedding') | |||
lnk_maps, lnk_relu = ops.conv2d( | |||
inter_relu + lnk_embedding, | |||
256, | |||
lnk_depth, | |||
ksize, | |||
1, | |||
'SAME', | |||
scope='conv_lnk') | |||
char_seg_maps, char_seg_relu = ops.conv2d( | |||
inter_relu, | |||
256, | |||
seg_depth, | |||
ksize, | |||
1, | |||
'SAME', | |||
scope='conv_char_cls') | |||
char_reg_maps, char_reg_relu = ops.conv2d( | |||
inter_relu, | |||
256, | |||
reg_depth, | |||
ksize, | |||
1, | |||
'SAME', | |||
scope='conv_char_reg') | |||
concat_char_relu = tf.concat([char_seg_relu, char_reg_relu], | |||
axis=-1) | |||
_, char_embedding = ops.conv_relu( | |||
concat_char_relu, 8, 256, 1, 1, scope='conv_char_embedding') | |||
seg_maps, seg_relu = ops.conv2d( | |||
inter_relu + char_embedding, | |||
256, | |||
seg_depth, | |||
ksize, | |||
1, | |||
'SAME', | |||
scope='conv_cls') | |||
reg_maps, reg_relu = ops.conv2d( | |||
inter_relu + char_embedding, | |||
256, | |||
reg_depth, | |||
ksize, | |||
1, | |||
'SAME', | |||
scope='conv_reg') | |||
return seg_relu, lnk_relu, reg_relu | |||
def _build_cnn(self, images, weight_decay, is_training): | |||
with slim.arg_scope( | |||
resnet18_v1.resnet_arg_scope(weight_decay=weight_decay)): | |||
logits, end_points = resnet18_v1.resnet_v1_18( | |||
images, is_training=is_training, scope='resnet_v1_18') | |||
outputs = { | |||
'conv3_3': end_points['pool1'], | |||
'conv4_3': end_points['pool2'], | |||
'fc7': end_points['pool3'], | |||
'conv8_2': end_points['pool4'], | |||
'conv9_2': end_points['pool5'], | |||
'conv10_2': end_points['pool6'], | |||
} | |||
return outputs | |||
def build_model(self, images, is_training=True, scope=None): | |||
weight_decay = 5e-4 # FLAGS.weight_decay | |||
cnn_outputs = self._build_cnn(images, weight_decay, is_training) | |||
det_0 = self._detection_classifier( | |||
cnn_outputs['conv3_3'], | |||
3, | |||
weight_decay, | |||
cross_links=False, | |||
scope='dete_0') | |||
det_1 = self._detection_classifier( | |||
cnn_outputs['conv4_3'], | |||
3, | |||
weight_decay, | |||
cross_links=True, | |||
scope='dete_1') | |||
det_2 = self._detection_classifier( | |||
cnn_outputs['fc7'], | |||
3, | |||
weight_decay, | |||
cross_links=True, | |||
scope='dete_2') | |||
det_3 = self._detection_classifier( | |||
cnn_outputs['conv8_2'], | |||
3, | |||
weight_decay, | |||
cross_links=True, | |||
scope='dete_3') | |||
det_4 = self._detection_classifier( | |||
cnn_outputs['conv9_2'], | |||
3, | |||
weight_decay, | |||
cross_links=True, | |||
scope='dete_4') | |||
det_5 = self._detection_classifier( | |||
cnn_outputs['conv10_2'], | |||
3, | |||
weight_decay, | |||
cross_links=True, | |||
scope='dete_5') | |||
outputs = [det_0, det_1, det_2, det_3, det_4, det_5] | |||
return outputs |
@@ -0,0 +1,432 @@ | |||
"""Contains definitions for the original form of Residual Networks. | |||
The 'v1' residual networks (ResNets) implemented in this module were proposed | |||
by: | |||
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun | |||
Deep Residual Learning for Image Recognition. arXiv:1512.03385 | |||
Other variants were introduced in: | |||
[2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun | |||
Identity Mappings in Deep Residual Networks. arXiv: 1603.05027 | |||
The networks defined in this module utilize the bottleneck building block of | |||
[1] with projection shortcuts only for increasing depths. They employ batch | |||
normalization *after* every weight layer. This is the architecture used by | |||
MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and | |||
ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1' | |||
architecture and the alternative 'v2' architecture of [2] which uses batch | |||
normalization *before* every weight layer in the so-called full pre-activation | |||
units. | |||
Typical use: | |||
from tensorflow.contrib.slim.nets import resnet_v1 | |||
ResNet-101 for image classification into 1000 classes: | |||
# inputs has shape [batch, 224, 224, 3] | |||
with slim.arg_scope(resnet_v1.resnet_arg_scope()): | |||
net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False) | |||
ResNet-101 for semantic segmentation into 21 classes: | |||
# inputs has shape [batch, 513, 513, 3] | |||
with slim.arg_scope(resnet_v1.resnet_arg_scope()): | |||
net, end_points = resnet_v1.resnet_v1_101(inputs, | |||
21, | |||
is_training=False, | |||
global_pool=False, | |||
output_stride=16) | |||
""" | |||
import tensorflow as tf | |||
import tf_slim as slim | |||
from . import resnet_utils | |||
if tf.__version__ >= '2.0': | |||
tf = tf.compat.v1 | |||
resnet_arg_scope = resnet_utils.resnet_arg_scope | |||
@slim.add_arg_scope | |||
def basicblock(inputs, | |||
depth, | |||
depth_bottleneck, | |||
stride, | |||
rate=1, | |||
outputs_collections=None, | |||
scope=None): | |||
"""Bottleneck residual unit variant with BN after convolutions. | |||
This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for | |||
its definition. Note that we use here the bottleneck variant which has an | |||
extra bottleneck layer. | |||
When putting together two consecutive ResNet blocks that use this unit, one | |||
should use stride = 2 in the last unit of the first block. | |||
Args: | |||
inputs: A tensor of size [batch, height, width, channels]. | |||
depth: The depth of the ResNet unit output. | |||
depth_bottleneck: The depth of the bottleneck layers. | |||
stride: The ResNet unit's stride. Determines the amount of downsampling of | |||
the units output compared to its input. | |||
rate: An integer, rate for atrous convolution. | |||
outputs_collections: Collection to add the ResNet unit output. | |||
scope: Optional variable_scope. | |||
Returns: | |||
The ResNet unit's output. | |||
""" | |||
with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: | |||
depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) | |||
if depth == depth_in: | |||
shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') | |||
else: | |||
shortcut = slim.conv2d( | |||
inputs, | |||
depth, [1, 1], | |||
stride=stride, | |||
activation_fn=None, | |||
scope='shortcut') | |||
residual = resnet_utils.conv2d_same( | |||
inputs, depth, 3, stride, rate=rate, scope='conv1') | |||
residual = resnet_utils.conv2d_same( | |||
residual, depth, 3, 1, rate=rate, scope='conv2') | |||
output = tf.nn.relu(residual + shortcut) | |||
return slim.utils.collect_named_outputs(outputs_collections, | |||
sc.original_name_scope, output) | |||
@slim.add_arg_scope | |||
def bottleneck(inputs, | |||
depth, | |||
depth_bottleneck, | |||
stride, | |||
rate=1, | |||
outputs_collections=None, | |||
scope=None): | |||
"""Bottleneck residual unit variant with BN after convolutions. | |||
This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for | |||
its definition. Note that we use here the bottleneck variant which has an | |||
extra bottleneck layer. | |||
When putting together two consecutive ResNet blocks that use this unit, one | |||
should use stride = 2 in the last unit of the first block. | |||
Args: | |||
inputs: A tensor of size [batch, height, width, channels]. | |||
depth: The depth of the ResNet unit output. | |||
depth_bottleneck: The depth of the bottleneck layers. | |||
stride: The ResNet unit's stride. Determines the amount of downsampling of | |||
the units output compared to its input. | |||
rate: An integer, rate for atrous convolution. | |||
outputs_collections: Collection to add the ResNet unit output. | |||
scope: Optional variable_scope. | |||
Returns: | |||
The ResNet unit's output. | |||
""" | |||
with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: | |||
depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) | |||
if depth == depth_in: | |||
shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') | |||
else: | |||
shortcut = slim.conv2d( | |||
inputs, | |||
depth, [1, 1], | |||
stride=stride, | |||
activation_fn=None, | |||
scope='shortcut') | |||
residual = slim.conv2d( | |||
inputs, depth_bottleneck, [1, 1], stride=1, scope='conv1') | |||
residual = resnet_utils.conv2d_same( | |||
residual, depth_bottleneck, 3, stride, rate=rate, scope='conv2') | |||
residual = slim.conv2d( | |||
residual, | |||
depth, [1, 1], | |||
stride=1, | |||
activation_fn=None, | |||
scope='conv3') | |||
output = tf.nn.relu(shortcut + residual) | |||
return slim.utils.collect_named_outputs(outputs_collections, | |||
sc.original_name_scope, output) | |||
def resnet_v1(inputs, | |||
blocks, | |||
num_classes=None, | |||
is_training=True, | |||
global_pool=True, | |||
output_stride=None, | |||
include_root_block=True, | |||
spatial_squeeze=True, | |||
reuse=None, | |||
scope=None): | |||
"""Generator for v1 ResNet models. | |||
This function generates a family of ResNet v1 models. See the resnet_v1_*() | |||
methods for specific model instantiations, obtained by selecting different | |||
block instantiations that produce ResNets of various depths. | |||
Training for image classification on Imagenet is usually done with [224, 224] | |||
inputs, resulting in [7, 7] feature maps at the output of the last ResNet | |||
block for the ResNets defined in [1] that have nominal stride equal to 32. | |||
However, for dense prediction tasks we advise that one uses inputs with | |||
spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In | |||
this case the feature maps at the ResNet output will have spatial shape | |||
[(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] | |||
and corners exactly aligned with the input image corners, which greatly | |||
facilitates alignment of the features to the image. Using as input [225, 225] | |||
images results in [8, 8] feature maps at the output of the last ResNet block. | |||
For dense prediction tasks, the ResNet needs to run in fully-convolutional | |||
(FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all | |||
have nominal stride equal to 32 and a good choice in FCN mode is to use | |||
output_stride=16 in order to increase the density of the computed features at | |||
small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915. | |||
Args: | |||
inputs: A tensor of size [batch, height_in, width_in, channels]. | |||
blocks: A list of length equal to the number of ResNet blocks. Each element | |||
is a resnet_utils.Block object describing the units in the block. | |||
num_classes: Number of predicted classes for classification tasks. If None | |||
we return the features before the logit layer. | |||
is_training: whether is training or not. | |||
global_pool: If True, we perform global average pooling before computing the | |||
logits. Set to True for image classification, False for dense prediction. | |||
output_stride: If None, then the output will be computed at the nominal | |||
network stride. If output_stride is not None, it specifies the requested | |||
ratio of input to output spatial resolution. | |||
include_root_block: If True, include the initial convolution followed by | |||
max-pooling, if False excludes it. | |||
spatial_squeeze: if True, logits is of shape [B, C], if false logits is | |||
of shape [B, 1, 1, C], where B is batch_size and C is number of classes. | |||
reuse: whether or not the network and its variables should be reused. To be | |||
able to reuse 'scope' must be given. | |||
scope: Optional variable_scope. | |||
Returns: | |||
net: A rank-4 tensor of size [batch, height_out, width_out, channels_out]. | |||
If global_pool is False, then height_out and width_out are reduced by a | |||
factor of output_stride compared to the respective height_in and width_in, | |||
else both height_out and width_out equal one. If num_classes is None, then | |||
net is the output of the last ResNet block, potentially after global | |||
average pooling. If num_classes is not None, net contains the pre-softmax | |||
activations. | |||
end_points: A dictionary from components of the network to the corresponding | |||
activation. | |||
Raises: | |||
ValueError: If the target output_stride is not valid. | |||
""" | |||
with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc: | |||
end_points_collection = sc.name + '_end_points' | |||
with slim.arg_scope( | |||
[slim.conv2d, bottleneck, resnet_utils.stack_blocks_dense], | |||
outputs_collections=end_points_collection): | |||
with slim.arg_scope([slim.batch_norm], is_training=is_training): | |||
net = inputs | |||
if include_root_block: | |||
if output_stride is not None: | |||
if output_stride % 4 != 0: | |||
raise ValueError( | |||
'The output_stride needs to be a multiple of 4.' | |||
) | |||
output_stride /= 4 | |||
net = resnet_utils.conv2d_same( | |||
net, 64, 7, stride=2, scope='conv1') | |||
net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]]) | |||
net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1') | |||
net = slim.utils.collect_named_outputs( | |||
end_points_collection, 'pool2', net) | |||
net = resnet_utils.stack_blocks_dense(net, blocks, | |||
output_stride) | |||
end_points = slim.utils.convert_collection_to_dict( | |||
end_points_collection) | |||
end_points['pool1'] = end_points['resnet_v1_18/block2/unit_2'] | |||
end_points['pool2'] = end_points['resnet_v1_18/block3/unit_2'] | |||
end_points['pool3'] = end_points['resnet_v1_18/block4/unit_2'] | |||
end_points['pool4'] = end_points['resnet_v1_18/block5/unit_2'] | |||
end_points['pool5'] = end_points['resnet_v1_18/block6/unit_2'] | |||
end_points['pool6'] = net | |||
return net, end_points | |||
resnet_v1.default_image_size = 224 | |||
def resnet_v1_18(inputs, | |||
num_classes=None, | |||
is_training=True, | |||
global_pool=True, | |||
output_stride=None, | |||
spatial_squeeze=True, | |||
reuse=None, | |||
scope='resnet_v1_18'): | |||
"""ResNet-18 model of [1]. See resnet_v1() for arg and return description.""" | |||
blocks = [ | |||
resnet_utils.Block('block1', basicblock, | |||
[(64, 64, 1)] + [(64, 64, 1)]), | |||
resnet_utils.Block('block2', basicblock, | |||
[(128, 128, 1)] + [(128, 128, 1)]), | |||
resnet_utils.Block('block3', basicblock, | |||
[(256, 256, 2)] + [(256, 256, 1)]), | |||
resnet_utils.Block('block4', basicblock, | |||
[(512, 512, 2)] + [(512, 512, 1)]), | |||
resnet_utils.Block('block5', basicblock, | |||
[(256, 256, 2)] + [(256, 256, 1)]), | |||
resnet_utils.Block('block6', basicblock, | |||
[(256, 256, 2)] + [(256, 256, 1)]), | |||
resnet_utils.Block('block7', basicblock, | |||
[(256, 256, 2)] + [(256, 256, 1)]), | |||
] | |||
return resnet_v1( | |||
inputs, | |||
blocks, | |||
num_classes, | |||
is_training, | |||
global_pool=global_pool, | |||
output_stride=output_stride, | |||
include_root_block=True, | |||
spatial_squeeze=spatial_squeeze, | |||
reuse=reuse, | |||
scope=scope) | |||
resnet_v1_18.default_image_size = resnet_v1.default_image_size | |||
def resnet_v1_50(inputs, | |||
num_classes=None, | |||
is_training=True, | |||
global_pool=True, | |||
output_stride=None, | |||
spatial_squeeze=True, | |||
reuse=None, | |||
scope='resnet_v1_50'): | |||
"""ResNet-50 model of [1]. See resnet_v1() for arg and return description.""" | |||
blocks = [ | |||
resnet_utils.Block('block1', bottleneck, | |||
[(256, 64, 1)] * 2 + [(256, 64, 2)]), | |||
resnet_utils.Block('block2', bottleneck, | |||
[(512, 128, 1)] * 3 + [(512, 128, 2)]), | |||
resnet_utils.Block('block3', bottleneck, | |||
[(1024, 256, 1)] * 5 + [(1024, 256, 2)]), | |||
resnet_utils.Block('block4', bottleneck, | |||
[(2048, 512, 1)] * 3 + [(2048, 512, 2)]), | |||
resnet_utils.Block('block5', bottleneck, | |||
[(1024, 256, 1)] * 2 + [(1024, 256, 2)]), | |||
resnet_utils.Block('block6', bottleneck, [(1024, 256, 1)] * 2), | |||
] | |||
return resnet_v1( | |||
inputs, | |||
blocks, | |||
num_classes, | |||
is_training, | |||
global_pool=global_pool, | |||
output_stride=output_stride, | |||
include_root_block=True, | |||
spatial_squeeze=spatial_squeeze, | |||
reuse=reuse, | |||
scope=scope) | |||
resnet_v1_50.default_image_size = resnet_v1.default_image_size | |||
def resnet_v1_101(inputs, | |||
num_classes=None, | |||
is_training=True, | |||
global_pool=True, | |||
output_stride=None, | |||
spatial_squeeze=True, | |||
reuse=None, | |||
scope='resnet_v1_101'): | |||
"""ResNet-101 model of [1]. See resnet_v1() for arg and return description.""" | |||
blocks = [ | |||
resnet_utils.Block('block1', bottleneck, | |||
[(256, 64, 1)] * 2 + [(256, 64, 2)]), | |||
resnet_utils.Block('block2', bottleneck, | |||
[(512, 128, 1)] * 3 + [(512, 128, 2)]), | |||
resnet_utils.Block('block3', bottleneck, | |||
[(1024, 256, 1)] * 22 + [(1024, 256, 2)]), | |||
resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3) | |||
] | |||
return resnet_v1( | |||
inputs, | |||
blocks, | |||
num_classes, | |||
is_training, | |||
global_pool=global_pool, | |||
output_stride=output_stride, | |||
include_root_block=True, | |||
spatial_squeeze=spatial_squeeze, | |||
reuse=reuse, | |||
scope=scope) | |||
resnet_v1_101.default_image_size = resnet_v1.default_image_size | |||
def resnet_v1_152(inputs, | |||
num_classes=None, | |||
is_training=True, | |||
global_pool=True, | |||
output_stride=None, | |||
spatial_squeeze=True, | |||
reuse=None, | |||
scope='resnet_v1_152'): | |||
"""ResNet-152 model of [1]. See resnet_v1() for arg and return description.""" | |||
blocks = [ | |||
resnet_utils.Block('block1', bottleneck, | |||
[(256, 64, 1)] * 2 + [(256, 64, 2)]), | |||
resnet_utils.Block('block2', bottleneck, | |||
[(512, 128, 1)] * 7 + [(512, 128, 2)]), | |||
resnet_utils.Block('block3', bottleneck, | |||
[(1024, 256, 1)] * 35 + [(1024, 256, 2)]), | |||
resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3) | |||
] | |||
return resnet_v1( | |||
inputs, | |||
blocks, | |||
num_classes, | |||
is_training, | |||
global_pool=global_pool, | |||
output_stride=output_stride, | |||
include_root_block=True, | |||
spatial_squeeze=spatial_squeeze, | |||
reuse=reuse, | |||
scope=scope) | |||
resnet_v1_152.default_image_size = resnet_v1.default_image_size | |||
def resnet_v1_200(inputs, | |||
num_classes=None, | |||
is_training=True, | |||
global_pool=True, | |||
output_stride=None, | |||
spatial_squeeze=True, | |||
reuse=None, | |||
scope='resnet_v1_200'): | |||
"""ResNet-200 model of [2]. See resnet_v1() for arg and return description.""" | |||
blocks = [ | |||
resnet_utils.Block('block1', bottleneck, | |||
[(256, 64, 1)] * 2 + [(256, 64, 2)]), | |||
resnet_utils.Block('block2', bottleneck, | |||
[(512, 128, 1)] * 23 + [(512, 128, 2)]), | |||
resnet_utils.Block('block3', bottleneck, | |||
[(1024, 256, 1)] * 35 + [(1024, 256, 2)]), | |||
resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3) | |||
] | |||
return resnet_v1( | |||
inputs, | |||
blocks, | |||
num_classes, | |||
is_training, | |||
global_pool=global_pool, | |||
output_stride=output_stride, | |||
include_root_block=True, | |||
spatial_squeeze=spatial_squeeze, | |||
reuse=reuse, | |||
scope=scope) | |||
resnet_v1_200.default_image_size = resnet_v1.default_image_size | |||
if __name__ == '__main__':
    inputs = tf.placeholder(tf.float32, shape=(None, 224, 224, 3), name='input')
    with slim.arg_scope(resnet_arg_scope()):
        net, end_points = resnet_v1_50(inputs)
@@ -0,0 +1,231 @@ | |||
"""Contains building blocks for various versions of Residual Networks. | |||
Residual networks (ResNets) were proposed in: | |||
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun | |||
Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015 | |||
More variants were introduced in: | |||
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun | |||
Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016 | |||
We can obtain different ResNet variants by changing the network depth, width, | |||
and form of residual unit. This module implements the infrastructure for | |||
building them. Concrete ResNet units and full ResNet networks are implemented in | |||
the accompanying resnet_v1.py and resnet_v2.py modules. | |||
Compared to https://github.com/KaimingHe/deep-residual-networks, in the current | |||
implementation we subsample the output activations in the last residual unit of | |||
each block, instead of subsampling the input activations in the first residual | |||
unit of each block. The two implementations give identical results but our | |||
implementation is more memory efficient. | |||
""" | |||
import collections | |||
import tensorflow as tf | |||
import tf_slim as slim | |||
if tf.__version__ >= '2.0': | |||
tf = tf.compat.v1 | |||
class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])): | |||
"""A named tuple describing a ResNet block. | |||
Its parts are: | |||
scope: The scope of the `Block`. | |||
unit_fn: The ResNet unit function which takes as input a `Tensor` and | |||
returns another `Tensor` with the output of the ResNet unit. | |||
args: A list of length equal to the number of units in the `Block`. The list | |||
contains one (depth, depth_bottleneck, stride) tuple for each unit in the | |||
block to serve as argument to unit_fn. | |||
""" | |||
def subsample(inputs, factor, scope=None): | |||
"""Subsamples the input along the spatial dimensions. | |||
Args: | |||
inputs: A `Tensor` of size [batch, height_in, width_in, channels]. | |||
factor: The subsampling factor. | |||
scope: Optional variable_scope. | |||
Returns: | |||
output: A `Tensor` of size [batch, height_out, width_out, channels] with the | |||
input, either intact (if factor == 1) or subsampled (if factor > 1). | |||
""" | |||
if factor == 1: | |||
return inputs | |||
else: | |||
return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope) | |||
def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None): | |||
"""Strided 2-D convolution with 'SAME' padding. | |||
When stride > 1, then we do explicit zero-padding, followed by conv2d with | |||
'VALID' padding. | |||
Note that | |||
net = conv2d_same(inputs, num_outputs, 3, stride=stride) | |||
is equivalent to | |||
net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME') | |||
net = subsample(net, factor=stride) | |||
whereas | |||
net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME') | |||
is different when the input's height or width is even, which is why we add the | |||
current function. For more details, see ResnetUtilsTest.testConv2DSameEven(). | |||
Args: | |||
inputs: A 4-D tensor of size [batch, height_in, width_in, channels]. | |||
num_outputs: An integer, the number of output filters. | |||
kernel_size: An int with the kernel_size of the filters. | |||
stride: An integer, the output stride. | |||
rate: An integer, rate for atrous convolution. | |||
scope: Scope. | |||
Returns: | |||
output: A 4-D tensor of size [batch, height_out, width_out, channels] with | |||
the convolution output. | |||
""" | |||
if stride == 1: | |||
return slim.conv2d( | |||
inputs, | |||
num_outputs, | |||
kernel_size, | |||
stride=1, | |||
rate=rate, | |||
padding='SAME', | |||
scope=scope) | |||
else: | |||
kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) | |||
pad_total = kernel_size_effective - 1 | |||
pad_beg = pad_total // 2 | |||
pad_end = pad_total - pad_beg | |||
inputs = tf.pad( | |||
inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) | |||
return slim.conv2d( | |||
inputs, | |||
num_outputs, | |||
kernel_size, | |||
stride=stride, | |||
rate=rate, | |||
padding='VALID', | |||
scope=scope) | |||
@slim.add_arg_scope | |||
def stack_blocks_dense(net, | |||
blocks, | |||
output_stride=None, | |||
outputs_collections=None): | |||
"""Stacks ResNet `Blocks` and controls output feature density. | |||
First, this function creates scopes for the ResNet in the form of | |||
'block_name/unit_1', 'block_name/unit_2', etc. | |||
Second, this function allows the user to explicitly control the ResNet | |||
output_stride, which is the ratio of the input to output spatial resolution. | |||
This is useful for dense prediction tasks such as semantic segmentation or | |||
object detection. | |||
Most ResNets consist of 4 ResNet blocks and subsample the activations by a | |||
factor of 2 when transitioning between consecutive ResNet blocks. This results | |||
to a nominal ResNet output_stride equal to 8. If we set the output_stride to | |||
half the nominal network stride (e.g., output_stride=4), then we compute | |||
responses twice. | |||
Control of the output feature density is implemented by atrous convolution. | |||
Args: | |||
net: A `Tensor` of size [batch, height, width, channels]. | |||
blocks: A list of length equal to the number of ResNet `Blocks`. Each | |||
element is a ResNet `Block` object describing the units in the `Block`. | |||
output_stride: If `None`, then the output will be computed at the nominal | |||
network stride. If output_stride is not `None`, it specifies the requested | |||
ratio of input to output spatial resolution, which needs to be equal to | |||
the product of unit strides from the start up to some level of the ResNet. | |||
For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1, | |||
then valid values for the output_stride are 1, 2, 6, 24 or None (which | |||
is equivalent to output_stride=24). | |||
outputs_collections: Collection to add the ResNet block outputs. | |||
Returns: | |||
net: Output tensor with stride equal to the specified output_stride. | |||
Raises: | |||
ValueError: If the target output_stride is not valid. | |||
""" | |||
# The current_stride variable keeps track of the effective stride of the | |||
# activations. This allows us to invoke atrous convolution whenever applying | |||
# the next residual unit would result in the activations having stride larger | |||
# than the target output_stride. | |||
current_stride = 1 | |||
# The atrous convolution rate parameter. | |||
rate = 1 | |||
for block in blocks: | |||
with tf.variable_scope(block.scope, 'block', [net]): | |||
for i, unit in enumerate(block.args): | |||
if output_stride is not None and current_stride > output_stride: | |||
raise ValueError( | |||
'The target output_stride cannot be reached.') | |||
with tf.variable_scope( | |||
'unit_%d' % (i + 1), values=[net]) as sc: | |||
unit_depth, unit_depth_bottleneck, unit_stride = unit | |||
# If we have reached the target output_stride, then we need to employ | |||
# atrous convolution with stride=1 and multiply the atrous rate by the | |||
# current unit's stride for use in subsequent layers. | |||
if output_stride is not None and current_stride == output_stride: | |||
net = block.unit_fn( | |||
net, | |||
depth=unit_depth, | |||
depth_bottleneck=unit_depth_bottleneck, | |||
stride=1, | |||
rate=rate) | |||
rate *= unit_stride | |||
else: | |||
net = block.unit_fn( | |||
net, | |||
depth=unit_depth, | |||
depth_bottleneck=unit_depth_bottleneck, | |||
stride=unit_stride, | |||
rate=1) | |||
current_stride *= unit_stride | |||
net = slim.utils.collect_named_outputs( | |||
outputs_collections, sc.name, net) | |||
if output_stride is not None and current_stride != output_stride: | |||
raise ValueError('The target output_stride cannot be reached.') | |||
return net | |||
def resnet_arg_scope(weight_decay=0.0001, | |||
batch_norm_decay=0.997, | |||
batch_norm_epsilon=1e-5, | |||
batch_norm_scale=True): | |||
"""Defines the default ResNet arg scope. | |||
TODO(gpapan): The batch-normalization related default values above are | |||
appropriate for use in conjunction with the reference ResNet models | |||
released at https://github.com/KaimingHe/deep-residual-networks. When | |||
training ResNets from scratch, they might need to be tuned. | |||
Args: | |||
weight_decay: The weight decay to use for regularizing the model. | |||
batch_norm_decay: The moving average decay when estimating layer activation | |||
statistics in batch normalization. | |||
batch_norm_epsilon: Small constant to prevent division by zero when | |||
normalizing activations by their variance in batch normalization. | |||
batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the | |||
activations in the batch normalization layer. | |||
Returns: | |||
An `arg_scope` to use for the resnet models. | |||
""" | |||
batch_norm_params = { | |||
'decay': batch_norm_decay, | |||
'epsilon': batch_norm_epsilon, | |||
'scale': batch_norm_scale, | |||
'updates_collections': tf.GraphKeys.UPDATE_OPS, | |||
} | |||
with slim.arg_scope( | |||
[slim.conv2d], | |||
weights_regularizer=slim.l2_regularizer(weight_decay), | |||
weights_initializer=slim.variance_scaling_initializer(), | |||
activation_fn=tf.nn.relu, | |||
normalizer_fn=slim.batch_norm, | |||
normalizer_params=batch_norm_params): | |||
with slim.arg_scope([slim.batch_norm], **batch_norm_params): | |||
# The following implies padding='SAME' for pool1, which makes feature | |||
# alignment easier for dense prediction tasks. This is also used in | |||
# https://github.com/facebook/fb.resnet.torch. However the accompanying | |||
# code of 'Deep Residual Learning for Image Recognition' uses | |||
# padding='VALID' for pool1. You can switch to that choice by setting | |||
# slim.arg_scope([slim.max_pool2d], padding='VALID'). | |||
with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc: | |||
return arg_sc |
@@ -0,0 +1,108 @@ | |||
import cv2 | |||
import numpy as np | |||
def rboxes_to_polygons(rboxes): | |||
""" | |||
Convert rboxes to polygons | |||
ARGS | |||
`rboxes`: [n, 5] | |||
RETURN | |||
`polygons`: [n, 8] | |||
""" | |||
theta = rboxes[:, 4:5] | |||
cxcy = rboxes[:, :2] | |||
half_w = rboxes[:, 2:3] / 2. | |||
half_h = rboxes[:, 3:4] / 2. | |||
v1 = np.hstack([np.cos(theta) * half_w, np.sin(theta) * half_w]) | |||
v2 = np.hstack([-np.sin(theta) * half_h, np.cos(theta) * half_h]) | |||
p1 = cxcy - v1 - v2 | |||
p2 = cxcy + v1 - v2 | |||
p3 = cxcy + v1 + v2 | |||
p4 = cxcy - v1 + v2 | |||
polygons = np.hstack([p1, p2, p3, p4]) | |||
return polygons | |||
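# Sanity check (theta = 0): an axis-aligned 4x2 box centered at (5, 5)
# yields its four corners in order:
#   rboxes_to_polygons(np.array([[5., 5., 4., 2., 0.]]))
#   -> array([[3., 4., 7., 4., 7., 6., 3., 6.]])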
def cal_width(box): | |||
pd1 = point_dist(box[0], box[1], box[2], box[3]) | |||
pd2 = point_dist(box[4], box[5], box[6], box[7]) | |||
return (pd1 + pd2) / 2 | |||
def point_dist(x1, y1, x2, y2): | |||
return np.sqrt((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1)) | |||
def draw_polygons(img, polygons): | |||
for p in polygons.tolist(): | |||
p = [int(o) for o in p] | |||
cv2.line(img, (p[0], p[1]), (p[2], p[3]), (0, 255, 0), 1) | |||
cv2.line(img, (p[2], p[3]), (p[4], p[5]), (0, 255, 0), 1) | |||
cv2.line(img, (p[4], p[5]), (p[6], p[7]), (0, 255, 0), 1) | |||
cv2.line(img, (p[6], p[7]), (p[0], p[1]), (0, 255, 0), 1) | |||
return img | |||
def nms_python(boxes): | |||
boxes = sorted(boxes, key=lambda x: -x[8]) | |||
nms_flag = [True] * len(boxes) | |||
for i, a in enumerate(boxes): | |||
if not nms_flag[i]: | |||
continue | |||
else: | |||
for j, b in enumerate(boxes): | |||
                if j <= i:
                    continue
if not nms_flag[j]: | |||
continue | |||
score_a = a[8] | |||
score_b = b[8] | |||
rbox_a = polygon2rbox(a[:8]) | |||
rbox_b = polygon2rbox(b[:8]) | |||
if point_in_rbox(rbox_a[:2], rbox_b) or point_in_rbox( | |||
rbox_b[:2], rbox_a): | |||
if score_a > score_b: | |||
nms_flag[j] = False | |||
boxes_nms = [] | |||
for i, box in enumerate(boxes): | |||
if nms_flag[i]: | |||
boxes_nms.append(box) | |||
return boxes_nms | |||
def point_in_rbox(c, rbox): | |||
cx0, cy0 = c[0], c[1] | |||
cx1, cy1 = rbox[0], rbox[1] | |||
w, h = rbox[2], rbox[3] | |||
theta = rbox[4] | |||
dist_x = np.abs((cx1 - cx0) * np.cos(theta) + (cy1 - cy0) * np.sin(theta)) | |||
dist_y = np.abs(-(cx1 - cx0) * np.sin(theta) + (cy1 - cy0) * np.cos(theta)) | |||
return ((dist_x < w / 2.0) and (dist_y < h / 2.0)) | |||
def polygon2rbox(polygon): | |||
x1, x2, x3, x4 = polygon[0], polygon[2], polygon[4], polygon[6] | |||
y1, y2, y3, y4 = polygon[1], polygon[3], polygon[5], polygon[7] | |||
c_x = (x1 + x2 + x3 + x4) / 4 | |||
c_y = (y1 + y2 + y3 + y4) / 4 | |||
w1 = point_dist(x1, y1, x2, y2) | |||
w2 = point_dist(x3, y3, x4, y4) | |||
h1 = point_line_dist(c_x, c_y, x1, y1, x2, y2) | |||
h2 = point_line_dist(c_x, c_y, x3, y3, x4, y4) | |||
h = h1 + h2 | |||
w = (w1 + w2) / 2 | |||
theta1 = np.arctan2(y2 - y1, x2 - x1) | |||
theta2 = np.arctan2(y3 - y4, x3 - x4) | |||
theta = (theta1 + theta2) / 2.0 | |||
return [c_x, c_y, w, h, theta] | |||
def point_line_dist(px, py, x1, y1, x2, y2): | |||
eps = 1e-6 | |||
dx = x2 - x1 | |||
dy = y2 - y1 | |||
div = np.sqrt(dx * dx + dy * dy) + eps | |||
dist = np.abs(px * dy - py * dx + x2 * y1 - y2 * x1) / div | |||
return dist |
@@ -1 +1 @@ | |||
from .image_captioning import ImageCaptionPipeline | |||
from .image_captioning_pipeline import ImageCaptionPipeline |
@@ -0,0 +1,33 @@ | |||
from typing import Any, Dict, Optional, Union
from modelscope.preprocessors import OfaImageCaptionPreprocessor, Preprocessor | |||
from modelscope.utils.constant import Tasks | |||
from modelscope.utils.logger import get_logger | |||
from ..base import Model, Pipeline | |||
from ..builder import PIPELINES | |||
logger = get_logger() | |||
@PIPELINES.register_module(Tasks.image_captioning, module_name='ofa') | |||
class ImageCaptionPipeline(Pipeline): | |||
def __init__(self, | |||
model: Union[Model, str], | |||
                 preprocessor: Optional[Preprocessor] = None,
**kwargs): | |||
        assert isinstance(model, (str, Model)), \
            'model must be a str model id or a Model instance'
if isinstance(model, str): | |||
pipe_model = Model.from_pretrained(model) | |||
elif isinstance(model, Model): | |||
pipe_model = model | |||
else: | |||
raise NotImplementedError | |||
        if preprocessor is None and pipe_model:
            preprocessor = OfaImageCaptionPreprocessor(
                model_dir=pipe_model.model_dir)
super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) | |||
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||
return inputs |
@@ -2,3 +2,4 @@ from .fill_mask_pipeline import * # noqa F403 | |||
from .sentence_similarity_pipeline import * # noqa F403 | |||
from .sequence_classification_pipeline import * # noqa F403 | |||
from .text_generation_pipeline import * # noqa F403 | |||
from .word_segmentation_pipeline import * # noqa F403 |
@@ -1,8 +1,5 @@ | |||
import os | |||
import uuid | |||
from typing import Any, Dict, Union | |||
import json | |||
import numpy as np | |||
from modelscope.models.nlp import SbertForSentenceSimilarity | |||
@@ -1,8 +1,5 @@ | |||
import os | |||
import uuid | |||
from typing import Any, Dict, Union | |||
import json | |||
import numpy as np | |||
from modelscope.models.nlp import BertForSequenceClassification | |||
@@ -1,7 +1,7 @@ | |||
from typing import Dict, Optional, Union | |||
from modelscope.models import Model | |||
from modelscope.models.nlp import PalmForTextGenerationModel | |||
from modelscope.models.nlp import PalmForTextGeneration | |||
from modelscope.preprocessors import TextGenerationPreprocessor | |||
from modelscope.utils.constant import Tasks | |||
from ..base import Pipeline, Tensor | |||
@@ -10,11 +10,11 @@ from ..builder import PIPELINES | |||
__all__ = ['TextGenerationPipeline'] | |||
@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm') | |||
@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm2.0') | |||
class TextGenerationPipeline(Pipeline): | |||
def __init__(self, | |||
model: Union[PalmForTextGenerationModel, str], | |||
model: Union[PalmForTextGeneration, str], | |||
preprocessor: Optional[TextGenerationPreprocessor] = None, | |||
**kwargs): | |||
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction | |||
@@ -23,16 +23,16 @@ class TextGenerationPipeline(Pipeline): | |||
            model (PalmForTextGeneration): a model instance
            preprocessor (TextGenerationPreprocessor): a preprocessor instance
""" | |||
sc_model = model if isinstance( | |||
model, | |||
PalmForTextGenerationModel) else Model.from_pretrained(model) | |||
model = model if isinstance( | |||
model, PalmForTextGeneration) else Model.from_pretrained(model) | |||
if preprocessor is None: | |||
preprocessor = TextGenerationPreprocessor( | |||
sc_model.model_dir, | |||
model.model_dir, | |||
model.tokenizer, | |||
first_sequence='sentence', | |||
second_sequence=None) | |||
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs) | |||
self.tokenizer = preprocessor.tokenizer | |||
super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
self.tokenizer = model.tokenizer | |||
def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]: | |||
"""process the prediction results | |||
@@ -43,17 +43,20 @@ class TextGenerationPipeline(Pipeline): | |||
Returns: | |||
Dict[str, str]: the prediction results | |||
""" | |||
replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), | |||
('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), | |||
('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) | |||
replace_tokens_roberta = ((r' +', ' '), ('<mask>', '<q>'), ('<pad>', | |||
''), | |||
('<s>', ''), ('</s>', ''), ('<unk>', ' ')) | |||
vocab_size = len(self.tokenizer.vocab) | |||
pred_list = inputs['predictions'] | |||
pred_ids = pred_list[0][0].cpu().numpy().tolist() | |||
        for j in range(len(pred_ids)):
            if pred_ids[j] >= vocab_size:
                pred_ids[j] = 100  # map out-of-vocabulary ids to [UNK]
pred = self.tokenizer.convert_ids_to_tokens(pred_ids) | |||
pred_string = ''.join(pred).replace( | |||
'##', | |||
'').split('[SEP]')[0].replace('[CLS]', | |||
'').replace('[SEP]', | |||
'').replace('[UNK]', '') | |||
pred_string = self.tokenizer.decode(pred_ids) | |||
        for _old, _new in replace_tokens_bert:
            pred_string = pred_string.replace(_old, _new)
        pred_string = pred_string.strip()
        for _old, _new in replace_tokens_roberta:
            pred_string = pred_string.replace(_old, _new)
        pred_string = pred_string.strip()
return {'text': pred_string} |
@@ -0,0 +1,69 @@ | |||
from typing import Any, Dict, Optional, Union | |||
from modelscope.models import Model | |||
from modelscope.models.nlp import StructBertForTokenClassification | |||
from modelscope.preprocessors import TokenClassifcationPreprocessor | |||
from modelscope.utils.constant import Tasks | |||
from ..base import Pipeline, Tensor | |||
from ..builder import PIPELINES | |||
__all__ = ['WordSegmentationPipeline'] | |||
@PIPELINES.register_module( | |||
Tasks.word_segmentation, | |||
module_name=r'structbert-chinese-word-segmentation') | |||
class WordSegmentationPipeline(Pipeline): | |||
def __init__(self, | |||
model: Union[StructBertForTokenClassification, str], | |||
preprocessor: Optional[TokenClassifcationPreprocessor] = None, | |||
**kwargs): | |||
"""use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction | |||
Args: | |||
model (StructBertForTokenClassification): a model instance | |||
preprocessor (TokenClassifcationPreprocessor): a preprocessor instance | |||
""" | |||
model = model if isinstance( | |||
model, | |||
StructBertForTokenClassification) else Model.from_pretrained(model) | |||
if preprocessor is None: | |||
preprocessor = TokenClassifcationPreprocessor(model.model_dir) | |||
super().__init__(model=model, preprocessor=preprocessor, **kwargs) | |||
self.tokenizer = preprocessor.tokenizer | |||
self.config = model.config | |||
self.id2label = self.config.id2label | |||
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: | |||
"""process the prediction results | |||
Args: | |||
inputs (Dict[str, Any]): the model outputs, including 'predictions' (label ids) and 'text' (the input text)
Returns: | |||
Dict[str, str]: the prediction results | |||
""" | |||
pred_list = inputs['predictions'] | |||
labels = [] | |||
for pre in pred_list: | |||
labels.append(self.id2label[pre]) | |||
# drop the predictions for the [CLS]/[SEP] special tokens
labels = labels[1:-1]
chunks = [] | |||
chunk = '' | |||
assert len(inputs['text']) == len(labels) | |||
for token, label in zip(inputs['text'], labels): | |||
# B (begin) and I (inside) labels extend the current word; any other
# label (e.g. E/S) closes the word after absorbing the current token
if label[0] == 'B' or label[0] == 'I':
chunk += token | |||
else: | |||
chunk += token | |||
chunks.append(chunk) | |||
chunk = '' | |||
if chunk: | |||
chunks.append(chunk) | |||
seg_result = ' '.join(chunks) | |||
rst = { | |||
'output': seg_result, | |||
} | |||
return rst |
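For illustration, a standalone sketch of the chunk-merging loop above under a BIES-style tagging scheme (the text and labels are made up):

```python
# Made-up example: each character of '今天天气不错' gets a B/E label.
text = '今天天气不错'
labels = ['B', 'E', 'B', 'E', 'B', 'E']

chunks, chunk = [], ''
for token, label in zip(text, labels):
    chunk += token
    if label[0] not in ('B', 'I'):  # E/S (or O) labels close the current word
        chunks.append(chunk)
        chunk = ''
if chunk:
    chunks.append(chunk)
print(' '.join(chunks))  # -> 今天 天气 不错
```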
@@ -54,6 +54,13 @@ TASK_OUTPUTS = { | |||
# } | |||
Tasks.pose_estimation: ['poses', 'boxes'], | |||
# ocr detection result for single sample | |||
# { | |||
# "det_polygons": np.array with shape [num_text, 8], each box is | |||
# [x1, y1, x2, y2, x3, y3, x4, y4] | |||
# } | |||
Tasks.ocr_detection: ['det_polygons'], | |||
# ============ nlp tasks =================== | |||
# text classification result for single sample | |||
@@ -75,8 +82,27 @@ TASK_OUTPUTS = { | |||
# } | |||
Tasks.fill_mask: ['text'], | |||
# word segmentation result for single sample | |||
# { | |||
# "output": "今天 天气 不错 , 适合 出去 游玩" | |||
# } | |||
Tasks.word_segmentation: ['output'], | |||
# sentence similarity result for single sample | |||
# { | |||
# "labels": "1", | |||
# "scores": 0.9 | |||
# } | |||
Tasks.sentence_similarity: ['scores', 'labels'], | |||
# ============ audio tasks =================== | |||
# audio processed for single file in PCM format | |||
# { | |||
# "output_pcm": np.array with shape(samples,) and dtype float32 | |||
# } | |||
Tasks.speech_signal_process: ['output_pcm'], | |||
# ============ multi-modal tasks =================== | |||
# image caption result for single sample | |||
@@ -1,7 +1,10 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
from .audio import LinearAECAndFbank | |||
from .base import Preprocessor | |||
from .builder import PREPROCESSORS, build_preprocessor | |||
from .common import Compose | |||
from .image import LoadImage, load_image | |||
from .multi_modal import OfaImageCaptionPreprocessor
from .nlp import * # noqa F403 | |||
from .text_to_speech import * # noqa F403 |
@@ -0,0 +1,231 @@ | |||
import ctypes | |||
import os | |||
from typing import Any, Dict | |||
import numpy as np | |||
import scipy.io.wavfile as wav | |||
import torch | |||
from numpy.ctypeslib import ndpointer | |||
from modelscope.utils.constant import Fields | |||
from .builder import PREPROCESSORS | |||
def load_wav(path): | |||
samp_rate, data = wav.read(path) | |||
return np.float32(data), samp_rate | |||
def load_library(libaec): | |||
libaec_in_cwd = os.path.join('.', libaec) | |||
if os.path.exists(libaec_in_cwd): | |||
libaec = libaec_in_cwd | |||
mitaec = ctypes.cdll.LoadLibrary(libaec) | |||
fe_process = mitaec.fe_process_inst | |||
fe_process.argtypes = [ | |||
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), | |||
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), ctypes.c_int, | |||
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), | |||
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS'), | |||
ndpointer(ctypes.c_float, flags='C_CONTIGUOUS') | |||
] | |||
return fe_process | |||
def do_linear_aec(fe_process, mic, ref, int16range=True): | |||
mic = np.float32(mic) | |||
ref = np.float32(ref) | |||
if len(mic) > len(ref): | |||
mic = mic[:len(ref)] | |||
out_mic = np.zeros_like(mic) | |||
out_linear = np.zeros_like(mic) | |||
out_echo = np.zeros_like(mic) | |||
out_ref = np.zeros_like(mic) | |||
if int16range: | |||
mic /= 32768 | |||
ref /= 32768 | |||
fe_process(mic, ref, len(mic), out_mic, out_linear, out_echo) | |||
# out_ref not in use here | |||
if int16range: | |||
out_mic *= 32768 | |||
out_linear *= 32768 | |||
out_echo *= 32768 | |||
return out_mic, out_ref, out_linear, out_echo | |||
def load_kaldi_feature_transform(filename): | |||
fp = open(filename, 'r') | |||
all_str = fp.read() | |||
pos1 = all_str.find('AddShift') | |||
pos2 = all_str.find('[', pos1) | |||
pos3 = all_str.find(']', pos2) | |||
mean = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ') | |||
pos1 = all_str.find('Rescale') | |||
pos2 = all_str.find('[', pos1) | |||
pos3 = all_str.find(']', pos2) | |||
scale = np.fromstring(all_str[pos2 + 1:pos3], dtype=np.float32, sep=' ') | |||
fp.close() | |||
return mean, scale | |||
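For context, a sketch of the kaldi-nnet-style transform text this parser expects: an AddShift vector followed by a Rescale vector (the numbers below are made up):

```python
# A made-up transform file: only the bracketed vectors after 'AddShift'
# and 'Rescale' are read back as the mean shift and scale.
fake = """<AddShift> 3 3
[ -1.5 -2.0 -1.8 ]
<Rescale> 3 3
[ 0.5 0.4 0.6 ]
"""
with open('/tmp/fake_transform.txt', 'w') as f:
    f.write(fake)
mean, scale = load_kaldi_feature_transform('/tmp/fake_transform.txt')
print(mean, scale)  # [-1.5 -2.  -1.8] [0.5 0.4 0.6]
```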
class Feature: | |||
r"""Extract feat from one utterance. | |||
""" | |||
def __init__(self, | |||
fbank_config, | |||
feat_type='spec', | |||
mvn_file=None, | |||
cuda=False): | |||
r""" | |||
Args: | |||
fbank_config (dict): | |||
feat_type (str): | |||
raw: do nothing | |||
fbank: use kaldi.fbank | |||
spec: Real/Imag | |||
logpow: log(1+|x|^2) | |||
mvn_file (str): the path of data file for mean variance normalization | |||
cuda (bool): if True, move the window and mvn tensors to GPU
""" | |||
self.fbank_config = fbank_config | |||
self.feat_type = feat_type | |||
self.n_fft = fbank_config['frame_length'] * fbank_config[ | |||
'sample_frequency'] // 1000 | |||
self.hop_length = fbank_config['frame_shift'] * fbank_config[ | |||
'sample_frequency'] // 1000 | |||
self.window = torch.hamming_window(self.n_fft, periodic=False) | |||
self.mvn = False | |||
if mvn_file is not None and os.path.exists(mvn_file): | |||
print(f'loading mvn file: {mvn_file}') | |||
shift, scale = load_kaldi_feature_transform(mvn_file) | |||
self.shift = torch.from_numpy(shift) | |||
self.scale = torch.from_numpy(scale) | |||
self.mvn = True | |||
if cuda: | |||
self.window = self.window.cuda() | |||
if self.mvn: | |||
self.shift = self.shift.cuda() | |||
self.scale = self.scale.cuda() | |||
def compute(self, utt): | |||
r""" | |||
Args: | |||
utt: in [-32768, 32767] range | |||
Returns: | |||
[..., T, F] | |||
""" | |||
if self.feat_type == 'raw': | |||
return utt | |||
elif self.feat_type == 'fbank': | |||
# have to use local import until the modelscope framework supports lazy loading
import torchaudio.compliance.kaldi as kaldi | |||
if len(utt.shape) == 1: | |||
utt = utt.unsqueeze(0) | |||
feat = kaldi.fbank(utt, **self.fbank_config) | |||
elif self.feat_type == 'spec': | |||
spec = torch.stft( | |||
utt / 32768, | |||
self.n_fft, | |||
self.hop_length, | |||
self.n_fft, | |||
self.window, | |||
center=False, | |||
return_complex=True) | |||
feat = torch.cat([spec.real, spec.imag], dim=-2).permute(-1, -2) | |||
elif self.feat_type == 'logpow': | |||
spec = torch.stft( | |||
utt, | |||
self.n_fft, | |||
self.hop_length, | |||
self.n_fft, | |||
self.window, | |||
center=False, | |||
return_complex=True) | |||
abspow = torch.abs(spec)**2 | |||
feat = torch.log(1 + abspow).permute(-1, -2) | |||
return feat | |||
def normalize(self, feat): | |||
if self.mvn: | |||
feat = feat + self.shift | |||
feat = feat * self.scale | |||
return feat | |||
@PREPROCESSORS.register_module(Fields.audio) | |||
class LinearAECAndFbank: | |||
SAMPLE_RATE = 16000 | |||
def __init__(self, io_config): | |||
self.trunc_length = 7200 * self.SAMPLE_RATE | |||
self.linear_aec_delay = io_config['linear_aec_delay'] | |||
self.feature = Feature(io_config['fbank_config'], | |||
io_config['feat_type'], io_config['mvn']) | |||
self.mitaec = load_library(io_config['mitaec_library']) | |||
self.mask_on_mic = io_config['mask_on'] == 'nearend_mic' | |||
def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: | |||
""" linear filtering the near end mic and far end audio, then extract the feature | |||
:param data: dict with two keys and correspond audios: "nearend_mic" and "farend_speech" | |||
:return: dict with two keys and Tensor values: "base" linear filtered audio,and "feature" | |||
""" | |||
# read files | |||
nearend_mic, fs = load_wav(data['nearend_mic']) | |||
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}' | |||
farend_speech, fs = load_wav(data['farend_speech']) | |||
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}' | |||
if 'nearend_speech' in data: | |||
nearend_speech, fs = load_wav(data['nearend_speech']) | |||
assert fs == self.SAMPLE_RATE, f'The sample rate should be {self.SAMPLE_RATE}' | |||
else: | |||
nearend_speech = np.zeros_like(nearend_mic) | |||
out_mic, out_ref, out_linear, out_echo = do_linear_aec( | |||
self.mitaec, nearend_mic, farend_speech) | |||
# fix 20ms linear aec delay by delaying the target speech | |||
extra_zeros = np.zeros([int(self.linear_aec_delay * fs)]) | |||
nearend_speech = np.concatenate([extra_zeros, nearend_speech]) | |||
# truncate files to the same length | |||
flen = min( | |||
len(out_mic), len(out_ref), len(out_linear), len(out_echo), | |||
len(nearend_speech)) | |||
fstart = 0 | |||
flen = min(flen, self.trunc_length) | |||
nearend_mic, out_ref, out_linear, out_echo, nearend_speech = ( | |||
out_mic[fstart:flen], out_ref[fstart:flen], | |||
out_linear[fstart:flen], out_echo[fstart:flen], | |||
nearend_speech[fstart:flen]) | |||
# extract features (frames, [mic, linear, ref, aes?]) | |||
feat = torch.FloatTensor() | |||
nearend_mic = torch.from_numpy(np.float32(nearend_mic)) | |||
fbank_nearend_mic = self.feature.compute(nearend_mic) | |||
feat = torch.cat([feat, fbank_nearend_mic], dim=1) | |||
out_linear = torch.from_numpy(np.float32(out_linear)) | |||
fbank_out_linear = self.feature.compute(out_linear) | |||
feat = torch.cat([feat, fbank_out_linear], dim=1) | |||
out_echo = torch.from_numpy(np.float32(out_echo)) | |||
fbank_out_echo = self.feature.compute(out_echo) | |||
feat = torch.cat([feat, fbank_out_echo], dim=1) | |||
# feature transform | |||
feat = self.feature.normalize(feat) | |||
# prepare target | |||
if nearend_speech is not None: | |||
nearend_speech = torch.from_numpy(np.float32(nearend_speech)) | |||
if self.mask_on_mic: | |||
base = nearend_mic | |||
else: | |||
base = out_linear | |||
out_data = {'base': base, 'target': nearend_speech, 'feature': feat} | |||
return out_data |
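A usage sketch for the preprocessor above. The `io_config` keys come from the code, but every value, path, and file name below is a placeholder:

```python
io_config = {
    'linear_aec_delay': 0.02,        # seconds of delay introduced by the linear AEC
    'fbank_config': {
        'frame_length': 25,          # ms
        'frame_shift': 10,           # ms
        'sample_frequency': 16000,
        'num_mel_bins': 80,
    },
    'feat_type': 'fbank',
    'mvn': 'path/to/feature_transform.txt',   # optional kaldi-style mvn file
    'mitaec_library': 'libmitaec.so',         # the linear AEC shared library
    'mask_on': 'nearend_mic',
}
preprocessor = LinearAECAndFbank(io_config)
out = preprocessor({
    'nearend_mic': 'nearend_mic.wav',
    'farend_speech': 'farend_speech.wav',
})
print(out['feature'].shape, out['base'].shape)
```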
@@ -1,32 +1,50 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os.path as osp | |||
from typing import Any, Dict, Union | |||
import numpy as np | |||
import torch | |||
from maas_hub.snapshot_download import snapshot_download | |||
from PIL import Image | |||
from modelscope.utils.logger import get_logger | |||
from modelscope.utils.constant import Fields, ModelFile | |||
from modelscope.utils.hub import get_model_cache_dir | |||
from modelscope.utils.type_assert import type_assert | |||
from .base import Preprocessor | |||
from .builder import PREPROCESSORS | |||
from .image import load_image | |||
logger = get_logger() | |||
__all__ = [ | |||
'OfaImageCaptionPreprocessor', | |||
] | |||
@PREPROCESSORS.register_module( | |||
Fields.multi_modal, module_name=r'ofa-image-caption') | |||
class OfaImageCaptionPreprocessor(Preprocessor): | |||
def __init__(self, model_dir: str, *args, **kwargs): | |||
"""preprocess the data via the vocab.txt from the `model_dir` path | |||
Args: | |||
model_dir (str): model path | |||
""" | |||
super().__init__(*args, **kwargs) | |||
if osp.exists(model_dir): | |||
local_model_dir = model_dir | |||
else: | |||
cache_path = get_model_cache_dir(model_dir) | |||
local_model_dir = cache_path if osp.exists( | |||
cache_path) else snapshot_download(model_dir) | |||
local_model = osp.join(local_model_dir, ModelFile.TORCH_MODEL_FILE) | |||
bpe_dir = local_model_dir | |||
from fairseq import checkpoint_utils, tasks, utils | |||
from ofa.tasks.mm_tasks import CaptionTask | |||
tasks.register_task('caption', CaptionTask) | |||
use_cuda = False | |||
# use fp16 only when GPU is available | |||
use_fp16 = False | |||
overrides = { | |||
'bpe_dir': bpe_dir, | |||
'eval_cider': False, | |||
@@ -35,21 +53,9 @@ class ImageCaptionPipeline(Pipeline): | |||
'no_repeat_ngram_size': 3, | |||
'seed': 7 | |||
} | |||
model, cfg, task = checkpoint_utils.load_model_ensemble_and_task( | |||
utils.split_paths(local_model), arg_overrides=overrides) | |||
del model | |||
# Initialize transform | |||
from torchvision import transforms | |||
mean = [0.5, 0.5, 0.5] | |||
@@ -69,7 +75,8 @@ class ImageCaptionPipeline(Pipeline): | |||
self.eos_item = torch.LongTensor([task.src_dict.eos()]) | |||
self.pad_idx = task.src_dict.pad() | |||
@type_assert(object, (str, tuple)) | |||
def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]: | |||
def encode_text(text, length=None, append_bos=False, append_eos=False): | |||
s = self.task.tgt_dict.encode_line( | |||
@@ -88,7 +95,7 @@ class ImageCaptionPipeline(Pipeline): | |||
patch_image = self.patch_resize_transform(data).unsqueeze(0)
else: | |||
patch_image = self.patch_resize_transform( | |||
load_image(input)).unsqueeze(0) | |||
load_image(data)).unsqueeze(0) | |||
patch_mask = torch.tensor([True]) | |||
text = 'what does the image describe?' | |||
src_text = encode_text( | |||
@@ -105,17 +112,3 @@ class ImageCaptionPipeline(Pipeline): | |||
} | |||
} | |||
return sample | |||
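A usage sketch for the refactored preprocessor; the model directory is a placeholder for a local OFA caption checkpoint:

```python
# 'path/to/ofa_image_caption' is a placeholder checkpoint directory.
preprocessor = OfaImageCaptionPreprocessor('path/to/ofa_image_caption')
sample = preprocessor('data/test/images/image1.jpg')
# `sample` carries the resized image tensor plus the encoded prompt
# "what does the image describe?", ready for the OFA generator.
```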
@@ -12,7 +12,8 @@ from .builder import PREPROCESSORS | |||
__all__ = [ | |||
'Tokenize', 'SequenceClassificationPreprocessor', | |||
'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor', | |||
'FillMaskPreprocessor' | |||
] | |||
@@ -53,12 +54,12 @@ class SequenceClassificationPreprocessor(Preprocessor): | |||
self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) | |||
print(f'this is the tokenizer {self.tokenizer}')
@type_assert(object, (str, tuple, Dict)) | |||
def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: | |||
"""process the raw input data | |||
Args: | |||
data (str or tuple): | |||
data (str, tuple or Dict):
sentence1 (str): a sentence | |||
Example: | |||
'you are so handsome.' | |||
@@ -70,22 +71,31 @@ class SequenceClassificationPreprocessor(Preprocessor): | |||
sentence2 (str): a sentence | |||
Example: | |||
'you are so beautiful.' | |||
or | |||
{field1: field_value1, field2: field_value2} | |||
field1 (str): field name, default 'first_sequence' | |||
field_value1 (str): a sentence | |||
Example: | |||
'you are so handsome.' | |||
field2 (str): field name, default 'second_sequence' | |||
field_value2 (str): a sentence | |||
Example: | |||
'you are so beautiful.' | |||
Returns: | |||
Dict[str, Any]: the preprocessed data | |||
""" | |||
if isinstance(data, str): | |||
new_data = {self.first_sequence: data} | |||
elif isinstance(data, tuple): | |||
sentence1, sentence2 = data | |||
new_data = { | |||
self.first_sequence: sentence1, | |||
self.second_sequence: sentence2 | |||
} | |||
else: | |||
new_data = data | |||
# preprocess the data for the model input | |||
@@ -115,17 +125,15 @@ class SequenceClassificationPreprocessor(Preprocessor): | |||
return rst | |||
@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm2.0') | |||
class TextGenerationPreprocessor(Preprocessor): | |||
def __init__(self, model_dir: str, tokenizer, *args, **kwargs): | |||
"""preprocess the data using the vocab.txt from the `model_dir` path | |||
Args: | |||
model_dir (str): model path
tokenizer: the tokenizer instance shared with the model
"""
super().__init__(*args, **kwargs) | |||
self.model_dir: str = model_dir | |||
@@ -134,7 +142,7 @@ class TextGenerationPreprocessor(Preprocessor): | |||
self.second_sequence: str = kwargs.pop('second_sequence', | |||
'second_sequence') | |||
self.sequence_length: int = kwargs.pop('sequence_length', 128) | |||
self.tokenizer = tokenizer | |||
@type_assert(object, str) | |||
def __call__(self, data: str) -> Dict[str, Any]: | |||
@@ -153,7 +161,7 @@ class TextGenerationPreprocessor(Preprocessor): | |||
new_data = {self.first_sequence: data} | |||
# preprocess the data for the model input | |||
rst = {'input_ids': [], 'attention_mask': []} | |||
max_seq_length = self.sequence_length | |||
@@ -225,3 +233,51 @@ class FillMaskPreprocessor(Preprocessor): | |||
rst['token_type_ids'].append(feature['token_type_ids']) | |||
return {k: torch.tensor(v) for k, v in rst.items()} | |||
@PREPROCESSORS.register_module( | |||
Fields.nlp, module_name=r'bert-token-classification') | |||
class TokenClassifcationPreprocessor(Preprocessor): | |||
def __init__(self, model_dir: str, *args, **kwargs): | |||
"""preprocess the data via the vocab.txt from the `model_dir` path | |||
Args: | |||
model_dir (str): model path | |||
""" | |||
super().__init__(*args, **kwargs) | |||
from sofa import SbertTokenizer | |||
self.model_dir: str = model_dir | |||
self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir) | |||
@type_assert(object, str) | |||
def __call__(self, data: str) -> Dict[str, Any]: | |||
"""process the raw input data | |||
Args: | |||
data (str): a sentence | |||
Example: | |||
'you are so handsome.' | |||
Returns: | |||
Dict[str, Any]: the preprocessed data | |||
""" | |||
# preprocess the data for the model input | |||
text = data.replace(' ', '').strip() | |||
tokens = [] | |||
for token in text: | |||
token = self.tokenizer.tokenize(token) | |||
tokens.extend(token) | |||
input_ids = self.tokenizer.convert_tokens_to_ids(tokens) | |||
input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids) | |||
attention_mask = [1] * len(input_ids) | |||
token_type_ids = [0] * len(input_ids) | |||
return { | |||
'text': text, | |||
'input_ids': input_ids, | |||
'attention_mask': attention_mask, | |||
'token_type_ids': token_type_ids | |||
} |
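A sketch of the token-classification preprocessing output, assuming a StructBERT checkpoint directory (the path is a placeholder):

```python
# 'path/to/structbert_model' is a placeholder directory with a vocab.txt.
preprocessor = TokenClassifcationPreprocessor('path/to/structbert_model')
out = preprocessor('今天 天气 不错')
print(out['text'])            # '今天天气不错' (whitespace stripped first)
print(len(out['input_ids']))  # len(text) + 2 for the [CLS]/[SEP] specials
```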
@@ -0,0 +1,51 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import io | |||
from typing import Any, Dict, Union | |||
from modelscope.fileio import File | |||
from modelscope.models.audio.tts.frontend import GenericTtsFrontend | |||
from modelscope.models.base import Model | |||
from modelscope.utils.audio.tts_exceptions import * # noqa F403 | |||
from modelscope.utils.constant import Fields | |||
from .base import Preprocessor | |||
from .builder import PREPROCESSORS | |||
__all__ = ['TextToTacotronSymbols', 'text_to_tacotron_symbols'] | |||
@PREPROCESSORS.register_module( | |||
Fields.audio, module_name=r'text_to_tacotron_symbols') | |||
class TextToTacotronSymbols(Preprocessor): | |||
"""extract tacotron symbols from text. | |||
Args: | |||
model_name (str): TTS frontend model name or resource path
lang_type (str): language type, valid values are "pinyin" and "chenmix" | |||
""" | |||
def __init__(self, model_name, lang_type='pinyin'): | |||
self._frontend_model = Model.from_pretrained( | |||
model_name, lang_type=lang_type) | |||
assert self._frontend_model is not None, 'load model from pretrained failed'
def __call__(self, data: str) -> Dict[str, Any]: | |||
"""Call functions to load text and get tacotron symbols. | |||
Args: | |||
data (str): utf-8 encoded text
Returns:
symbols (list[str]): texts in tacotron symbols format.
""" | |||
return self._frontend_model.forward(data) | |||
def text_to_tacotron_symbols(text='', path='./', lang='pinyin'): | |||
""" simple interface to transform text to tacotron symbols | |||
Args: | |||
text (str): input text | |||
path (str): resource path | |||
lang (str): language type from one of "pinyin" and "chenmix" | |||
""" | |||
transform = TextToTacotronSymbols(path, lang) | |||
return transform(text) |
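A usage sketch for the helper above; the resource path is a placeholder:

```python
# 'path/to/tts_frontend_resource' is a placeholder resource path.
symbols = text_to_tacotron_symbols(
    '今天天气不错', path='path/to/tts_frontend_resource', lang='pinyin')
print(symbols)  # the text rendered as tacotron symbols
```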
@@ -0,0 +1,22 @@ | |||
import os | |||
from pathlib import Path | |||
# Cache location | |||
DEFAULT_CACHE_HOME = '~/.cache' | |||
CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME) | |||
DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub') | |||
MS_CACHE_HOME = os.path.expanduser( | |||
os.getenv('MS_CACHE_HOME', DEFAULT_MS_CACHE_HOME)) | |||
DEFAULT_MS_DATASETS_CACHE = os.path.join(MS_CACHE_HOME, 'datasets') | |||
MS_DATASETS_CACHE = Path( | |||
os.getenv('MS_DATASETS_CACHE', DEFAULT_MS_DATASETS_CACHE)) | |||
DOWNLOADED_DATASETS_DIR = 'downloads' | |||
DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(MS_DATASETS_CACHE, | |||
DOWNLOADED_DATASETS_DIR) | |||
DOWNLOADED_DATASETS_PATH = Path( | |||
os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH)) | |||
MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT', | |||
'http://101.201.119.157:31752') |
@@ -1,64 +1,81 @@ | |||
import os | |||
from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional, | |||
Sequence, Union) | |||
import numpy as np | |||
from datasets import Dataset | |||
from datasets import load_dataset as hf_load_dataset | |||
from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE | |||
from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES | |||
from datasets.utils.file_utils import (is_relative_path, | |||
relative_to_absolute_path) | |||
from modelscope.pydatasets.config import MS_DATASETS_CACHE | |||
from modelscope.pydatasets.utils.ms_api import MsApi | |||
from modelscope.utils.constant import Hubs | |||
from modelscope.utils.logger import get_logger | |||
logger = get_logger() | |||
def format_list(para) -> List: | |||
if para is None: | |||
para = [] | |||
elif isinstance(para, str): | |||
para = [para] | |||
elif len(set(para)) < len(para): | |||
raise ValueError(f'List columns contains duplicates: {para}') | |||
return para | |||
class PyDataset: | |||
"""A PyDataset backed by hugging face Dataset."""
_hf_ds = None  # holds the underlying HuggingFace Dataset
def __init__(self, hf_ds: Dataset, target: Optional[str] = None): | |||
self._hf_ds = hf_ds | |||
self.target = target | |||
def __iter__(self): | |||
for item in self._hf_ds: | |||
if self.target is not None: | |||
yield item[self.target] | |||
else: | |||
yield item | |||
def __getitem__(self, key): | |||
return self._hf_ds[key] | |||
@classmethod | |||
def from_hf_dataset(cls, | |||
hf_ds: Dataset, | |||
target: str = None) -> Union[dict, 'PyDataset']: | |||
if isinstance(hf_ds, Dataset): | |||
return cls(hf_ds, target) | |||
if len(hf_ds.keys()) == 1: | |||
return cls(next(iter(hf_ds.values())), target) | |||
return {k: cls(v, target) for k, v in hf_ds.items()} | |||
@staticmethod | |||
def load( | |||
dataset_name: Union[str, list], | |||
target: Optional[str] = None, | |||
version: Optional[str] = None, | |||
hub: Optional[Hubs] = Hubs.modelscope, | |||
subset_name: Optional[str] = None, | |||
split: Optional[str] = None, | |||
data_dir: Optional[str] = None, | |||
data_files: Optional[Union[str, Sequence[str], | |||
Mapping[str, Union[str, | |||
Sequence[str]]]]] = None | |||
) -> Union[dict, 'PyDataset']: | |||
"""Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. | |||
Args: | |||
dataset_name (str): Path or name of the dataset. | |||
target (str, optional): Name of the column to output. | |||
version (str, optional): Version of the dataset script to load.
subset_name (str, optional): Defining the subset_name of the dataset.
data_dir (str, optional): Defining the data_dir of the dataset configuration.
data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s). | |||
split (str, optional): Which split of the data to load. | |||
@@ -67,53 +84,302 @@ class PyDataset: | |||
Returns: | |||
PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset. | |||
""" | |||
if hub == Hubs.huggingface: | |||
dataset = hf_load_dataset( | |||
dataset_name, | |||
name=subset_name, | |||
revision=version, | |||
split=split, | |||
data_dir=data_dir, | |||
data_files=data_files) | |||
return PyDataset.from_hf_dataset(dataset, target=target) | |||
else: | |||
return PyDataset._load_ms_dataset( | |||
dataset_name, | |||
target=target, | |||
subset_name=subset_name, | |||
version=version, | |||
split=split, | |||
data_dir=data_dir, | |||
data_files=data_files) | |||
@staticmethod | |||
def _load_ms_dataset( | |||
dataset_name: Union[str, list], | |||
target: Optional[str] = None, | |||
version: Optional[str] = None, | |||
subset_name: Optional[str] = None, | |||
split: Optional[str] = None, | |||
data_dir: Optional[str] = None, | |||
data_files: Optional[Union[str, Sequence[str], | |||
Mapping[str, Union[str, | |||
Sequence[str]]]]] = None | |||
) -> Union[dict, 'PyDataset']: | |||
if isinstance(dataset_name, str): | |||
use_hf = False | |||
if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ | |||
(os.path.isfile(dataset_name) and dataset_name.endswith('.py')): | |||
use_hf = True | |||
elif is_relative_path(dataset_name): | |||
ms_api = MsApi() | |||
dataset_scripts = ms_api.fetch_dataset_scripts( | |||
dataset_name, version) | |||
if 'py' in dataset_scripts: # dataset copied from hf datasets | |||
dataset_name = dataset_scripts['py'][0] | |||
use_hf = True | |||
else: | |||
raise FileNotFoundError( | |||
f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} " | |||
f'or any data file in the same directory.') | |||
if use_hf: | |||
dataset = hf_load_dataset( | |||
dataset_name, | |||
name=subset_name, | |||
revision=version, | |||
split=split, | |||
data_dir=data_dir, | |||
data_files=data_files, | |||
cache_dir=MS_DATASETS_CACHE) | |||
else: | |||
# TODO load from ms datahub | |||
raise NotImplementedError( | |||
f'Dataset {dataset_name} load from modelscope datahub to be implemented in ' | |||
f'the future') | |||
elif isinstance(dataset_name, list): | |||
if target is None: | |||
target = 'target' | |||
dataset = Dataset.from_dict({target: dataset_name}) | |||
else: | |||
raise TypeError('dataset_name must be a str or a list, but got'
f' {type(dataset_name)}')
return PyDataset.from_hf_dataset(dataset, target=target) | |||
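A usage sketch for the loader; 'squad' stands in for any dataset name resolvable on the hub, and the list form wraps raw values into a single-column dataset:

```python
# Hub (or hf) dataset by name -- 'squad' is a placeholder.
ds = PyDataset.load('squad', split='train')
for sample in ds:
    print(sample)
    break

# A plain list becomes a one-column dataset; `target` names the column.
ds2 = PyDataset.load(['data/test/images/image1.jpg'], target='image')
```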
def to_torch_dataset_with_processors( | |||
self, | |||
preprocessors: Union[Callable, List[Callable]], | |||
columns: Union[str, List[str]] = None, | |||
): | |||
preprocessor_list = preprocessors if isinstance( | |||
preprocessors, list) else [preprocessors] | |||
columns = format_list(columns) | |||
columns = [ | |||
key for key in self._hf_ds.features.keys() if key in columns | |||
] | |||
sample = next(iter(self._hf_ds)) | |||
sample_res = {k: np.array(sample[k]) for k in columns} | |||
for processor in preprocessor_list: | |||
sample_res.update( | |||
{k: np.array(v) | |||
for k, v in processor(sample).items()}) | |||
def is_numpy_number(value): | |||
return np.issubdtype(value.dtype, np.integer) or np.issubdtype( | |||
value.dtype, np.floating) | |||
retained_columns = [] | |||
for k in sample_res.keys(): | |||
if not is_numpy_number(sample_res[k]): | |||
logger.warning( | |||
f'Data of column {k} is non-numeric, will be removed') | |||
continue | |||
retained_columns.append(k) | |||
import torch | |||
class MsIterableDataset(torch.utils.data.IterableDataset): | |||
def __init__(self, dataset: Iterable): | |||
super().__init__()
self.dataset = dataset | |||
def __iter__(self): | |||
for item_dict in self.dataset: | |||
res = { | |||
k: np.array(item_dict[k]) | |||
for k in columns if k in retained_columns | |||
} | |||
for preprocessor in preprocessor_list: | |||
res.update({ | |||
k: np.array(v) | |||
for k, v in preprocessor(item_dict).items() | |||
if k in retained_columns | |||
}) | |||
yield res | |||
return MsIterableDataset(self._hf_ds) | |||
def to_torch_dataset( | |||
self, | |||
columns: Union[str, List[str]] = None, | |||
preprocessors: Union[Callable, List[Callable]] = None, | |||
**format_kwargs, | |||
): | |||
"""Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to | |||
torch.utils.data.DataLoader. | |||
Args: | |||
preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process | |||
every sample of the dataset. The output type of processors is dict, and each numeric field of the dict | |||
will be used as a field of torch.utils.data.Dataset. | |||
columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the | |||
preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None, | |||
the output fields of processors will also be added. | |||
format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`. | |||
Returns: | |||
:class:`torch.utils.data.Dataset`
""" | |||
if not TORCH_AVAILABLE: | |||
raise ImportError( | |||
'The function to_torch_dataset requires pytorch to be installed' | |||
) | |||
if preprocessors is not None: | |||
return self.to_torch_dataset_with_processors(preprocessors) | |||
else: | |||
self._hf_ds.reset_format() | |||
self._hf_ds.set_format( | |||
type='torch', columns=columns, format_kwargs=format_kwargs) | |||
return self._hf_ds | |||
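A sketch of feeding the result to a DataLoader; `ds` and `preprocessor` are hypothetical (any PyDataset and any callable returning numeric dicts):

```python
import torch

# With preprocessors, an IterableDataset is returned.
torch_ds = ds.to_torch_dataset(preprocessors=preprocessor)
loader = torch.utils.data.DataLoader(torch_ds, batch_size=8)
batch = next(iter(loader))
```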
def to_tf_dataset_with_processors( | |||
self, | |||
batch_size: int, | |||
shuffle: bool, | |||
preprocessors: Union[Callable, List[Callable]], | |||
drop_remainder: bool = None, | |||
prefetch: bool = True, | |||
label_cols: Union[str, List[str]] = None, | |||
columns: Union[str, List[str]] = None, | |||
): | |||
preprocessor_list = preprocessors if isinstance( | |||
preprocessors, list) else [preprocessors] | |||
label_cols = format_list(label_cols) | |||
columns = format_list(columns) | |||
cols_to_retain = list(set(label_cols + columns)) | |||
retained_columns = [ | |||
key for key in self._hf_ds.features.keys() if key in cols_to_retain | |||
] | |||
import tensorflow as tf | |||
tf_dataset = tf.data.Dataset.from_tensor_slices( | |||
np.arange(len(self._hf_ds), dtype=np.int64)) | |||
if shuffle: | |||
tf_dataset = tf_dataset.shuffle(buffer_size=len(self._hf_ds)) | |||
def func(i, return_dict=False): | |||
i = int(i) | |||
res = {k: np.array(self._hf_ds[i][k]) for k in retained_columns} | |||
for preprocessor in preprocessor_list: | |||
# TODO preprocessor output may have the same key | |||
res.update({ | |||
k: np.array(v) | |||
for k, v in preprocessor(self._hf_ds[i]).items() | |||
}) | |||
if return_dict: | |||
return res | |||
return tuple(list(res.values())) | |||
sample_res = func(0, True) | |||
@tf.function(input_signature=[tf.TensorSpec(None, tf.int64)]) | |||
def fetch_function(i): | |||
output = tf.numpy_function( | |||
func, | |||
inp=[i], | |||
Tout=[ | |||
tf.dtypes.as_dtype(val.dtype) | |||
for val in sample_res.values() | |||
], | |||
) | |||
return {key: output[i] for i, key in enumerate(sample_res)} | |||
tf_dataset = tf_dataset.map( | |||
fetch_function, num_parallel_calls=tf.data.AUTOTUNE) | |||
if label_cols: | |||
def split_features_and_labels(input_batch): | |||
labels = { | |||
key: tensor | |||
for key, tensor in input_batch.items() if key in label_cols | |||
} | |||
if len(input_batch) == 1: | |||
input_batch = next(iter(input_batch.values())) | |||
if len(labels) == 1: | |||
labels = next(iter(labels.values())) | |||
return input_batch, labels | |||
tf_dataset = tf_dataset.map(split_features_and_labels) | |||
elif len(columns) == 1: | |||
tf_dataset = tf_dataset.map(lambda x: next(iter(x.values()))) | |||
if batch_size > 1: | |||
tf_dataset = tf_dataset.batch( | |||
batch_size, drop_remainder=drop_remainder) | |||
if prefetch: | |||
tf_dataset = tf_dataset.prefetch(tf.data.experimental.AUTOTUNE) | |||
return tf_dataset | |||
def to_tf_dataset( | |||
self, | |||
batch_size: int, | |||
shuffle: bool, | |||
preprocessors: Union[Callable, List[Callable]] = None, | |||
columns: Union[str, List[str]] = None, | |||
collate_fn: Callable = None, | |||
drop_remainder: bool = None, | |||
collate_fn_args: Dict[str, Any] = None, | |||
label_cols: Union[str, List[str]] = None, | |||
prefetch: bool = True, | |||
): | |||
"""Create a tf.data.Dataset from the MS Dataset. This tf.data.Dataset can be passed to tf methods like | |||
model.fit() or model.predict(). | |||
Args: | |||
batch_size (int): Number of samples in a single batch. | |||
shuffle(bool): Shuffle the dataset order. | |||
preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process | |||
every sample of the dataset. The output type of processors is dict, and each field of the dict will be | |||
used as a field of the tf.data.Dataset. If the `preprocessors` is None, the `collate_fn`
shouldn't be None. | |||
columns (str or List[str], default None): Dataset column(s) to be loaded. If the preprocessor is None, | |||
the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of | |||
processors will also be added. | |||
collate_fn(Callable, default None): A callable object used to collect lists of samples into a batch. If | |||
the `preprocessors` is None, the `collate_fn` shouldn't be None. | |||
drop_remainder(bool, default None): Drop the last incomplete batch when loading. | |||
collate_fn_args (Dict, optional): A `dict` of arguments to be passed to the`collate_fn`. | |||
label_cols (str or List[str], default None): Dataset column(s) to load as labels.
prefetch (bool, default True): Prefetch data. | |||
Returns: | |||
:class:`tf.data.Dataset` | |||
""" | |||
if not TF_AVAILABLE: | |||
raise ImportError( | |||
'The function to_tf_dataset requires Tensorflow to be installed.' | |||
) | |||
if preprocessors is not None: | |||
return self.to_tf_dataset_with_processors( | |||
batch_size, | |||
shuffle, | |||
preprocessors, | |||
drop_remainder=drop_remainder, | |||
prefetch=prefetch, | |||
label_cols=label_cols, | |||
columns=columns) | |||
if collate_fn is None: | |||
logger.error( | |||
'The `preprocessors` and the `collate_fn` should not be both None.'
) | |||
return None | |||
self._hf_ds.reset_format() | |||
return self._hf_ds.to_tf_dataset( | |||
columns, | |||
@@ -123,7 +389,6 @@ class PyDataset: | |||
drop_remainder=drop_remainder, | |||
collate_fn_args=collate_fn_args, | |||
label_cols=label_cols, | |||
prefetch=prefetch) | |||
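And the TensorFlow counterpart, again with a hypothetical `ds` and `preprocessor`:

```python
tf_ds = ds.to_tf_dataset(
    batch_size=8,
    shuffle=True,
    preprocessors=preprocessor,
    columns=['input_ids'],
    label_cols=['label'])
for features, labels in tf_ds.take(1):
    print(features, labels)
```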
def to_hf_dataset(self) -> Dataset: | |||
@@ -0,0 +1,66 @@ | |||
import os | |||
from collections import defaultdict | |||
from typing import Optional | |||
import requests | |||
from modelscope.pydatasets.config import (DOWNLOADED_DATASETS_PATH, | |||
MS_HUB_ENDPOINT) | |||
from modelscope.utils.logger import get_logger | |||
logger = get_logger() | |||
class MsApi: | |||
def __init__(self, endpoint=MS_HUB_ENDPOINT): | |||
self.endpoint = endpoint | |||
def list_datasets(self): | |||
path = f'{self.endpoint}/api/v1/datasets' | |||
headers = None | |||
params = {} | |||
r = requests.get(path, params=params, headers=headers) | |||
r.raise_for_status() | |||
dataset_list = r.json()['Data'] | |||
return [x['Name'] for x in dataset_list] | |||
def fetch_dataset_scripts(self, | |||
dataset_name: str, | |||
version: Optional[str] = 'master', | |||
force_download=False): | |||
datahub_url = f'{self.endpoint}/api/v1/datasets?Query={dataset_name}' | |||
r = requests.get(datahub_url) | |||
r.raise_for_status() | |||
dataset_list = r.json()['Data'] | |||
if len(dataset_list) == 0: | |||
return None | |||
dataset_id = dataset_list[0]['Id'] | |||
version = version or 'master' | |||
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}' | |||
r = requests.get(datahub_url) | |||
r.raise_for_status() | |||
file_list = r.json()['Data']['Files'] | |||
cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name, | |||
version) | |||
os.makedirs(cache_dir, exist_ok=True) | |||
local_paths = defaultdict(list) | |||
for file_info in file_list: | |||
file_path = file_info['Path'] | |||
if file_path.endswith('.py'): | |||
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \ | |||
f'Revision={version}&Path={file_path}' | |||
r = requests.get(datahub_url) | |||
r.raise_for_status() | |||
content = r.json()['Data']['Content'] | |||
local_path = os.path.join(cache_dir, file_path) | |||
if os.path.exists(local_path) and not force_download: | |||
logger.warning( | |||
f"Reusing dataset {dataset_name}'s python file ({local_path})" | |||
) | |||
local_paths['py'].append(local_path) | |||
continue | |||
with open(local_path, 'w') as f: | |||
f.write(content)
local_paths['py'].append(local_path) | |||
return local_paths |
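A quick sketch of the hub API wrapper in use (needs network access to the configured MS_HUB_ENDPOINT; the dataset name is a placeholder):

```python
api = MsApi()
print(api.list_datasets())                    # dataset names on the hub
scripts = api.fetch_dataset_scripts('squad')  # downloads the .py loader script(s)
print(scripts['py'])                          # local cached paths
```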
@@ -0,0 +1,42 @@ | |||
""" | |||
Define TTS exceptions | |||
""" | |||
class TtsException(Exception): | |||
""" | |||
TTS exception class. | |||
""" | |||
pass | |||
class TtsFrontendException(TtsException): | |||
""" | |||
TTS frontend module level exceptions. | |||
""" | |||
pass | |||
class TtsFrontendInitializeFailedException(TtsFrontendException): | |||
""" | |||
If the TTS frontend resource is invalid or does not exist, this exception will be raised.
""" | |||
pass | |||
class TtsFrontendLanguageTypeInvalidException(TtsFrontendException): | |||
""" | |||
If language type is invalid, this exception will be raised. | |||
""" | |||
class TtsVocoderException(TtsException): | |||
""" | |||
Vocoder exception | |||
""" | |||
class TtsVocoderMelspecShapeMismatchException(TtsVocoderException): | |||
""" | |||
If the vocoder's input melspec shape mismatches, this exception will be raised.
""" |
@@ -28,8 +28,10 @@ class Tasks(object): | |||
image_editing = 'image-editing' | |||
image_generation = 'image-generation' | |||
image_matting = 'image-matting' | |||
ocr_detection = 'ocr-detection' | |||
# nlp tasks | |||
word_segmentation = 'word-segmentation' | |||
sentiment_analysis = 'sentiment-analysis' | |||
sentence_similarity = 'sentence-similarity' | |||
text_classification = 'text-classification' | |||
@@ -67,7 +67,6 @@ class Registry(object): | |||
if module_name in self._modules[group_key]: | |||
raise KeyError(f'{module_name} is already registered in ' | |||
f'{self._name}[{group_key}]') | |||
self._modules[group_key][module_name] = module_cls | |||
module_cls.group_key = group_key | |||
@@ -2,6 +2,9 @@ | |||
# Copyright (c) Alibaba, Inc. and its affiliates. | |||
import os | |||
import unittest | |||
from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE | |||
TEST_LEVEL = 2 | |||
TEST_LEVEL_STR = 'TEST_LEVEL' | |||
@@ -15,6 +18,18 @@ def test_level(): | |||
return TEST_LEVEL | |||
def require_tf(test_case): | |||
if not TF_AVAILABLE: | |||
test_case = unittest.skip('test requires TensorFlow')(test_case) | |||
return test_case | |||
def require_torch(test_case): | |||
if not TORCH_AVAILABLE: | |||
test_case = unittest.skip('test requires PyTorch')(test_case) | |||
return test_case | |||
def set_test_level(level: int): | |||
global TEST_LEVEL | |||
TEST_LEVEL = level |
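A sketch of the new skip decorators in a test case:

```python
class FrameworkDependentTest(unittest.TestCase):

    @require_torch
    def test_torch_path(self):
        import torch
        self.assertEqual(torch.ones(2).sum().item(), 2.0)

    @require_tf
    def test_tf_path(self):
        import tensorflow as tf
        self.assertEqual(int(tf.reduce_sum(tf.ones(2))), 2)
```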
@@ -2,4 +2,5 @@ | |||
-r requirements/pipeline.txt | |||
-r requirements/multi-modal.txt | |||
-r requirements/nlp.txt | |||
-r requirements/audio.txt | |||
-r requirements/cv.txt |
@@ -0,0 +1,26 @@ | |||
#tts | |||
h5py==2.10.0 | |||
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp36-cp36m-linux_x86_64.whl | |||
https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp37-cp37m-linux_x86_64.whl | |||
https://swap.oss-cn-hangzhou.aliyuncs.com/Jiaqi%2Fmaas%2Ftts%2Frequirements%2Fpytorch_wavelets-1.3.0-py3-none-any.whl?Expires=1685688388&OSSAccessKeyId=LTAI4Ffebq4d9jTVDwiSbY4L&Signature=jcQbg5EZ%2Bdys3%2F4BRn3srrKLdIg%3D | |||
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp38-cp38-linux_x86_64.whl | |||
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp39-cp39-linux_x86_64.whl | |||
inflect | |||
keras==2.2.4 | |||
librosa | |||
lxml | |||
matplotlib | |||
nara_wpe | |||
numpy==1.18.* | |||
protobuf==3.20.* | |||
ptflops | |||
PyWavelets>=1.0.0 | |||
scikit-learn==0.23.2 | |||
sox | |||
tensorboard | |||
tensorflow==1.15.* | |||
torch==1.10.* | |||
torchaudio | |||
torchvision | |||
tqdm | |||
unidecode |
@@ -1 +1,2 @@ | |||
easydict | |||
tf_slim |
@@ -1,12 +1,13 @@ | |||
addict | |||
datasets | |||
easydict | |||
https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.2.dev0-py3-none-any.whl | |||
https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.4.dev0-py3-none-any.whl | |||
numpy | |||
opencv-python-headless | |||
Pillow>=6.2.0 | |||
pyyaml | |||
requests | |||
scipy | |||
tokenizers<=0.10.3 | |||
transformers<=4.16.2 | |||
yapf |
@@ -11,6 +11,7 @@ default_section = THIRDPARTY | |||
BASED_ON_STYLE = pep8 | |||
BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true | |||
SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true | |||
SPLIT_BEFORE_ARITHMETIC_OPERATOR = true | |||
[codespell] | |||
skip = *.ipynb | |||
@@ -20,5 +21,5 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids | |||
[flake8] | |||
select = B,C,E,F,P,T4,W,B9 | |||
max-line-length = 120 | |||
ignore = F401,F821 | |||
ignore = F401,F821,W503 | |||
exclude = docs/src,*.pyi,.git |
@@ -80,8 +80,7 @@ class CustomPipelineTest(unittest.TestCase): | |||
pipe2 = pipeline(dummy_task) | |||
self.assertTrue(type(pipe) is type(pipe2)) | |||
img_url = 'data/test/images/image1.jpg' | |||
output = pipe(img_url) | |||
self.assertEqual(output['filename'], img_url) | |||
self.assertEqual(output['output_png'].shape, (318, 512, 3)) | |||