Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9409213master
@@ -1,8 +1,5 @@ | |||||
from typing import Dict | from typing import Dict | ||||
import numpy as np | |||||
from rouge_score import rouge_scorer | |||||
from ..metainfo import Metrics | from ..metainfo import Metrics | ||||
from ..utils.registry import default_group | from ..utils.registry import default_group | ||||
from .base import Metric | from .base import Metric | ||||
@@ -18,6 +15,7 @@ class TextGenerationMetric(Metric): | |||||
def __init__(self): | def __init__(self): | ||||
self.preds = [] | self.preds = [] | ||||
self.tgts = [] | self.tgts = [] | ||||
from rouge_score import rouge_scorer | |||||
self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) | self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) | ||||
def add(self, outputs: Dict, inputs: Dict): | def add(self, outputs: Dict, inputs: Dict): | ||||
@@ -1,7 +1,4 @@ | |||||
import tensorflow as tf | import tensorflow as tf | ||||
from tensorflow.contrib.cudnn_rnn import CudnnLSTM | |||||
from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops | |||||
from tensorflow.contrib.rnn import LSTMBlockCell | |||||
def encoder_prenet(inputs, | def encoder_prenet(inputs, | ||||
@@ -207,6 +204,7 @@ def conv_and_lstm(inputs, | |||||
embedded_inputs_speaker, | embedded_inputs_speaker, | ||||
mask=None, | mask=None, | ||||
scope='conv_and_lstm'): | scope='conv_and_lstm'): | ||||
from tensorflow.contrib.rnn import LSTMBlockCell | |||||
x = inputs | x = inputs | ||||
with tf.variable_scope(scope): | with tf.variable_scope(scope): | ||||
for i in range(n_conv_layers): | for i in range(n_conv_layers): | ||||
@@ -244,6 +242,7 @@ def conv_and_lstm_dec(inputs, | |||||
mask=None, | mask=None, | ||||
scope='conv_and_lstm'): | scope='conv_and_lstm'): | ||||
x = inputs | x = inputs | ||||
from tensorflow.contrib.rnn import LSTMBlockCell | |||||
with tf.variable_scope(scope): | with tf.variable_scope(scope): | ||||
for i in range(n_conv_layers): | for i in range(n_conv_layers): | ||||
x = conv1d( | x = conv1d( | ||||
@@ -1,9 +1,8 @@ | |||||
import numpy as np | import numpy as np | ||||
import tensorflow as tf | import tensorflow as tf | ||||
from tensorflow.contrib.seq2seq import Helper | |||||
class VarTestHelper(Helper): | |||||
class VarTestHelper(tf.contrib.seq2seq.Helper): | |||||
def __init__(self, batch_size, inputs, dim): | def __init__(self, batch_size, inputs, dim): | ||||
with tf.name_scope('VarTestHelper'): | with tf.name_scope('VarTestHelper'): | ||||
@@ -44,7 +43,7 @@ class VarTestHelper(Helper): | |||||
return (finished, next_inputs, state) | return (finished, next_inputs, state) | ||||
class VarTrainingHelper(Helper): | |||||
class VarTrainingHelper(tf.contrib.seq2seq.Helper): | |||||
def __init__(self, targets, inputs, dim): | def __init__(self, targets, inputs, dim): | ||||
with tf.name_scope('VarTrainingHelper'): | with tf.name_scope('VarTrainingHelper'): | ||||
@@ -86,7 +85,7 @@ class VarTrainingHelper(Helper): | |||||
return (finished, next_inputs, state) | return (finished, next_inputs, state) | ||||
class VarTrainingSSHelper(Helper): | |||||
class VarTrainingSSHelper(tf.contrib.seq2seq.Helper): | |||||
def __init__(self, targets, inputs, dim, global_step, schedule_begin, | def __init__(self, targets, inputs, dim, global_step, schedule_begin, | ||||
alpha, decay_steps): | alpha, decay_steps): | ||||
@@ -1,14 +1,11 @@ | |||||
import numpy as np | |||||
import tensorflow as tf | import tensorflow as tf | ||||
from tensorflow.contrib.rnn import RNNCell | |||||
from tensorflow.contrib.seq2seq import AttentionWrapperState | |||||
from tensorflow.python.ops import rnn_cell_impl | from tensorflow.python.ops import rnn_cell_impl | ||||
from .am_models import prenet | from .am_models import prenet | ||||
class VarPredictorCell(RNNCell): | |||||
'''Wrapper wrapper knock knock.''' | |||||
class VarPredictorCell(tf.contrib.rnn.RNNCell): | |||||
"""Wrapper wrapper knock knock.""" | |||||
def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | ||||
super(VarPredictorCell, self).__init__() | super(VarPredictorCell, self).__init__() | ||||
@@ -33,7 +30,7 @@ class VarPredictorCell(RNNCell): | |||||
]) | ]) | ||||
def call(self, inputs, state): | def call(self, inputs, state): | ||||
'''Run the Tacotron2 super decoder cell.''' | |||||
"""Run the Tacotron2 super decoder cell.""" | |||||
super_cell_out, decoder_state = state | super_cell_out, decoder_state = state | ||||
# split | # split | ||||
@@ -61,8 +58,8 @@ class VarPredictorCell(RNNCell): | |||||
return new_super_cell_out, new_states | return new_super_cell_out, new_states | ||||
class DurPredictorCell(RNNCell): | |||||
'''Wrapper wrapper knock knock.''' | |||||
class DurPredictorCell(tf.contrib.rnn.RNNCell): | |||||
"""Wrapper wrapper knock knock.""" | |||||
def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | ||||
super(DurPredictorCell, self).__init__() | super(DurPredictorCell, self).__init__() | ||||
@@ -87,7 +84,7 @@ class DurPredictorCell(RNNCell): | |||||
]) | ]) | ||||
def call(self, inputs, state): | def call(self, inputs, state): | ||||
'''Run the Tacotron2 super decoder cell.''' | |||||
"""Run the Tacotron2 super decoder cell.""" | |||||
super_cell_out, decoder_state = state | super_cell_out, decoder_state = state | ||||
# split | # split | ||||
@@ -117,8 +114,8 @@ class DurPredictorCell(RNNCell): | |||||
return new_super_cell_out, new_states | return new_super_cell_out, new_states | ||||
class DurPredictorCECell(RNNCell): | |||||
'''Wrapper wrapper knock knock.''' | |||||
class DurPredictorCECell(tf.contrib.rnn.RNNCell): | |||||
"""Wrapper wrapper knock knock.""" | |||||
def __init__(self, var_predictor_cell, is_training, dim, prenet_units, | def __init__(self, var_predictor_cell, is_training, dim, prenet_units, | ||||
max_dur, dur_embedding_dim): | max_dur, dur_embedding_dim): | ||||
@@ -146,7 +143,7 @@ class DurPredictorCECell(RNNCell): | |||||
]) | ]) | ||||
def call(self, inputs, state): | def call(self, inputs, state): | ||||
'''Run the Tacotron2 super decoder cell.''' | |||||
"""Run the Tacotron2 super decoder cell.""" | |||||
super_cell_out, decoder_state = state | super_cell_out, decoder_state = state | ||||
# split | # split | ||||
@@ -181,8 +178,8 @@ class DurPredictorCECell(RNNCell): | |||||
return new_super_cell_out, new_states | return new_super_cell_out, new_states | ||||
class VarPredictorCell2(RNNCell): | |||||
'''Wrapper wrapper knock knock.''' | |||||
class VarPredictorCell2(tf.contrib.rnn.RNNCell): | |||||
"""Wrapper wrapper knock knock.""" | |||||
def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | def __init__(self, var_predictor_cell, is_training, dim, prenet_units): | ||||
super(VarPredictorCell2, self).__init__() | super(VarPredictorCell2, self).__init__() | ||||
@@ -1,14 +1,8 @@ | |||||
import tensorflow as tf | import tensorflow as tf | ||||
from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell | |||||
from tensorflow.contrib.seq2seq import BasicDecoder | |||||
from tensorflow.python.ops.ragged.ragged_util import repeat | from tensorflow.python.ops.ragged.ragged_util import repeat | ||||
from .am_models import conv_prenet, decoder_prenet, encoder_prenet | |||||
from .fsmn_encoder import FsmnEncoderV2 | from .fsmn_encoder import FsmnEncoderV2 | ||||
from .helpers import VarTestHelper, VarTrainingHelper | |||||
from .position import (BatchSinusodalPositionalEncoding, | |||||
SinusodalPositionalEncoding) | |||||
from .rnn_wrappers import DurPredictorCell, VarPredictorCell | |||||
from .position import BatchSinusodalPositionalEncoding | |||||
from .self_attention_decoder import SelfAttentionDecoder | from .self_attention_decoder import SelfAttentionDecoder | ||||
from .self_attention_encoder import SelfAttentionEncoder | from .self_attention_encoder import SelfAttentionEncoder | ||||
@@ -32,7 +26,7 @@ class RobuTrans(): | |||||
duration_scales=None, | duration_scales=None, | ||||
energy_contours=None, | energy_contours=None, | ||||
energy_scales=None): | energy_scales=None): | ||||
'''Initializes the model for inference. | |||||
"""Initializes the model for inference. | |||||
Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields. | Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields. | ||||
@@ -46,7 +40,10 @@ class RobuTrans(): | |||||
mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number | mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number | ||||
of steps in the output time series, M is num_mels, and values are entries in the mel | of steps in the output time series, M is num_mels, and values are entries in the mel | ||||
spectrogram. Only needed for training. | spectrogram. Only needed for training. | ||||
''' | |||||
""" | |||||
from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell | |||||
from tensorflow.contrib.seq2seq import BasicDecoder | |||||
with tf.variable_scope('inference') as _: | with tf.variable_scope('inference') as _: | ||||
is_training = mel_targets is not None | is_training = mel_targets is not None | ||||
batch_size = tf.shape(inputs)[0] | batch_size = tf.shape(inputs)[0] | ||||
@@ -229,17 +226,20 @@ class RobuTrans(): | |||||
LSTMBlockCell(hp.predictor_lstm_units), | LSTMBlockCell(hp.predictor_lstm_units), | ||||
LSTMBlockCell(hp.predictor_lstm_units) | LSTMBlockCell(hp.predictor_lstm_units) | ||||
], state_is_tuple=True) # yapf:disable | ], state_is_tuple=True) # yapf:disable | ||||
from .rnn_wrappers import DurPredictorCell | |||||
duration_output_cell = DurPredictorCell( | duration_output_cell = DurPredictorCell( | ||||
duration_predictor_cell, is_training, 1, | duration_predictor_cell, is_training, 1, | ||||
hp.predictor_prenet_units) | hp.predictor_prenet_units) | ||||
duration_predictor_init_state = duration_output_cell.zero_state( | duration_predictor_init_state = duration_output_cell.zero_state( | ||||
batch_size=batch_size, dtype=tf.float32) | batch_size=batch_size, dtype=tf.float32) | ||||
if is_training: | if is_training: | ||||
from .helpers import VarTrainingHelper | |||||
duration_helper = VarTrainingHelper( | duration_helper = VarTrainingHelper( | ||||
tf.expand_dims( | tf.expand_dims( | ||||
tf.log(tf.cast(durations, tf.float32) + 1), | tf.log(tf.cast(durations, tf.float32) + 1), | ||||
axis=2), dur_inputs, 1) | axis=2), dur_inputs, 1) | ||||
else: | else: | ||||
from .helpers import VarTestHelper | |||||
duration_helper = VarTestHelper(batch_size, dur_inputs, 1) | duration_helper = VarTestHelper(batch_size, dur_inputs, 1) | ||||
( | ( | ||||
duration_outputs, _ | duration_outputs, _ | ||||
@@ -1,14 +1,10 @@ | |||||
from __future__ import (absolute_import, division, print_function, | from __future__ import (absolute_import, division, print_function, | ||||
unicode_literals) | unicode_literals) | ||||
import io | |||||
import os | import os | ||||
import time | |||||
import zipfile | import zipfile | ||||
from typing import Any, Dict, Optional, Union | |||||
import json | import json | ||||
import numpy as np | import numpy as np | ||||
import torch | |||||
from modelscope.metainfo import Models | from modelscope.metainfo import Models | ||||
from modelscope.models.base import Model | from modelscope.models.base import Model | ||||
@@ -16,8 +12,8 @@ from modelscope.models.builder import MODELS | |||||
from modelscope.utils.audio.tts_exceptions import ( | from modelscope.utils.audio.tts_exceptions import ( | ||||
TtsFrontendInitializeFailedException, | TtsFrontendInitializeFailedException, | ||||
TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion, | TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion, | ||||
TtsVocoderMelspecShapeMismatchException, TtsVoiceNotExistsException) | |||||
from modelscope.utils.constant import ModelFile, Tasks | |||||
TtsVoiceNotExistsException) | |||||
from modelscope.utils.constant import Tasks | |||||
from .voice import Voice | from .voice import Voice | ||||
import tensorflow as tf # isort:skip | import tensorflow as tf # isort:skip | ||||
@@ -23,8 +23,8 @@ logger = get_logger() | |||||
class Pipeline(ABC): | class Pipeline(ABC): | ||||
def initiate_single_model(self, model): | def initiate_single_model(self, model): | ||||
logger.info(f'initiate model from {model}') | |||||
if isinstance(model, str) and is_official_hub_path(model): | if isinstance(model, str) and is_official_hub_path(model): | ||||
logger.info(f'initiate model from location {model}.') | |||||
# expecting model has been prefetched to local cache beforehand | # expecting model has been prefetched to local cache beforehand | ||||
return Model.from_pretrained( | return Model.from_pretrained( | ||||
model, model_prefetched=True) if is_model(model) else model | model, model_prefetched=True) if is_model(model) else model | ||||
@@ -1,11 +1,9 @@ | |||||
import os.path as osp | import os.path as osp | ||||
from typing import Any, Dict | from typing import Any, Dict | ||||
import decord | |||||
import numpy as np | import numpy as np | ||||
import torch | import torch | ||||
import torchvision.transforms.functional as TF | import torchvision.transforms.functional as TF | ||||
from decord import VideoReader, cpu | |||||
from PIL import Image | from PIL import Image | ||||
from modelscope.metainfo import Pipelines | from modelscope.metainfo import Pipelines | ||||
@@ -49,6 +47,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline): | |||||
logger.info('load model done') | logger.info('load model done') | ||||
def preprocess(self, input: Input) -> Dict[str, Any]: | def preprocess(self, input: Input) -> Dict[str, Any]: | ||||
import decord | |||||
decord.bridge.set_bridge('native') | decord.bridge.set_bridge('native') | ||||
transforms = VCompose([ | transforms = VCompose([ | ||||
@@ -60,7 +59,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline): | |||||
clip_len = (self.cfg.DATA.video_frames | clip_len = (self.cfg.DATA.video_frames | ||||
- 1) * self.cfg.DATA.video_stride + 1 | - 1) * self.cfg.DATA.video_stride + 1 | ||||
vr = VideoReader(input, ctx=cpu(0)) | |||||
vr = decord.VideoReader(input, ctx=decord.cpu(0)) | |||||
if len(vr) <= clip_len: | if len(vr) <= clip_len: | ||||
init_frames = np.zeros(self.cfg.DATA.multi_crop, dtype=int) | init_frames = np.zeros(self.cfg.DATA.multi_crop, dtype=int) | ||||
else: | else: | ||||
@@ -16,11 +16,6 @@ from ..base import Pipeline | |||||
from ..builder import PIPELINES | from ..builder import PIPELINES | ||||
from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils | from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils | ||||
if tf.__version__ >= '2.0': | |||||
import tf_slim as slim | |||||
else: | |||||
from tensorflow.contrib import slim | |||||
if tf.__version__ >= '2.0': | if tf.__version__ >= '2.0': | ||||
tf = tf.compat.v1 | tf = tf.compat.v1 | ||||
tf.compat.v1.disable_eager_execution() | tf.compat.v1.disable_eager_execution() | ||||
@@ -1,15 +1,11 @@ | |||||
import math | import math | ||||
import os | |||||
import random | import random | ||||
import decord | |||||
import numpy as np | import numpy as np | ||||
import torch | import torch | ||||
import torch.nn as nn | |||||
import torch.utils.data | import torch.utils.data | ||||
import torch.utils.dlpack as dlpack | import torch.utils.dlpack as dlpack | ||||
import torchvision.transforms._transforms_video as transforms | import torchvision.transforms._transforms_video as transforms | ||||
from decord import VideoReader | |||||
from torchvision.transforms import Compose | from torchvision.transforms import Compose | ||||
@@ -128,6 +124,7 @@ def _decode_video(cfg, path): | |||||
Returns: | Returns: | ||||
frames (Tensor): video tensor data | frames (Tensor): video tensor data | ||||
""" | """ | ||||
from decord import VideoReader | |||||
vr = VideoReader(path) | vr = VideoReader(path) | ||||
num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS | num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS | ||||