Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9409213 (master)
@@ -1,8 +1,5 @@
 from typing import Dict

-import numpy as np
-from rouge_score import rouge_scorer

 from ..metainfo import Metrics
 from ..utils.registry import default_group
 from .base import Metric
@@ -18,6 +15,7 @@ class TextGenerationMetric(Metric):
     def __init__(self):
         self.preds = []
         self.tgts = []
+        from rouge_score import rouge_scorer
         self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

     def add(self, outputs: Dict, inputs: Dict):
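Note: this hunk moves the rouge_score import from module scope into TextGenerationMetric.__init__, so importing the metrics package no longer requires rouge_score to be installed; the cost moves to first instantiation. A minimal standalone sketch of the pattern (the class name, error message, and evaluate() aggregation below are illustrative, not the modelscope code):

from typing import Dict, List


class LazyRougeMetric:
    """Computes ROUGE-L; rouge_score is imported only on first use."""

    def __init__(self):
        self.preds: List[str] = []
        self.tgts: List[str] = []
        # Deferred import: resolved here, not at module load, so a missing
        # rouge_score package only fails when the metric is actually used.
        try:
            from rouge_score import rouge_scorer
        except ImportError as e:
            raise ImportError(
                'LazyRougeMetric requires rouge_score: pip install rouge-score'
            ) from e
        self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    def add(self, preds: List[str], tgts: List[str]):
        self.preds.extend(preds)
        self.tgts.extend(tgts)

    def evaluate(self) -> Dict[str, float]:
        # scorer.score(target, prediction) returns {'rougeL': Score(...)}
        scores = [
            self.scorer.score(t, p)['rougeL'].fmeasure
            for t, p in zip(self.tgts, self.preds)
        ]
        return {'rougeL': sum(scores) / max(len(scores), 1)}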
@@ -1,7 +1,4 @@
 import tensorflow as tf
-from tensorflow.contrib.cudnn_rnn import CudnnLSTM
-from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
-from tensorflow.contrib.rnn import LSTMBlockCell

 def encoder_prenet(inputs,
@@ -207,6 +204,7 @@ def conv_and_lstm(inputs,
                   embedded_inputs_speaker,
                   mask=None,
                   scope='conv_and_lstm'):
+    from tensorflow.contrib.rnn import LSTMBlockCell
     x = inputs
     with tf.variable_scope(scope):
         for i in range(n_conv_layers):
@@ -244,6 +242,7 @@ def conv_and_lstm_dec(inputs,
                       mask=None,
                       scope='conv_and_lstm'):
     x = inputs
+    from tensorflow.contrib.rnn import LSTMBlockCell
     with tf.variable_scope(scope):
         for i in range(n_conv_layers):
             x = conv1d(
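Note: the same pattern applied to tensorflow.contrib symbols. A function-level import is essentially free after the first call, because Python caches loaded modules in sys.modules; only the first call pays the load cost. A runnable illustration (json stands in for a heavy optional dependency):

import sys
import time


def process(data):
    # Deferred import: the dependency is resolved on the first call only.
    # Afterwards `import` is a sys.modules dict lookup, so the overhead
    # on the hot path is negligible.
    import json  # stand-in for a heavy optional dependency

    return json.dumps(data)


start = time.perf_counter()
process({'a': 1})              # first call pays the import cost
assert 'json' in sys.modules   # module is now cached
process({'b': 2})              # subsequent calls are cheap
print(f'total: {time.perf_counter() - start:.6f}s')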
@@ -1,9 +1,8 @@
 import numpy as np
 import tensorflow as tf
-from tensorflow.contrib.seq2seq import Helper

-class VarTestHelper(Helper):
+class VarTestHelper(tf.contrib.seq2seq.Helper):

     def __init__(self, batch_size, inputs, dim):
         with tf.name_scope('VarTestHelper'):
@@ -44,7 +43,7 @@ class VarTestHelper(Helper):
         return (finished, next_inputs, state)

-class VarTrainingHelper(Helper):
+class VarTrainingHelper(tf.contrib.seq2seq.Helper):

     def __init__(self, targets, inputs, dim):
         with tf.name_scope('VarTrainingHelper'):
@@ -86,7 +85,7 @@ class VarTrainingHelper(Helper):
         return (finished, next_inputs, state)

-class VarTrainingSSHelper(Helper):
+class VarTrainingSSHelper(tf.contrib.seq2seq.Helper):

     def __init__(self, targets, inputs, dim, global_step, schedule_begin,
                  alpha, decay_steps):
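Note: unlike the function-level moves above, switching the base class from the imported Helper name to tf.contrib.seq2seq.Helper does not defer anything: a base-class expression is evaluated when the class statement executes, which is still module import time. The change only removes the separate import line. A small pure-Python demonstration of definition-time evaluation:

class Registry:
    class Base:
        pass


def trace(name):
    print(f'evaluating base class for {name}')
    return Registry.Base


# The base-class expression runs as soon as this statement executes,
# not when the first instance is created.
class MyHelper(trace('MyHelper')):  # prints immediately at definition time
    pass


print('class created; no instance yet')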
@@ -1,14 +1,11 @@
 import numpy as np
 import tensorflow as tf
-from tensorflow.contrib.rnn import RNNCell
-from tensorflow.contrib.seq2seq import AttentionWrapperState
-from tensorflow.python.ops import rnn_cell_impl

 from .am_models import prenet

-class VarPredictorCell(RNNCell):
-    '''Wrapper wrapper knock knock.'''
+class VarPredictorCell(tf.contrib.rnn.RNNCell):
+    """Wrapper wrapper knock knock."""

     def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
         super(VarPredictorCell, self).__init__()
@@ -33,7 +30,7 @@ class VarPredictorCell(RNNCell):
         ])

     def call(self, inputs, state):
-        '''Run the Tacotron2 super decoder cell.'''
+        """Run the Tacotron2 super decoder cell."""
         super_cell_out, decoder_state = state
         # split
@@ -61,8 +58,8 @@ class VarPredictorCell(RNNCell):
         return new_super_cell_out, new_states

-class DurPredictorCell(RNNCell):
-    '''Wrapper wrapper knock knock.'''
+class DurPredictorCell(tf.contrib.rnn.RNNCell):
+    """Wrapper wrapper knock knock."""

     def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
         super(DurPredictorCell, self).__init__()
@@ -87,7 +84,7 @@ class DurPredictorCell(RNNCell):
         ])

     def call(self, inputs, state):
-        '''Run the Tacotron2 super decoder cell.'''
+        """Run the Tacotron2 super decoder cell."""
         super_cell_out, decoder_state = state
         # split
@@ -117,8 +114,8 @@ class DurPredictorCell(RNNCell):
         return new_super_cell_out, new_states

-class DurPredictorCECell(RNNCell):
-    '''Wrapper wrapper knock knock.'''
+class DurPredictorCECell(tf.contrib.rnn.RNNCell):
+    """Wrapper wrapper knock knock."""

     def __init__(self, var_predictor_cell, is_training, dim, prenet_units,
                  max_dur, dur_embedding_dim):
@@ -146,7 +143,7 @@ class DurPredictorCECell(RNNCell):
         ])

     def call(self, inputs, state):
-        '''Run the Tacotron2 super decoder cell.'''
+        """Run the Tacotron2 super decoder cell."""
         super_cell_out, decoder_state = state
         # split
@@ -181,8 +178,8 @@ class DurPredictorCECell(RNNCell):
         return new_super_cell_out, new_states

-class VarPredictorCell2(RNNCell):
-    '''Wrapper wrapper knock knock.'''
+class VarPredictorCell2(tf.contrib.rnn.RNNCell):
+    """Wrapper wrapper knock knock."""

     def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
         super(VarPredictorCell2, self).__init__()
@@ -1,14 +1,8 @@
 import tensorflow as tf
-from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
-from tensorflow.contrib.seq2seq import BasicDecoder
-from tensorflow.python.ops.ragged.ragged_util import repeat

 from .am_models import conv_prenet, decoder_prenet, encoder_prenet
 from .fsmn_encoder import FsmnEncoderV2
-from .helpers import VarTestHelper, VarTrainingHelper
-from .position import (BatchSinusodalPositionalEncoding,
-                       SinusodalPositionalEncoding)
-from .rnn_wrappers import DurPredictorCell, VarPredictorCell
+from .position import BatchSinusodalPositionalEncoding
 from .self_attention_decoder import SelfAttentionDecoder
 from .self_attention_encoder import SelfAttentionEncoder
@@ -32,7 +26,7 @@ class RobuTrans():
                  duration_scales=None,
                  energy_contours=None,
                  energy_scales=None):
-        '''Initializes the model for inference.
+        """Initializes the model for inference.

         Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields.
@@ -46,7 +40,10 @@ class RobuTrans():
             mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
                 of steps in the output time series, M is num_mels, and values are entries in the mel
                 spectrogram. Only needed for training.
-        '''
+        """
+        from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
+        from tensorflow.contrib.seq2seq import BasicDecoder
         with tf.variable_scope('inference') as _:
             is_training = mel_targets is not None
             batch_size = tf.shape(inputs)[0]
@@ -229,17 +226,20 @@ class RobuTrans():
                     LSTMBlockCell(hp.predictor_lstm_units),
                     LSTMBlockCell(hp.predictor_lstm_units)
                 ], state_is_tuple=True)  # yapf:disable
+                from .rnn_wrappers import DurPredictorCell
                 duration_output_cell = DurPredictorCell(
                     duration_predictor_cell, is_training, 1,
                     hp.predictor_prenet_units)
                 duration_predictor_init_state = duration_output_cell.zero_state(
                     batch_size=batch_size, dtype=tf.float32)
                 if is_training:
+                    from .helpers import VarTrainingHelper
                     duration_helper = VarTrainingHelper(
                         tf.expand_dims(
                             tf.log(tf.cast(durations, tf.float32) + 1),
                             axis=2), dur_inputs, 1)
                 else:
+                    from .helpers import VarTestHelper
                     duration_helper = VarTestHelper(batch_size, dur_inputs, 1)
                 (
                     duration_outputs, _
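Note: in RobuTrans the tf.contrib and helper imports move inside initialize(), including into the is_training branches. One practical effect (an assumed motivation; the PR does not state it): the module itself becomes importable under TensorFlow 2.x, where tensorflow.contrib no longer exists, and only the TF1-only code path fails at call time. A hedged sketch of turning that deferred failure into an actionable error (the try/except wrapper is not in the PR):

def conv_and_lstm(inputs):
    # tensorflow.contrib exists only in TF 1.x. With the import deferred,
    # this module still imports cleanly under TF 2.x; only calling this
    # function fails, and the failure can carry a clear message.
    try:
        from tensorflow.contrib.rnn import LSTMBlockCell
    except ImportError as e:
        raise RuntimeError(
            'conv_and_lstm requires TensorFlow 1.x '
            '(tensorflow.contrib was removed in 2.x)') from e
    # ... build the conv + LSTM stack with LSTMBlockCell ...
    return LSTMBlockCell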
@@ -1,14 +1,10 @@
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)

 import io
 import os
 import time
 import zipfile
 from typing import Any, Dict, Optional, Union

 import json
-import numpy as np
-import torch

 from modelscope.metainfo import Models
 from modelscope.models.base import Model
@@ -16,8 +12,8 @@ from modelscope.models.builder import MODELS
 from modelscope.utils.audio.tts_exceptions import (
     TtsFrontendInitializeFailedException,
     TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion,
-    TtsVocoderMelspecShapeMismatchException, TtsVoiceNotExistsException)
-from modelscope.utils.constant import ModelFile, Tasks
+    TtsVoiceNotExistsException)
+from modelscope.utils.constant import Tasks
 from .voice import Voice

 import tensorflow as tf  # isort:skip
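Note on the trailing # isort:skip: it tells isort to leave that line where it is, so tensorflow stays at the end of the import block even after re-sorting. A common motive for pinning import position this way is to control native-library load order (for example, making sure torch loads before tensorflow); the PR does not state its reason, so treat that as an assumption. Usage sketch:

import torch

# The comment below pins this line: isort leaves `isort:skip` lines untouched.
import tensorflow as tf  # isort:skip

print(torch.__version__, tf.__version__)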
@@ -23,8 +23,8 @@ logger = get_logger()

 class Pipeline(ABC):

     def initiate_single_model(self, model):
-        logger.info(f'initiate model from {model}')
         if isinstance(model, str) and is_official_hub_path(model):
+            logger.info(f'initiate model from location {model}.')
             # expecting model has been prefetched to local cache beforehand
             return Model.from_pretrained(
                 model, model_prefetched=True) if is_model(model) else model
@@ -1,11 +1,9 @@
 import os.path as osp
 from typing import Any, Dict

-import decord
 import numpy as np
 import torch
 import torchvision.transforms.functional as TF
-from decord import VideoReader, cpu
 from PIL import Image

 from modelscope.metainfo import Pipelines
@@ -49,6 +47,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline):
         logger.info('load model done')

     def preprocess(self, input: Input) -> Dict[str, Any]:
+        import decord
         decord.bridge.set_bridge('native')
         transforms = VCompose([
@@ -60,7 +59,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline):
         clip_len = (self.cfg.DATA.video_frames
                     - 1) * self.cfg.DATA.video_stride + 1
-        vr = VideoReader(input, ctx=cpu(0))
+        vr = decord.VideoReader(input, ctx=decord.cpu(0))
         if len(vr) <= clip_len:
             init_frames = np.zeros(self.cfg.DATA.multi_crop, dtype=int)
         else:
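Note: decord becomes a call-time dependency of preprocess() rather than an import-time dependency of the pipeline module, and the reader and context are reached through the module (decord.VideoReader, decord.cpu), so no per-name imports are needed. A minimal standalone sketch of the same pattern (the uniform sampling below is illustrative; the pipeline's actual clip logic differs):

import numpy as np


def sample_frames(path: str, num_frames: int = 8) -> np.ndarray:
    """Decode a video and return uniformly sampled frames."""
    # Lazy import: the module containing this function loads even when
    # decord is not installed; only calling it requires the package.
    import decord
    decord.bridge.set_bridge('native')  # return decord NDArrays
    vr = decord.VideoReader(path, ctx=decord.cpu(0))
    idx = np.linspace(0, len(vr) - 1, num_frames).astype(int)
    return vr.get_batch(idx).asnumpy()  # (num_frames, H, W, 3) uint8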
@@ -16,11 +16,6 @@ from ..base import Pipeline
 from ..builder import PIPELINES
 from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils

-if tf.__version__ >= '2.0':
-    import tf_slim as slim
-else:
-    from tensorflow.contrib import slim
-
 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
     tf.compat.v1.disable_eager_execution()
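Note: the version-gated slim import is dropped, while the TF1-compatibility shim stays: under TF 2.x the module rebinds tf to tf.compat.v1 and disables eager execution so the graph-mode OCR code keeps running. (Worth noting that tf.__version__ >= '2.0' is a lexicographic string comparison; it distinguishes 1.x from 2.x but would misclassify a hypothetical 10.x release.) A standalone sketch of the shim with a tiny graph, runnable under either major version:

import tensorflow as tf

# Under TF 2.x, fall back to the v1 graph-mode API.
if tf.__version__ >= '2.0':
    tf = tf.compat.v1
    tf.disable_eager_execution()

g = tf.Graph()
with g.as_default():
    x = tf.placeholder(tf.float32, shape=[None, 1], name='x')
    y = x * 2.0

with tf.Session(graph=g) as sess:
    print(sess.run(y, feed_dict={x: [[1.0], [2.0]]}))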
@@ -1,15 +1,11 @@
 import math
 import os
 import random

-import decord
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.utils.data
 import torch.utils.dlpack as dlpack
 import torchvision.transforms._transforms_video as transforms
-from decord import VideoReader
 from torchvision.transforms import Compose
@@ -128,6 +124,7 @@ def _decode_video(cfg, path):

     Returns:
         frames (Tensor): video tensor data
     """
+    from decord import VideoReader
     vr = VideoReader(path)

     num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
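Note: _decode_video samples cfg.TEST.NUM_ENSEMBLE_VIEWS clips per video for multi-view test-time ensembling. A small illustrative helper for the start-index arithmetic (the exact sampling scheme in the file may differ):

import numpy as np


def clip_start_indices(video_len: int, clip_len: int,
                       num_clips: int) -> np.ndarray:
    """Evenly spaced clip starts for multi-view ensemble testing."""
    if video_len <= clip_len:
        # Degenerate case: the video is shorter than one clip,
        # so every view starts at frame 0.
        return np.zeros(num_clips, dtype=int)
    return np.linspace(0, video_len - clip_len, num_clips).astype(int)


print(clip_start_indices(video_len=300, clip_len=64, num_clips=10))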