Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9409213 (master)
@@ -1,8 +1,5 @@
 from typing import Dict

-import numpy as np
-from rouge_score import rouge_scorer

 from ..metainfo import Metrics
 from ..utils.registry import default_group
 from .base import Metric
@@ -18,6 +15,7 @@ class TextGenerationMetric(Metric):
     def __init__(self):
         self.preds = []
         self.tgts = []
+        from rouge_score import rouge_scorer
         self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

     def add(self, outputs: Dict, inputs: Dict):
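Note: this hunk moves the rouge_score import from module scope into TextGenerationMetric.__init__, so importing the metrics package no longer requires rouge_score to be installed; the cost moves to first instantiation. A minimal standalone sketch of the pattern (the class name, error message, and evaluate() aggregation below are illustrative, not the modelscope code):

from typing import Dict, List


class LazyRougeMetric:
    """Computes ROUGE-L; rouge_score is imported only on first use."""

    def __init__(self):
        self.preds: List[str] = []
        self.tgts: List[str] = []
        # Deferred import: resolved here, not at module load, so a missing
        # rouge_score package only fails when the metric is actually used.
        try:
            from rouge_score import rouge_scorer
        except ImportError as e:
            raise ImportError(
                'LazyRougeMetric requires rouge_score: pip install rouge-score'
            ) from e
        self.scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    def add(self, preds: List[str], tgts: List[str]):
        self.preds.extend(preds)
        self.tgts.extend(tgts)

    def evaluate(self) -> Dict[str, float]:
        # scorer.score(target, prediction) returns {'rougeL': Score(...)}
        scores = [
            self.scorer.score(t, p)['rougeL'].fmeasure
            for t, p in zip(self.tgts, self.preds)
        ]
        return {'rougeL': sum(scores) / max(len(scores), 1)}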
@@ -1,7 +1,4 @@
 import tensorflow as tf
-from tensorflow.contrib.cudnn_rnn import CudnnLSTM
-from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
-from tensorflow.contrib.rnn import LSTMBlockCell

 def encoder_prenet(inputs,
@@ -207,6 +204,7 @@ def conv_and_lstm(inputs,
                   embedded_inputs_speaker,
                   mask=None,
                   scope='conv_and_lstm'):
+    from tensorflow.contrib.rnn import LSTMBlockCell
     x = inputs
     with tf.variable_scope(scope):
         for i in range(n_conv_layers):
@@ -244,6 +242,7 @@ def conv_and_lstm_dec(inputs,
                       mask=None,
                       scope='conv_and_lstm'):
     x = inputs
+    from tensorflow.contrib.rnn import LSTMBlockCell
     with tf.variable_scope(scope):
         for i in range(n_conv_layers):
             x = conv1d(
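Note: the same pattern applied to tensorflow.contrib symbols. A function-level import is essentially free after the first call, because Python caches loaded modules in sys.modules; only the first call pays the load cost. A runnable illustration (json stands in for a heavy optional dependency):

import sys
import time


def process(data):
    # Deferred import: the dependency is resolved on the first call only.
    # Afterwards `import` is a sys.modules dict lookup, so the overhead
    # on the hot path is negligible.
    import json  # stand-in for a heavy optional dependency

    return json.dumps(data)


start = time.perf_counter()
process({'a': 1})              # first call pays the import cost
assert 'json' in sys.modules   # module is now cached
process({'b': 2})              # subsequent calls are cheap
print(f'total: {time.perf_counter() - start:.6f}s')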
@@ -1,9 +1,8 @@
 import numpy as np
 import tensorflow as tf
-from tensorflow.contrib.seq2seq import Helper

-class VarTestHelper(Helper):
+class VarTestHelper(tf.contrib.seq2seq.Helper):

     def __init__(self, batch_size, inputs, dim):
         with tf.name_scope('VarTestHelper'):
@@ -44,7 +43,7 @@ class VarTestHelper(Helper):
         return (finished, next_inputs, state)

-class VarTrainingHelper(Helper):
+class VarTrainingHelper(tf.contrib.seq2seq.Helper):

     def __init__(self, targets, inputs, dim):
         with tf.name_scope('VarTrainingHelper'):
@@ -86,7 +85,7 @@ class VarTrainingHelper(Helper):
         return (finished, next_inputs, state)

-class VarTrainingSSHelper(Helper):
+class VarTrainingSSHelper(tf.contrib.seq2seq.Helper):

     def __init__(self, targets, inputs, dim, global_step, schedule_begin,
                  alpha, decay_steps):
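Note: unlike the function-level moves above, switching the base class from the imported Helper name to tf.contrib.seq2seq.Helper does not defer anything: a base-class expression is evaluated when the class statement executes, which is still module import time. The change only removes the separate import line. A small pure-Python demonstration of definition-time evaluation:

class Registry:
    class Base:
        pass


def trace(name):
    print(f'evaluating base class for {name}')
    return Registry.Base


# The base-class expression runs as soon as this statement executes,
# not when the first instance is created.
class MyHelper(trace('MyHelper')):  # prints immediately at definition time
    pass


print('class created; no instance yet')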
@@ -1,14 +1,11 @@
 import numpy as np
 import tensorflow as tf
-from tensorflow.contrib.rnn import RNNCell
-from tensorflow.contrib.seq2seq import AttentionWrapperState
-from tensorflow.python.ops import rnn_cell_impl

 from .am_models import prenet

-class VarPredictorCell(RNNCell):
-    '''Wrapper wrapper knock knock.'''
+class VarPredictorCell(tf.contrib.rnn.RNNCell):
+    """Wrapper wrapper knock knock."""

     def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
         super(VarPredictorCell, self).__init__()
@@ -33,7 +30,7 @@ class VarPredictorCell(RNNCell):
         ])

     def call(self, inputs, state):
-        '''Run the Tacotron2 super decoder cell.'''
+        """Run the Tacotron2 super decoder cell."""
         super_cell_out, decoder_state = state
         # split
@@ -61,8 +58,8 @@ class VarPredictorCell(RNNCell):
         return new_super_cell_out, new_states

-class DurPredictorCell(RNNCell):
-    '''Wrapper wrapper knock knock.'''
+class DurPredictorCell(tf.contrib.rnn.RNNCell):
+    """Wrapper wrapper knock knock."""

     def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
         super(DurPredictorCell, self).__init__()
@@ -87,7 +84,7 @@ class DurPredictorCell(RNNCell):
         ])

     def call(self, inputs, state):
-        '''Run the Tacotron2 super decoder cell.'''
+        """Run the Tacotron2 super decoder cell."""
         super_cell_out, decoder_state = state
         # split
@@ -117,8 +114,8 @@ class DurPredictorCell(RNNCell):
         return new_super_cell_out, new_states

-class DurPredictorCECell(RNNCell):
-    '''Wrapper wrapper knock knock.'''
+class DurPredictorCECell(tf.contrib.rnn.RNNCell):
+    """Wrapper wrapper knock knock."""

     def __init__(self, var_predictor_cell, is_training, dim, prenet_units,
                  max_dur, dur_embedding_dim):
@@ -146,7 +143,7 @@ class DurPredictorCECell(RNNCell):
         ])

     def call(self, inputs, state):
-        '''Run the Tacotron2 super decoder cell.'''
+        """Run the Tacotron2 super decoder cell."""
         super_cell_out, decoder_state = state
         # split
@@ -181,8 +178,8 @@ class DurPredictorCECell(RNNCell):
         return new_super_cell_out, new_states

-class VarPredictorCell2(RNNCell):
-    '''Wrapper wrapper knock knock.'''
+class VarPredictorCell2(tf.contrib.rnn.RNNCell):
+    """Wrapper wrapper knock knock."""

     def __init__(self, var_predictor_cell, is_training, dim, prenet_units):
         super(VarPredictorCell2, self).__init__()
@@ -1,14 +1,8 @@
 import tensorflow as tf
-from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
-from tensorflow.contrib.seq2seq import BasicDecoder
-from tensorflow.python.ops.ragged.ragged_util import repeat

 from .am_models import conv_prenet, decoder_prenet, encoder_prenet
 from .fsmn_encoder import FsmnEncoderV2
-from .helpers import VarTestHelper, VarTrainingHelper
-from .position import (BatchSinusodalPositionalEncoding,
-                       SinusodalPositionalEncoding)
-from .rnn_wrappers import DurPredictorCell, VarPredictorCell
+from .position import BatchSinusodalPositionalEncoding
 from .self_attention_decoder import SelfAttentionDecoder
 from .self_attention_encoder import SelfAttentionEncoder
@@ -32,7 +26,7 @@ class RobuTrans():
                  duration_scales=None,
                  energy_contours=None,
                  energy_scales=None):
-        '''Initializes the model for inference.
+        """Initializes the model for inference.

         Sets "mel_outputs", "linear_outputs", "stop_token_outputs", and "alignments" fields.
@@ -46,7 +40,10 @@ class RobuTrans():
             mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
                 of steps in the output time series, M is num_mels, and values are entries in the mel
                 spectrogram. Only needed for training.
-        '''
+        """
+        from tensorflow.contrib.rnn import LSTMBlockCell, MultiRNNCell
+        from tensorflow.contrib.seq2seq import BasicDecoder
         with tf.variable_scope('inference') as _:
             is_training = mel_targets is not None
             batch_size = tf.shape(inputs)[0]
@@ -229,17 +226,20 @@ class RobuTrans():
                     LSTMBlockCell(hp.predictor_lstm_units),
                     LSTMBlockCell(hp.predictor_lstm_units)
                 ], state_is_tuple=True)  # yapf:disable
+                from .rnn_wrappers import DurPredictorCell
                 duration_output_cell = DurPredictorCell(
                     duration_predictor_cell, is_training, 1,
                     hp.predictor_prenet_units)
                 duration_predictor_init_state = duration_output_cell.zero_state(
                     batch_size=batch_size, dtype=tf.float32)
                 if is_training:
+                    from .helpers import VarTrainingHelper
                     duration_helper = VarTrainingHelper(
                         tf.expand_dims(
                             tf.log(tf.cast(durations, tf.float32) + 1),
                             axis=2), dur_inputs, 1)
                 else:
+                    from .helpers import VarTestHelper
                     duration_helper = VarTestHelper(batch_size, dur_inputs, 1)
                 (
                     duration_outputs, _
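Note: in RobuTrans the tf.contrib and helper imports move inside initialize(), including into the is_training branches. One practical effect (an assumed motivation; the PR does not state it): the module itself becomes importable under TensorFlow 2.x, where tensorflow.contrib no longer exists, and only the TF1-only code path fails at call time. A hedged sketch of turning that deferred failure into an actionable error (the try/except wrapper is not in the PR):

def conv_and_lstm(inputs):
    # tensorflow.contrib exists only in TF 1.x. With the import deferred,
    # this module still imports cleanly under TF 2.x; only calling this
    # function fails, and the failure can carry a clear message.
    try:
        from tensorflow.contrib.rnn import LSTMBlockCell
    except ImportError as e:
        raise RuntimeError(
            'conv_and_lstm requires TensorFlow 1.x '
            '(tensorflow.contrib was removed in 2.x)') from e
    # ... build the conv + LSTM stack with LSTMBlockCell ...
    return LSTMBlockCell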
@@ -1,14 +1,10 @@
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)

 import io
 import os
 import time
 import zipfile
 from typing import Any, Dict, Optional, Union

 import json
-import numpy as np
-import torch

 from modelscope.metainfo import Models
 from modelscope.models.base import Model
@@ -16,8 +12,8 @@ from modelscope.models.builder import MODELS
 from modelscope.utils.audio.tts_exceptions import (
     TtsFrontendInitializeFailedException,
     TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationExcetion,
-    TtsVocoderMelspecShapeMismatchException, TtsVoiceNotExistsException)
-from modelscope.utils.constant import ModelFile, Tasks
+    TtsVoiceNotExistsException)
+from modelscope.utils.constant import Tasks
 from .voice import Voice

 import tensorflow as tf  # isort:skip
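Note on the trailing # isort:skip: it tells isort to leave that line where it is, so tensorflow stays at the end of the import block even after re-sorting. A common motive for pinning import position this way is to control native-library load order (for example, making sure torch loads before tensorflow); the PR does not state its reason, so treat that as an assumption. Usage sketch:

import torch

# The comment below pins this line: isort leaves `isort:skip` lines untouched.
import tensorflow as tf  # isort:skip

print(torch.__version__, tf.__version__)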
@@ -23,8 +23,8 @@ logger = get_logger()

 class Pipeline(ABC):

     def initiate_single_model(self, model):
-        logger.info(f'initiate model from {model}')
         if isinstance(model, str) and is_official_hub_path(model):
+            logger.info(f'initiate model from location {model}.')
             # expecting model has been prefetched to local cache beforehand
             return Model.from_pretrained(
                 model, model_prefetched=True) if is_model(model) else model
@@ -1,11 +1,9 @@
 import os.path as osp
 from typing import Any, Dict

-import decord
 import numpy as np
 import torch
 import torchvision.transforms.functional as TF
-from decord import VideoReader, cpu
 from PIL import Image

 from modelscope.metainfo import Pipelines
@@ -49,6 +47,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline):
         logger.info('load model done')

     def preprocess(self, input: Input) -> Dict[str, Any]:
+        import decord
         decord.bridge.set_bridge('native')
         transforms = VCompose([
@@ -60,7 +59,7 @@ class CMDSSLVideoEmbeddingPipeline(Pipeline):
         clip_len = (self.cfg.DATA.video_frames
                     - 1) * self.cfg.DATA.video_stride + 1
-        vr = VideoReader(input, ctx=cpu(0))
+        vr = decord.VideoReader(input, ctx=decord.cpu(0))
         if len(vr) <= clip_len:
             init_frames = np.zeros(self.cfg.DATA.multi_crop, dtype=int)
         else:
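Note: decord becomes a call-time dependency of preprocess() rather than an import-time dependency of the pipeline module, and the reader and context are reached through the module (decord.VideoReader, decord.cpu), so no per-name imports are needed. A minimal standalone sketch of the same pattern (the uniform sampling below is illustrative; the pipeline's actual clip logic differs):

import numpy as np


def sample_frames(path: str, num_frames: int = 8) -> np.ndarray:
    """Decode a video and return uniformly sampled frames."""
    # Lazy import: the module containing this function loads even when
    # decord is not installed; only calling it requires the package.
    import decord
    decord.bridge.set_bridge('native')  # return decord NDArrays
    vr = decord.VideoReader(path, ctx=decord.cpu(0))
    idx = np.linspace(0, len(vr) - 1, num_frames).astype(int)
    return vr.get_batch(idx).asnumpy()  # (num_frames, H, W, 3) uint8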
@@ -16,11 +16,6 @@ from ..base import Pipeline
 from ..builder import PIPELINES
 from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils

-if tf.__version__ >= '2.0':
-    import tf_slim as slim
-else:
-    from tensorflow.contrib import slim
-
 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
     tf.compat.v1.disable_eager_execution()
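Note: the version-gated slim import is dropped, while the TF1-compatibility shim stays: under TF 2.x the module rebinds tf to tf.compat.v1 and disables eager execution so the graph-mode OCR code keeps running. (Worth noting that tf.__version__ >= '2.0' is a lexicographic string comparison; it distinguishes 1.x from 2.x but would misclassify a hypothetical 10.x release.) A standalone sketch of the shim with a tiny graph, runnable under either major version:

import tensorflow as tf

# Under TF 2.x, fall back to the v1 graph-mode API.
if tf.__version__ >= '2.0':
    tf = tf.compat.v1
    tf.disable_eager_execution()

g = tf.Graph()
with g.as_default():
    x = tf.placeholder(tf.float32, shape=[None, 1], name='x')
    y = x * 2.0

with tf.Session(graph=g) as sess:
    print(sess.run(y, feed_dict={x: [[1.0], [2.0]]}))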
@@ -1,15 +1,11 @@
 import math
 import os
 import random

-import decord
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.utils.data
 import torch.utils.dlpack as dlpack
 import torchvision.transforms._transforms_video as transforms
-from decord import VideoReader
 from torchvision.transforms import Compose
@@ -128,6 +124,7 @@ def _decode_video(cfg, path):

     Returns:
         frames (Tensor): video tensor data
     """
+    from decord import VideoReader
     vr = VideoReader(path)

     num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS
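Note: _decode_video samples cfg.TEST.NUM_ENSEMBLE_VIEWS clips per video for multi-view test-time ensembling. A small illustrative helper for the start-index arithmetic (the exact sampling scheme in the file may differ):

import numpy as np


def clip_start_indices(video_len: int, clip_len: int,
                       num_clips: int) -> np.ndarray:
    """Evenly spaced clip starts for multi-view ensemble testing."""
    if video_len <= clip_len:
        # Degenerate case: the video is shorter than one clip,
        # so every view starts at frame 0.
        return np.zeros(num_clips, dtype=int)
    return np.linspace(0, video_len - clip_len, num_clips).astype(int)


print(clip_start_indices(video_len=300, clip_len=64, num_clips=10))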