# Copyright (c) Alibaba, Inc. and its affiliates.

import os
import os.path as osp
from typing import Any, Dict, Optional, Union

import numpy as np
import torch

from modelscope.metainfo import Preprocessors
from modelscope.preprocessors.base import Preprocessor
from modelscope.preprocessors.builder import PREPROCESSORS
from modelscope.utils.constant import Fields, ModeKeys
from modelscope.utils.hub import get_model_type
from modelscope.utils.logger import get_logger
from .transformers_tokenizer import NLPTokenizer
from .utils import parse_text_and_label

logger = get_logger(__name__)

class TextGenerationPreprocessorBase(Preprocessor):

    def __init__(self,
                 mode: str = ModeKeys.INFERENCE,
                 src_txt='src_txt',
                 tgt_txt='tgt_txt'):
        """The base class for all text generation preprocessors.

        Args:
            mode: The preprocessor mode.
            src_txt: The key of the source text.
            tgt_txt: The key of the target text.
        """
        super().__init__(mode)
        self.src_txt = src_txt
        self.tgt_txt = tgt_txt

    def _tokenize_text(self, sequence1, sequence2=None, **kwargs):
        """Tokenize the text.

        Args:
            sequence1: The first sequence.
            sequence2: The second sequence, which may be None.

        Returns:
            The encoded sequence.
        """
        raise NotImplementedError()

    def __call__(self, data: Union[Dict, str], **kwargs) -> Dict[str, Any]:
        """Parse the input into source/target texts, tokenize them, and
        convert any list values in the result to numpy arrays.
        """
        text_a, text_b = parse_text_and_label(data, self.mode, self.src_txt,
                                              self.tgt_txt)[0:2]

        output = self._tokenize_text(text_a, text_b, **kwargs)
        output = {
            k: np.array(v) if isinstance(v, list) else v
            for k, v in output.items()
        }
        return output

    def decode(self, tokens, **kwargs):
        """Decode the tokens back to text.

        Args:
            tokens: The output tokens from the model's `forward` or
                `generate` method.

        Returns:
            The decoded text.
        """
        raise NotImplementedError()


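# A minimal sketch of a concrete subclass (illustrative only, not part of the
# library): a whitespace "tokenizer" that maps each token to its index in a
# toy vocabulary built on the fly. It shows the contract a subclass has to
# fulfill: implement `_tokenize_text` and `decode`.
#
#     class WhitespacePreprocessor(TextGenerationPreprocessorBase):
#
#         def __init__(self, **kwargs):
#             super().__init__(**kwargs)
#             self.vocab = {}
#
#         def _tokenize_text(self, sequence1, sequence2=None, **kwargs):
#             ids = [
#                 self.vocab.setdefault(tok, len(self.vocab))
#                 for tok in sequence1.split()
#             ]
#             return {'input_ids': ids}
#
#         def decode(self, tokens, **kwargs):
#             rev = {i: tok for tok, i in self.vocab.items()}
#             return ' '.join(rev[int(t)] for t in tokens)

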
class NLPTokenizerForRoberta(NLPTokenizer):
    """An NLPTokenizer subclass that prefers a RoBERTa tokenizer shipped in a
    sub-directory of the model dir, falling back to the default tokenizer.
    """

    def build_tokenizer(self):

        def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]:
            for name in os.listdir(model_dir):
                full_name = os.path.join(model_dir, name)
                if 'roberta' in name and os.path.isdir(full_name):
                    return full_name
            return None

        roberta_tokenizer_dir = get_roberta_tokenizer_dir(self.model_dir)
        if roberta_tokenizer_dir:
            from transformers import RobertaTokenizer
            return RobertaTokenizer.from_pretrained(
                roberta_tokenizer_dir, do_lower_case=False)
        return super().build_tokenizer()


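# For example, a model directory laid out as below (the sub-directory name is
# hypothetical; any sub-directory with 'roberta' in its name is picked up)
# would make build_tokenizer load the bundled RoBERTa tokenizer files:
#
#     model_dir/
#         config.json
#         pytorch_model.bin
#         roberta_tokenizer/
#             vocab.json
#             merges.txt
#             tokenizer_config.json

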
@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.text_gen_tokenizer)
class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):

    def __init__(self,
                 model_dir: str,
                 tokenizer=None,
                 mode: str = ModeKeys.INFERENCE,
                 src_txt='src_txt',
                 tgt_txt='tgt_txt',
                 sequence_length: int = 128,
                 use_fast: Optional[bool] = None,
                 **kwargs):
        """The tokenizer preprocessor used in text generation.

        Args:
            model_dir: The model dir used to initialize the tokenizer.
            mode: The mode for the preprocessor.
            src_txt: The key of the source sentence.
            tgt_txt: The key of the generated sentence.
            sequence_length: The max sequence length the model supports,
                passed to the tokenizer as its 'max_length' param.
            use_fast: Whether to use the fast tokenizer.
            **kwargs: Extra args passed to the tokenizer's __call__ method.
        """
        if 'first_sequence' in kwargs:
            src_txt = kwargs.pop('first_sequence')
        super().__init__(mode, src_txt, tgt_txt)
        kwargs['truncation'] = kwargs.get('truncation', True)
        kwargs['padding'] = kwargs.get('padding', 'max_length')
        kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids',
                                                     False)
        kwargs['max_length'] = sequence_length
        self.src_length = kwargs['max_length']
        self.tgt_length = kwargs.pop('target_max_length', kwargs['max_length'])
        model_type = None
        if model_dir is not None:
            model_type = get_model_type(model_dir)
        self.nlp_tokenizer = NLPTokenizerForRoberta(
            model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs)

    def decode(self, tokens, **kwargs):
        """Decode the tokens back to text.

        Args:
            tokens: The output tokens from the model's `forward` or
                `generate` method.

        Returns:
            The decoded text.
        """
        return self.nlp_tokenizer.tokenizer.decode(tokens, **kwargs)

    def _tokenize_text(self, sequence1, sequence2=None, **kwargs):
        """Tokenize the text.

        Args:
            sequence1: The first sequence.
            sequence2: The second sequence, which may be None.

        Returns:
            The encoded sequence.
        """
        if 'return_tensors' not in kwargs:
            kwargs['return_tensors'] = (
                'pt' if self.mode == ModeKeys.INFERENCE else None)

        output = self.nlp_tokenizer(sequence1, **kwargs)
        if self.mode != ModeKeys.INFERENCE:
            if sequence2 is not None:
                # Tokenize the target with its own max length, then restore
                # the source max length for subsequent calls.
                self.nlp_tokenizer.tokenize_kwargs[
                    'max_length'] = self.tgt_length
                labels = self.nlp_tokenizer(sequence2)['input_ids']
                self.nlp_tokenizer.tokenize_kwargs[
                    'max_length'] = self.src_length

                src_input_ids = output['input_ids']
                src_attention_mask = output['attention_mask']
            else:
                # No target text: shift the source by one token to build
                # language-modeling labels.
                labels = output['input_ids'][1:]
                src_input_ids = output['input_ids'][:-1]
                src_attention_mask = output['attention_mask'][:-1]

            output = {
                'input_ids': src_input_ids,
                'attention_mask': src_attention_mask,
                'labels': labels,
            }
        return output


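# Usage sketch (illustrative only; the directory path is hypothetical and
# should point to a local model dir with a Hugging Face-style config):
#
#     preprocessor = TextGenerationTransformersPreprocessor(
#         '/path/to/model_dir', mode=ModeKeys.INFERENCE, sequence_length=64)
#     inputs = preprocessor('Hello world')
#     # inputs['input_ids'] and inputs['attention_mask'] are torch tensors
#     # of shape [1, 64] (padded to 'max_length').
#     text = preprocessor.decode(
#         inputs['input_ids'][0], skip_special_tokens=True)

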
@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer)
class TextGenerationJiebaPreprocessor(TextGenerationPreprocessorBase):
    """The jieba tokenizer preprocessor used in text generation.
    """

    def __init__(self,
                 model_dir: str,
                 mode: str = ModeKeys.INFERENCE,
                 src_txt='src_txt',
                 tgt_txt=None):
        from modelscope.models.nlp.gpt3 import JiebaBPETokenizer
        super().__init__(mode, src_txt, tgt_txt)
        if self.tgt_txt is not None:
            logger.warning(
                f'TextGenerationJiebaPreprocessor does not support training '
                f'yet, so the tgt_txt field ("{self.tgt_txt}") will be '
                f'ignored.')
        self.tokenizer = JiebaBPETokenizer(
            osp.join(model_dir, 'tokenizer.json'))

    def decode(self, tokens, **kwargs):
        """Decode the tokens back to text.

        Args:
            tokens: The output tokens from the model's `forward` or
                `generate` method.

        Returns:
            The decoded text.
        """
        return self.tokenizer.detokenize(tokens)

    def _tokenize_text(self, sequence1, sequence2=None, **kwargs):
        """Tokenize the text.

        Args:
            sequence1: The first sequence.
            sequence2: The second sequence, which is ignored here because
                training is not supported yet.

        Returns:
            The encoded sequence.
        """
        return {
            'input_ids':
            torch.tensor(self.tokenizer.tokenize(sequence1)).unsqueeze(0)
        }


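# Usage sketch (illustrative only; assumes a model dir containing a
# 'tokenizer.json' in jieba-BPE format, e.g. a GPT-3 style model from
# ModelScope):
#
#     preprocessor = TextGenerationJiebaPreprocessor('/path/to/model_dir')
#     inputs = preprocessor('今天天气不错')
#     # inputs['input_ids'] is a torch tensor of shape [1, num_tokens]
#     text = preprocessor.decode(inputs['input_ids'][0].tolist())

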
@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor)
class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor):

    def __init__(self,
                 model_dir: str,
                 mode: str = ModeKeys.INFERENCE,
                 src_txt='src_txt',
                 tgt_txt='tgt_txt',
                 use_fast: Optional[bool] = None,
                 sequence_length: int = 128,
                 **kwargs):
        """The preprocessor for the text-to-text generation task, based on
        the transformers tokenizer.

        Args:
            model_dir: The model dir used to initialize the tokenizer.
            mode: The mode for the preprocessor.
            src_txt: The key of the source sentence.
            tgt_txt: The key of the target sentence.
            use_fast: Whether to use the fast tokenizer.
            sequence_length: The max sequence length the model supports,
                passed to the tokenizer as its 'max_length' param.
            **kwargs: Extra args passed to the tokenizer's __call__ method.
        """
        super().__init__(
            model_dir,
            mode=mode,
            src_txt=src_txt,
            tgt_txt=tgt_txt,
            sequence_length=sequence_length,
            use_fast=use_fast,
            truncation=kwargs.pop('truncation', True),
            padding=kwargs.pop('padding', 'max_length'),
            return_token_type_ids=kwargs.pop('return_token_type_ids', False),
            **kwargs)
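

# Usage sketch (illustrative only; the directory path is hypothetical and
# should point to a T5-style text2text model):
#
#     preprocessor = TextGenerationT5Preprocessor(
#         '/path/to/t5_model_dir', mode=ModeKeys.TRAIN)
#     features = preprocessor({
#         'src_txt': 'translate English to German: Hello',
#         'tgt_txt': 'Hallo'
#     })
#     # features contains 'input_ids', 'attention_mask' and 'labels'
#     # as numpy arrays, ready for a trainer.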