unfinished

3 years ago · 31c774936b
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -46,6 +46,10 @@ class Pipelines(object):
    word_segmentation = 'word-segmentation'
    text_generation = 'text-generation'
    sentiment_analysis = 'sentiment-analysis'
    sentiment_classification = "sentiment-classification"
    zero_shot_classification = "zero-shot-classification"
    fill_mask = "fill-mask"
    nli = "nli"

    # audio tasks
    sambert_hifigan_16k_tts = 'sambert-hifigan-16k-tts'
@@ -85,10 +89,10 @@ class Preprocessors(object):
    # nlp preprocessor
    bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
    palm_text_gen_tokenizer = 'palm-text-gen-tokenizer'
    sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer'
    sbert_nli_tokenizer = 'sbert-nli-tokenizer'
    sbert_sen_cls_tokenizer = 'sbert-sen-cls-tokenizer'
    sbert_zero_shot_cls_tokenizer = 'sbert-zero-shot-cls-tokenizer'
    token_cls_tokenizer = 'token-cls-tokenizer'
    nli_tokenizer = 'nli-tokenizer'
    sen_cls_tokenizer = 'sen-cls-tokenizer'
    zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer'

    # audio preprocessor
    linear_aec_fbank = 'linear-aec-fbank'
--- a/modelscope/models/nlp/masked_language_model.py
+++ b/modelscope/models/nlp/masked_language_model.py
@@ -19,6 +19,12 @@ class MaskedLMModelBase(Model):
    def build_model(self):
        raise NotImplementedError()

    @property
    def config(self):
        """Configuration object of the wrapped model, or ``None``.

        Returns ``self.model.config`` when the wrapped model exposes a
        ``config`` attribute; otherwise returns ``None``.
        """
        return getattr(self.model, "config", None)

    def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, np.ndarray]:
        """return the result by the model

--- a/modelscope/models/nlp/sbert_for_nli.py
+++ b/modelscope/models/nlp/sbert_for_nli.py
@@ -1,4 +1,4 @@
 from modelscope.utils.constant import Tasks
 from ...utils.constant import Tasks
 from .sbert_for_sequence_classification import SbertForSequenceClassificationBase
 from ..builder import MODELS
 from ...metainfo import Models
--- a/modelscope/models/nlp/sbert_for_token_classification.py
+++ b/modelscope/models/nlp/sbert_for_token_classification.py
@@ -2,18 +2,17 @@ from typing import Any, Dict, Union

 import numpy as np
 import torch
 from sofa import SbertConfig, SbertForTokenClassification

 from modelscope.metainfo import Models
 from modelscope.utils.constant import Tasks
 from ..base import Model, Tensor
 from ..builder import MODELS

 __all__ = ['StructBertForTokenClassification']
 __all__ = ['SbertForTokenClassification']


@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert)
 class StructBertForTokenClassification(Model):
 class SbertForTokenClassification(Model):

    def __init__(self, model_dir: str, *args, **kwargs):
        """initialize the word segmentation model from the `model_dir` path.
@@ -25,6 +24,7 @@ class StructBertForTokenClassification(Model):
        """
        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir
        from sofa import SbertConfig, SbertForTokenClassification
        self.model = SbertForTokenClassification.from_pretrained(
            self.model_dir)
        self.config = SbertConfig.from_pretrained(self.model_dir)
--- a/modelscope/pipelines/nlp/fill_mask_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py
@@ -1,38 +1,41 @@
 from typing import Dict, Optional, Union

 from modelscope.models import Model
 from modelscope.models.nlp.masked_language_model import \
    AliceMindBaseForMaskedLM
 from modelscope.preprocessors import FillMaskPreprocessor
 from modelscope.utils.constant import Tasks
 from ...models import Model
 from ...models.nlp.masked_language_model import \
    MaskedLMModelBase
 from ...preprocessors import FillMaskPreprocessor
 from ...utils.constant import Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
 from ...metainfo import Pipelines

 __all__ = ['FillMaskPipeline']


@PIPELINES.register_module(Tasks.fill_mask, module_name=r'sbert')
@PIPELINES.register_module(Tasks.fill_mask, module_name=r'veco')
@PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask)
 class FillMaskPipeline(Pipeline):

    def __init__(self,
                 model: Union[AliceMindBaseForMaskedLM, str],
                 model: Union[MaskedLMModelBase, str],
                 preprocessor: Optional[FillMaskPreprocessor] = None,
                 first_sequence="sentense",
                 **kwargs):
        """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction

        Args:
            model (AliceMindBaseForMaskedLM): a model instance
            model (MaskedLMModelBase): a model instance
            preprocessor (FillMaskPreprocessor): a preprocessor instance
        """
        fill_mask_model = model if isinstance(
            model, AliceMindBaseForMaskedLM) else Model.from_pretrained(model)
            model, MaskedLMModelBase) else Model.from_pretrained(model)
        assert fill_mask_model.config is not None

        if preprocessor is None:
            preprocessor = FillMaskPreprocessor(
                fill_mask_model.model_dir,
                first_sequence='sentence',
                first_sequence=first_sequence,
                second_sequence=None)
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        super().__init__(model=fill_mask_model, preprocessor=preprocessor, **kwargs)
        self.preprocessor = preprocessor
        self.tokenizer = preprocessor.tokenizer
        self.mask_id = {'veco': 250001, 'sbert': 103}
@@ -82,6 +85,7 @@ class FillMaskPipeline(Pipeline):

        pred_strings = []
        for ids in rst_ids:  # batch
            # TODO vocab size is not stable
            if self.model.config.vocab_size == 21128:  # zh bert
                pred_string = self.tokenizer.convert_ids_to_tokens(ids)
                pred_string = ''.join(pred_string)
--- a/modelscope/pipelines/nlp/nli_pipeline.py
+++ b/modelscope/pipelines/nlp/nli_pipeline.py
@@ -1,27 +1,31 @@
 import os
 import uuid
 from typing import Any, Dict, Union

 import json
 import uuid
 from typing import Any, Dict, Union

 import numpy as np

 from modelscope.models.nlp import SbertForNLI
 from modelscope.preprocessors import NLIPreprocessor
 from modelscope.utils.constant import Tasks
 from ...models import Model
 from ..base import Input, Pipeline
 from ..base import Pipeline
 from ..builder import PIPELINES
 from ...metainfo import Pipelines
 from ...models import Model
 from ...models.nlp import SbertForNLI
 from ...preprocessors import NLIPreprocessor
 from ...utils.constant import Tasks

 __all__ = ['NLIPipeline']


@PIPELINES.register_module(
    Tasks.nli, module_name=r'nlp_structbert_nli_chinese-base')
    Tasks.nli, module_name=Pipelines.nli)
 class NLIPipeline(Pipeline):

    def __init__(self,
                 model: Union[SbertForNLI, str],
                 preprocessor: NLIPreprocessor = None,
                 first_sequence="first_sequence",
                 second_sequence="second_sequence",
                 **kwargs):
        """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

@@ -36,20 +40,12 @@ class NLIPipeline(Pipeline):
        if preprocessor is None:
            preprocessor = NLIPreprocessor(
                sc_model.model_dir,
                first_sequence='first_sequence',
                second_sequence='second_sequence')
                first_sequence=first_sequence,
                second_sequence=second_sequence)
        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
        assert len(sc_model.id2label) > 0

        self.label_path = os.path.join(sc_model.model_dir,
                                       'label_mapping.json')
        with open(self.label_path) as f:
            self.label_mapping = json.load(f)
        self.label_id_to_name = {
            idx: name
            for name, idx in self.label_mapping.items()
        }

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
    def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
        """process the prediction results

        Args:
--- a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py
+++ b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py
@@ -20,6 +20,8 @@ class SentenceSimilarityPipeline(Pipeline):
    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: SequenceClassificationPreprocessor = None,
                 first_sequence="first_sequence",
                 second_sequence="second_sequence",
                 **kwargs):
        """use `model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction

@@ -35,14 +37,14 @@ class SentenceSimilarityPipeline(Pipeline):
        if preprocessor is None:
            preprocessor = SequenceClassificationPreprocessor(
                sc_model.model_dir,
                first_sequence='first_sequence',
                second_sequence='second_sequence')
                first_sequence=first_sequence,
                second_sequence=second_sequence)
        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

        assert hasattr(self.model, 'id2label'), \
            'id2label map should be initalizaed in init function.'

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
    def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
        """process the prediction results

        Args:
--- a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/sentiment_classification_pipeline.py
@@ -5,24 +5,27 @@ from typing import Any, Dict, Union
 import json
 import numpy as np

 from modelscope.models.nlp import SbertForSentimentClassification
 from modelscope.preprocessors import SentimentClassificationPreprocessor
 from modelscope.utils.constant import Tasks
 from ...models.nlp import SbertForSentimentClassification
 from ...preprocessors import SentimentClassificationPreprocessor
 from ...utils.constant import Tasks
 from ...models import Model
 from ..base import Input, Pipeline
 from ..builder import PIPELINES
 from ...metainfo import Pipelines

 __all__ = ['SentimentClassificationPipeline']


@PIPELINES.register_module(
    Tasks.sentiment_classification,
    module_name=r'sbert-sentiment-classification')
    module_name=Pipelines.sentiment_classification)
 class SentimentClassificationPipeline(Pipeline):

    def __init__(self,
                 model: Union[SbertForSentimentClassification, str],
                 preprocessor: SentimentClassificationPreprocessor = None,
                 first_sequence="first_sequence",
                 second_sequence="second_sequence",
                 **kwargs):
        """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

@@ -38,20 +41,12 @@ class SentimentClassificationPipeline(Pipeline):
        if preprocessor is None:
            preprocessor = SentimentClassificationPreprocessor(
                sc_model.model_dir,
                first_sequence='first_sequence',
                second_sequence='second_sequence')
                first_sequence=first_sequence,
                second_sequence=second_sequence)
        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
        assert len(sc_model.id2label) > 0

        self.label_path = os.path.join(sc_model.model_dir,
                                       'label_mapping.json')
        with open(self.label_path) as f:
            self.label_mapping = json.load(f)
        self.label_id_to_name = {
            idx: name
            for name, idx in self.label_mapping.items()
        }

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
    def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
        """process the prediction results

        Args:
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -1,10 +1,10 @@
 from typing import Dict, Optional, Union

 from modelscope.metainfo import Pipelines
 from modelscope.models import Model
 from modelscope.models.nlp import PalmForTextGeneration
 from modelscope.preprocessors import TextGenerationPreprocessor
 from modelscope.utils.constant import Tasks
 from ...metainfo import Pipelines
 from ...models import Model
 from ...models.nlp import PalmForTextGeneration
 from ...preprocessors import TextGenerationPreprocessor
 from ...utils.constant import Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES

@@ -36,7 +36,7 @@ class TextGenerationPipeline(Pipeline):
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.tokenizer = model.tokenizer

    def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
    def postprocess(self, inputs: Dict[str, Tensor], **postprocess_params) -> Dict[str, str]:
        """process the prediction results

        Args:
--- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py
+++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
@@ -1,10 +1,10 @@
 from typing import Any, Dict, Optional, Union

 from modelscope.metainfo import Pipelines
 from modelscope.models import Model
 from modelscope.models.nlp import StructBertForTokenClassification
 from modelscope.preprocessors import TokenClassifcationPreprocessor
 from modelscope.utils.constant import Tasks
 from ...metainfo import Pipelines
 from ...models import Model
 from ...models.nlp import SbertForTokenClassification
 from ...preprocessors import TokenClassifcationPreprocessor
 from ...utils.constant import Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES

@@ -16,7 +16,7 @@ __all__ = ['WordSegmentationPipeline']
 class WordSegmentationPipeline(Pipeline):

    def __init__(self,
                 model: Union[StructBertForTokenClassification, str],
                 model: Union[SbertForTokenClassification, str],
                 preprocessor: Optional[TokenClassifcationPreprocessor] = None,
                 **kwargs):
        """use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction
@@ -27,15 +27,16 @@ class WordSegmentationPipeline(Pipeline):
        """
        model = model if isinstance(
            model,
            StructBertForTokenClassification) else Model.from_pretrained(model)
            SbertForTokenClassification) else Model.from_pretrained(model)
        if preprocessor is None:
            preprocessor = TokenClassifcationPreprocessor(model.model_dir)
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.tokenizer = preprocessor.tokenizer
        self.config = model.config
        assert len(self.config.id2label) > 0
        self.id2label = self.config.id2label

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
    def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
        """process the prediction results

        Args:
--- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
@@ -6,10 +6,11 @@ import json
 import numpy as np
 from scipy.special import softmax

 from modelscope.models.nlp import SbertForZeroShotClassification
 from modelscope.preprocessors import SbertZeroShotClassificationPreprocessor
 from modelscope.utils.constant import Tasks
 from ...models.nlp import SbertForZeroShotClassification
 from ...preprocessors import ZeroShotClassificationPreprocessor
 from ...utils.constant import Tasks
 from ...models import Model
 from ...metainfo import Pipelines
 from ..base import Input, Pipeline
 from ..builder import PIPELINES

@@ -18,12 +19,12 @@ __all__ = ['ZeroShotClassificationPipeline']

@PIPELINES.register_module(
    Tasks.zero_shot_classification,
    module_name=r'bert-zero-shot-classification')
    module_name=Pipelines.zero_shot_classification)
 class ZeroShotClassificationPipeline(Pipeline):

    def __init__(self,
                 model: Union[SbertForZeroShotClassification, str],
                 preprocessor: SbertZeroShotClassificationPreprocessor = None,
                 preprocessor: ZeroShotClassificationPreprocessor = None,
                 **kwargs):
        """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

@@ -32,7 +33,7 @@ class ZeroShotClassificationPipeline(Pipeline):
            preprocessor (SentimentClassificationPreprocessor): a preprocessor instance
        """
        assert isinstance(model, str) or isinstance(model, SbertForZeroShotClassification), \
            'model must be a single str or BertForZeroShotClassification'
            'model must be a single str or SbertForZeroShotClassification'
        sc_model = model if isinstance(
            model,
            SbertForZeroShotClassification) else Model.from_pretrained(model)
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -14,9 +14,9 @@ from .builder import PREPROCESSORS

 __all__ = [
    'Tokenize', 'SequenceClassificationPreprocessor',
    'PalmTextGenerationPreprocessor', 'SbertZeroShotClassificationPreprocessor',
    'SbertTokenClassifcationPreprocessor', 'SbertNLIPreprocessor',
    'SbertSentimentClassificationPreprocessor', 'FillMaskPreprocessor'
    'TextGenerationPreprocessor', 'ZeroShotClassificationPreprocessor',
    'TokenClassifcationPreprocessor', 'NLIPreprocessor',
    'SentimentClassificationPreprocessor', 'FillMaskPreprocessor'
 ]


@@ -35,8 +35,8 @@ class Tokenize(Preprocessor):


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.sbert_nli_tokenizer)
 class SbertNLIPreprocessor(Preprocessor):
    Fields.nlp, module_name=Preprocessors.nli_tokenizer)
 class NLIPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """preprocess the data via the vocab.txt from the `model_dir` path
@@ -105,8 +105,8 @@ class SbertNLIPreprocessor(Preprocessor):


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.sbert_sen_cls_tokenizer)
 class SbertSentimentClassificationPreprocessor(Preprocessor):
    Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer)
 class SentimentClassificationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """preprocess the data via the vocab.txt from the `model_dir` path
@@ -264,7 +264,7 @@ class SequenceClassificationPreprocessor(Preprocessor):

@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer)
 class PalmTextGenerationPreprocessor(Preprocessor):
 class TextGenerationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
        """preprocess the data using the vocab.txt from the `model_dir` path
@@ -374,8 +374,8 @@ class FillMaskPreprocessor(Preprocessor):


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.sbert_zero_shot_cls_tokenizer)
 class SbertZeroShotClassificationPreprocessor(Preprocessor):
    Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer)
 class ZeroShotClassificationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """preprocess the data via the vocab.txt from the `model_dir` path
@@ -418,8 +418,8 @@ class SbertZeroShotClassificationPreprocessor(Preprocessor):


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.sbert_token_cls_tokenizer)
 class SbertTokenClassifcationPreprocessor(Preprocessor):
    Fields.nlp, module_name=Preprocessors.token_cls_tokenizer)
 class TokenClassifcationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """preprocess the data via the vocab.txt from the `model_dir` path