From 31c774936b329c64ba42685098424ae045619072 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?=
Date: Wed, 22 Jun 2022 19:05:37 +0800
Subject: [PATCH] unfinished

---
 modelscope/metainfo.py                         | 12 ++++---
 .../models/nlp/masked_language_model.py        |  6 ++++
 modelscope/models/nlp/sbert_for_nli.py         |  2 +-
 .../nlp/sbert_for_token_classification.py      |  6 ++--
 .../pipelines/nlp/fill_mask_pipeline.py        | 28 ++++++++-------
 modelscope/pipelines/nlp/nli_pipeline.py       | 36 +++++++++----------
 .../nlp/sentence_similarity_pipeline.py        |  8 +++--
 .../nlp/sentiment_classification_pipeline.py   | 27 ++++++--------
 .../pipelines/nlp/text_generation_pipeline.py  | 12 +++----
 .../nlp/word_segmentation_pipeline.py          | 17 ++++-----
 .../nlp/zero_shot_classification_pipeline.py   | 13 +++----
 modelscope/preprocessors/nlp.py                | 24 ++++++-------
 12 files changed, 100 insertions(+), 91 deletions(-)

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index be965aaa..a8677c16 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -46,6 +46,10 @@ class Pipelines(object):
     word_segmentation = 'word-segmentation'
     text_generation = 'text-generation'
     sentiment_analysis = 'sentiment-analysis'
+    sentiment_classification = 'sentiment-classification'
+    zero_shot_classification = 'zero-shot-classification'
+    fill_mask = 'fill-mask'
+    nli = 'nli'
 
     # audio tasks
     sambert_hifigan_16k_tts = 'sambert-hifigan-16k-tts'
@@ -85,10 +89,10 @@ class Preprocessors(object):
     # nlp preprocessor
     bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
     palm_text_gen_tokenizer = 'palm-text-gen-tokenizer'
-    sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer'
-    sbert_nli_tokenizer = 'sbert-nli-tokenizer'
-    sbert_sen_cls_tokenizer = 'sbert-sen-cls-tokenizer'
-    sbert_zero_shot_cls_tokenizer = 'sbert-zero-shot-cls-tokenizer'
+    token_cls_tokenizer = 'token-cls-tokenizer'
+    nli_tokenizer = 'nli-tokenizer'
+    sen_cls_tokenizer = 'sen-cls-tokenizer'
+    zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer'
 
     # audio preprocessor
     linear_aec_fbank = 'linear-aec-fbank'
diff --git a/modelscope/models/nlp/masked_language_model.py b/modelscope/models/nlp/masked_language_model.py
index fe3918aa..4138da94 100644
--- a/modelscope/models/nlp/masked_language_model.py
+++ b/modelscope/models/nlp/masked_language_model.py
@@ -19,6 +19,12 @@ class MaskedLMModelBase(Model):
     def build_model(self):
         raise NotImplementedError()
 
+    @property
+    def config(self):
+        if hasattr(self.model, 'config'):
+            return self.model.config
+        return None
+
     def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, np.ndarray]:
         """return the result by the model
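The config property added above lets callers probe the wrapped backbone's configuration without knowing whether the concrete subclass ever set one: a missing config yields None instead of an AttributeError. A minimal sketch of the same pattern, with DummyBackbone invented here purely for illustration:

from types import SimpleNamespace


class DummyBackbone:
    # stand-in for a sofa/transformers model that carries a config
    config = SimpleNamespace(vocab_size=21128)


class LazyConfigWrapper:
    # mirrors MaskedLMModelBase.config: forward the backbone config
    # when present, return None otherwise
    def __init__(self, model):
        self.model = model

    @property
    def config(self):
        if hasattr(self.model, 'config'):
            return self.model.config
        return None


assert LazyConfigWrapper(DummyBackbone()).config.vocab_size == 21128
assert LazyConfigWrapper(object()).config is None

This is what allows FillMaskPipeline below to assert fill_mask_model.config is not None at construction time rather than failing later at the first vocab_size lookup.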
diff --git a/modelscope/models/nlp/sbert_for_nli.py b/modelscope/models/nlp/sbert_for_nli.py
index 2e854317..e41bfb91 100644
--- a/modelscope/models/nlp/sbert_for_nli.py
+++ b/modelscope/models/nlp/sbert_for_nli.py
@@ -1,4 +1,4 @@
-from modelscope.utils.constant import Tasks
+from ...utils.constant import Tasks
 from .sbert_for_sequence_classification import SbertForSequenceClassificationBase
 from ..builder import MODELS
 from ...metainfo import Models
diff --git a/modelscope/models/nlp/sbert_for_token_classification.py b/modelscope/models/nlp/sbert_for_token_classification.py
index 36cdf78c..1ec848fb 100644
--- a/modelscope/models/nlp/sbert_for_token_classification.py
+++ b/modelscope/models/nlp/sbert_for_token_classification.py
@@ -2,18 +2,17 @@ from typing import Any, Dict, Union
 
 import numpy as np
 import torch
-from sofa import SbertConfig, SbertForTokenClassification
 
 from modelscope.metainfo import Models
 from modelscope.utils.constant import Tasks
 from ..base import Model, Tensor
 from ..builder import MODELS
 
-__all__ = ['StructBertForTokenClassification']
+__all__ = ['SbertForTokenClassification']
 
 
 @MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert)
-class StructBertForTokenClassification(Model):
+class SbertForTokenClassification(Model):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """initialize the word segmentation model from the `model_dir` path.
@@ -25,6 +24,7 @@ class StructBertForTokenClassification(Model):
         """
         super().__init__(model_dir, *args, **kwargs)
         self.model_dir = model_dir
+        from sofa import SbertConfig, SbertForTokenClassification
        self.model = SbertForTokenClassification.from_pretrained(
             self.model_dir)
         self.config = SbertConfig.from_pretrained(self.model_dir)
diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py
index d7c1d456..ebf0e872 100644
--- a/modelscope/pipelines/nlp/fill_mask_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py
@@ -1,38 +1,41 @@
 from typing import Dict, Optional, Union
 
-from modelscope.models import Model
-from modelscope.models.nlp.masked_language_model import \
-    AliceMindBaseForMaskedLM
-from modelscope.preprocessors import FillMaskPreprocessor
-from modelscope.utils.constant import Tasks
+from ...models import Model
+from ...models.nlp.masked_language_model import \
+    MaskedLMModelBase
+from ...preprocessors import FillMaskPreprocessor
+from ...utils.constant import Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
+from ...metainfo import Pipelines
 
 __all__ = ['FillMaskPipeline']
 
 
-@PIPELINES.register_module(Tasks.fill_mask, module_name=r'sbert')
-@PIPELINES.register_module(Tasks.fill_mask, module_name=r'veco')
+@PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask)
class FillMaskPipeline(Pipeline):
 
     def __init__(self,
-                 model: Union[AliceMindBaseForMaskedLM, str],
+                 model: Union[MaskedLMModelBase, str],
                  preprocessor: Optional[FillMaskPreprocessor] = None,
+                 first_sequence='sentence',
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction
 
         Args:
-            model (AliceMindBaseForMaskedLM): a model instance
+            model (MaskedLMModelBase): a model instance
             preprocessor (FillMaskPreprocessor): a preprocessor instance
         """
         fill_mask_model = model if isinstance(
-            model, AliceMindBaseForMaskedLM) else Model.from_pretrained(model)
+            model, MaskedLMModelBase) else Model.from_pretrained(model)
+        assert fill_mask_model.config is not None
+
         if preprocessor is None:
             preprocessor = FillMaskPreprocessor(
                 fill_mask_model.model_dir,
-                first_sequence='sentence',
+                first_sequence=first_sequence,
                 second_sequence=None)
-        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        super().__init__(model=fill_mask_model, preprocessor=preprocessor, **kwargs)
         self.preprocessor = preprocessor
         self.tokenizer = preprocessor.tokenizer
         self.mask_id = {'veco': 250001, 'sbert': 103}
@@ -82,6 +85,7 @@ class FillMaskPipeline(Pipeline):
         pred_strings = []
         for ids in rst_ids:  # batch
+            # TODO: vocab size is not a stable way to tell models apart
             if self.model.config.vocab_size == 21128:  # zh bert
                 pred_string = self.tokenizer.convert_ids_to_tokens(ids)
                 pred_string = ''.join(pred_string)
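With registration keyed on Pipelines.fill_mask instead of the per-backbone names r'sbert' and r'veco', one registry entry now serves both model families. A usage sketch, assuming the pipeline factory forwards extra kwargs to the constructor; the model id below is a placeholder, not a guaranteed hub id:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Tasks.fill_mask resolves to FillMaskPipeline through the PIPELINES
# registry; the model id is illustrative only
fill_mask = pipeline(
    Tasks.fill_mask, model='damo/nlp_structbert_fill-mask_chinese-base')
print(fill_mask('ModelScope ships [MASK] pipelines for many NLP tasks.'))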
diff --git a/modelscope/pipelines/nlp/nli_pipeline.py b/modelscope/pipelines/nlp/nli_pipeline.py
index 135f826a..fbeb628d 100644
--- a/modelscope/pipelines/nlp/nli_pipeline.py
+++ b/modelscope/pipelines/nlp/nli_pipeline.py
@@ -1,27 +1,31 @@
-import os
-import uuid
-from typing import Any, Dict, Union
-import json
+import uuid
+from typing import Any, Dict, Union
+
 import numpy as np
 
-from modelscope.models.nlp import SbertForNLI
-from modelscope.preprocessors import NLIPreprocessor
-from modelscope.utils.constant import Tasks
-from ...models import Model
-from ..base import Input, Pipeline
+from ..base import Pipeline
 from ..builder import PIPELINES
+from ...metainfo import Pipelines
+from ...models import Model
+from ...models.nlp import SbertForNLI
+from ...preprocessors import NLIPreprocessor
+from ...utils.constant import Tasks
 
 __all__ = ['NLIPipeline']
 
 
 @PIPELINES.register_module(
-    Tasks.nli, module_name=r'nlp_structbert_nli_chinese-base')
+    Tasks.nli, module_name=Pipelines.nli)
 class NLIPipeline(Pipeline):
 
     def __init__(self,
                  model: Union[SbertForNLI, str],
                  preprocessor: NLIPreprocessor = None,
+                 first_sequence='first_sequence',
+                 second_sequence='second_sequence',
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
@@ -36,20 +40,12 @@ class NLIPipeline(Pipeline):
         if preprocessor is None:
             preprocessor = NLIPreprocessor(
                 sc_model.model_dir,
-                first_sequence='first_sequence',
-                second_sequence='second_sequence')
+                first_sequence=first_sequence,
+                second_sequence=second_sequence)
         super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
+        assert len(sc_model.id2label) > 0
 
-        self.label_path = os.path.join(sc_model.model_dir,
-                                       'label_mapping.json')
-        with open(self.label_path) as f:
-            self.label_mapping = json.load(f)
-        self.label_id_to_name = {
-            idx: name
-            for name, idx in self.label_mapping.items()
-        }
-
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+    def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
         """process the prediction results
 
         Args:
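The removed block re-read label_mapping.json from disk in every pipeline; the replacement assumes the model object exposes id2label, which the assert makes explicit. A sketch of the old behaviour next to the new expectation, with DummyNLIModel and its labels invented for illustration:

import json
import os


def load_label_mapping(model_dir):
    # old behaviour: parse label_mapping.json and invert name -> id
    with open(os.path.join(model_dir, 'label_mapping.json')) as f:
        label_mapping = json.load(f)
    return {idx: name for name, idx in label_mapping.items()}


class DummyNLIModel:
    # new behaviour: the model itself owns the id -> name mapping
    id2label = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}


scores = [0.1, 0.2, 0.7]
best = max(range(len(scores)), key=lambda i: scores[i])
assert DummyNLIModel.id2label[best] == 'entailment'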
diff --git a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py
index 95e78260..652c4bfb 100644
--- a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py
+++ b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py
@@ -20,6 +20,8 @@ class SentenceSimilarityPipeline(Pipeline):
     def __init__(self,
                  model: Union[Model, str],
                  preprocessor: SequenceClassificationPreprocessor = None,
+                 first_sequence='first_sequence',
+                 second_sequence='second_sequence',
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction
@@ -35,14 +37,14 @@ class SentenceSimilarityPipeline(Pipeline):
         if preprocessor is None:
             preprocessor = SequenceClassificationPreprocessor(
                 sc_model.model_dir,
-                first_sequence='first_sequence',
-                second_sequence='second_sequence')
+                first_sequence=first_sequence,
+                second_sequence=second_sequence)
         super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
         assert hasattr(self.model, 'id2label'), \
             'id2label map should be initialized in init function.'
 
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+    def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
         """process the prediction results
 
         Args:
diff --git a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py b/modelscope/pipelines/nlp/sentiment_classification_pipeline.py
index 818c792d..62a30f8f 100644
--- a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/sentiment_classification_pipeline.py
@@ -5,24 +5,27 @@ from typing import Any, Dict, Union
 import json
 import numpy as np
 
-from modelscope.models.nlp import SbertForSentimentClassification
-from modelscope.preprocessors import SentimentClassificationPreprocessor
-from modelscope.utils.constant import Tasks
+from ...models.nlp import SbertForSentimentClassification
+from ...preprocessors import SentimentClassificationPreprocessor
+from ...utils.constant import Tasks
 from ...models import Model
 from ..base import Input, Pipeline
 from ..builder import PIPELINES
+from ...metainfo import Pipelines
 
 __all__ = ['SentimentClassificationPipeline']
 
 
 @PIPELINES.register_module(
     Tasks.sentiment_classification,
-    module_name=r'sbert-sentiment-classification')
+    module_name=Pipelines.sentiment_classification)
 class SentimentClassificationPipeline(Pipeline):
 
     def __init__(self,
                  model: Union[SbertForSentimentClassification, str],
                  preprocessor: SentimentClassificationPreprocessor = None,
+                 first_sequence='first_sequence',
+                 second_sequence='second_sequence',
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
@@ -38,20 +41,12 @@ class SentimentClassificationPipeline(Pipeline):
         if preprocessor is None:
             preprocessor = SentimentClassificationPreprocessor(
                 sc_model.model_dir,
-                first_sequence='first_sequence',
-                second_sequence='second_sequence')
+                first_sequence=first_sequence,
+                second_sequence=second_sequence)
         super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
+        assert len(sc_model.id2label) > 0
 
-        self.label_path = os.path.join(sc_model.model_dir,
-                                       'label_mapping.json')
-        with open(self.label_path) as f:
-            self.label_mapping = json.load(f)
-        self.label_id_to_name = {
-            idx: name
-            for name, idx in self.label_mapping.items()
-        }
-
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+    def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
         """process the prediction results
 
         Args:
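Exposing first_sequence/second_sequence as constructor kwargs lets callers feed inputs whose keys follow their own data schema instead of the previously hard-coded 'first_sequence'/'second_sequence'. A sketch, assuming the pipeline factory forwards these kwargs as the constructors above expect; the model id is a placeholder:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

similarity = pipeline(
    Tasks.sentence_similarity,
    model='damo/nlp_structbert_sentence-similarity_chinese-base',  # placeholder
    first_sequence='premise',
    second_sequence='hypothesis')
# dict inputs can now be keyed the way the caller's dataset names them:
# result = similarity({'premise': '...', 'hypothesis': '...'})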
diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py
index ebd4be8e..6efc8de9 100644
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -1,10 +1,10 @@
 from typing import Dict, Optional, Union
 
-from modelscope.metainfo import Pipelines
-from modelscope.models import Model
-from modelscope.models.nlp import PalmForTextGeneration
-from modelscope.preprocessors import TextGenerationPreprocessor
-from modelscope.utils.constant import Tasks
+from ...metainfo import Pipelines
+from ...models import Model
+from ...models.nlp import PalmForTextGeneration
+from ...preprocessors import TextGenerationPreprocessor
+from ...utils.constant import Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
 
@@ -36,7 +36,7 @@ class TextGenerationPipeline(Pipeline):
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.tokenizer = model.tokenizer
 
-    def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
+    def postprocess(self, inputs: Dict[str, Tensor], **postprocess_params) -> Dict[str, str]:
         """process the prediction results
 
         Args:
diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
index a45dafc3..70fcc7aa 100644
--- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py
+++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
@@ -1,10 +1,10 @@
 from typing import Any, Dict, Optional, Union
 
-from modelscope.metainfo import Pipelines
-from modelscope.models import Model
-from modelscope.models.nlp import StructBertForTokenClassification
-from modelscope.preprocessors import TokenClassifcationPreprocessor
-from modelscope.utils.constant import Tasks
+from ...metainfo import Pipelines
+from ...models import Model
+from ...models.nlp import SbertForTokenClassification
+from ...preprocessors import TokenClassifcationPreprocessor
+from ...utils.constant import Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
 
@@ -16,7 +16,7 @@ __all__ = ['WordSegmentationPipeline']
 class WordSegmentationPipeline(Pipeline):
 
     def __init__(self,
-                 model: Union[StructBertForTokenClassification, str],
+                 model: Union[SbertForTokenClassification, str],
                  preprocessor: Optional[TokenClassifcationPreprocessor] = None,
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction
@@ -27,15 +27,16 @@ class WordSegmentationPipeline(Pipeline):
         """
         model = model if isinstance(
             model,
-            StructBertForTokenClassification) else Model.from_pretrained(model)
+            SbertForTokenClassification) else Model.from_pretrained(model)
         if preprocessor is None:
             preprocessor = TokenClassifcationPreprocessor(model.model_dir)
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.tokenizer = preprocessor.tokenizer
         self.config = model.config
+        assert len(self.config.id2label) > 0
         self.id2label = self.config.id2label
 
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+    def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
         """process the prediction results
 
         Args:
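WordSegmentationPipeline now takes id2label from the model config and refuses to run with an empty map. What postprocess does with such a map, reduced to a toy decode; the B/I/E/S tag scheme here is invented purely for illustration, the real label set comes from model.config.id2label:

id2label = {0: 'B', 1: 'I', 2: 'E', 3: 'S'}
tokens = ['今', '天', '天', '气']
tag_ids = [0, 2, 0, 2]

words, current = [], ''
for token, tag_id in zip(tokens, tag_ids):
    current += token
    if id2label[tag_id] in ('E', 'S'):  # E/S tags close the current word
        words.append(current)
        current = ''
print(words)  # ['今天', '天气']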
diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
index e703464a..5753324b 100644
--- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
@@ -6,10 +6,11 @@ import json
 import numpy as np
 from scipy.special import softmax
 
-from modelscope.models.nlp import SbertForZeroShotClassification
-from modelscope.preprocessors import SbertZeroShotClassificationPreprocessor
-from modelscope.utils.constant import Tasks
+from ...models.nlp import SbertForZeroShotClassification
+from ...preprocessors import ZeroShotClassificationPreprocessor
+from ...utils.constant import Tasks
 from ...models import Model
+from ...metainfo import Pipelines
 from ..base import Input, Pipeline
 from ..builder import PIPELINES
 
@@ -18,12 +19,12 @@ __all__ = ['ZeroShotClassificationPipeline']
 
 @PIPELINES.register_module(
     Tasks.zero_shot_classification,
-    module_name=r'bert-zero-shot-classification')
+    module_name=Pipelines.zero_shot_classification)
 class ZeroShotClassificationPipeline(Pipeline):
 
     def __init__(self,
                  model: Union[SbertForZeroShotClassification, str],
-                 preprocessor: SbertZeroShotClassificationPreprocessor = None,
+                 preprocessor: ZeroShotClassificationPreprocessor = None,
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
@@ -32,7 +33,7 @@ class ZeroShotClassificationPipeline(Pipeline):
             preprocessor (SentimentClassificationPreprocessor): a preprocessor instance
         """
         assert isinstance(model, str) or isinstance(model, SbertForZeroShotClassification), \
-            'model must be a single str or BertForZeroShotClassification'
+            'model must be a single str or SbertForZeroShotClassification'
         sc_model = model if isinstance(
             model,
             SbertForZeroShotClassification) else Model.from_pretrained(model)
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 26cd79d8..d19b4f20 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -14,9 +14,9 @@ from .builder import PREPROCESSORS
 
 __all__ = [
     'Tokenize', 'SequenceClassificationPreprocessor',
-    'PalmTextGenerationPreprocessor', 'SbertZeroShotClassificationPreprocessor',
-    'SbertTokenClassifcationPreprocessor', 'SbertNLIPreprocessor',
-    'SbertSentimentClassificationPreprocessor', 'FillMaskPreprocessor'
+    'TextGenerationPreprocessor', 'ZeroShotClassificationPreprocessor',
+    'TokenClassifcationPreprocessor', 'NLIPreprocessor',
+    'SentimentClassificationPreprocessor', 'FillMaskPreprocessor'
 ]
 
 
@@ -35,8 +35,8 @@ class Tokenize(Preprocessor):
 
 @PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.sbert_nli_tokenizer)
-class SbertNLIPreprocessor(Preprocessor):
+    Fields.nlp, module_name=Preprocessors.nli_tokenizer)
+class NLIPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """preprocess the data via the vocab.txt from the `model_dir` path
@@ -105,8 +105,8 @@ class SbertNLIPreprocessor(Preprocessor):
 
 @PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.sbert_sen_cls_tokenizer)
-class SbertSentimentClassificationPreprocessor(Preprocessor):
+    Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer)
+class SentimentClassificationPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """preprocess the data via the vocab.txt from the `model_dir` path
@@ -264,7 +264,7 @@ class SequenceClassificationPreprocessor(Preprocessor):
 
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer)
-class PalmTextGenerationPreprocessor(Preprocessor):
+class TextGenerationPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
         """preprocess the data using the vocab.txt from the `model_dir` path
@@ -374,8 +374,8 @@ class FillMaskPreprocessor(Preprocessor):
 
 @PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.sbert_zero_shot_cls_tokenizer)
-class SbertZeroShotClassificationPreprocessor(Preprocessor):
+    Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer)
+class ZeroShotClassificationPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """preprocess the data via the vocab.txt from the `model_dir` path
@@ -418,8 +418,8 @@ class SbertZeroShotClassificationPreprocessor(Preprocessor):
 
 @PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.sbert_token_cls_tokenizer)
-class SbertTokenClassifcationPreprocessor(Preprocessor):
+    Fields.nlp, module_name=Preprocessors.token_cls_tokenizer)
+class TokenClassifcationPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """preprocess the data via the vocab.txt from the `model_dir` path
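All of the renames above lean on the same decorator-driven registry: a (field, module_name) key maps to a class, so dropping the Sbert prefix only changes the key string, not the lookup mechanism. A minimal stand-in for the real machinery in modelscope.utils.registry, simplified for illustration:

PREPROCESSORS = {}


def register_module(field, module_name):
    # simplified: the real Registry also guards against duplicate keys
    def decorator(cls):
        PREPROCESSORS[(field, module_name)] = cls
        return cls
    return decorator


@register_module('nlp', 'nli-tokenizer')
class NLIPreprocessor:
    pass


# configuration files can now reference the backbone-neutral key
assert PREPROCESSORS[('nlp', 'nli-tokenizer')] is NLIPreprocessor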