From 31c774936b329c64ba42685098424ae045619072 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=A8=E6=B3=93?=
Date: Wed, 22 Jun 2022 19:05:37 +0800
Subject: [PATCH] unfinished

---
 modelscope/metainfo.py                         | 12 ++++---
 .../models/nlp/masked_language_model.py        |  6 ++++
 modelscope/models/nlp/sbert_for_nli.py         |  2 +-
 .../nlp/sbert_for_token_classification.py      |  6 ++--
 .../pipelines/nlp/fill_mask_pipeline.py        | 28 ++++++++-------
 modelscope/pipelines/nlp/nli_pipeline.py       | 36 +++++++++----------
 .../nlp/sentence_similarity_pipeline.py        |  8 +++--
 .../nlp/sentiment_classification_pipeline.py   | 27 ++++++--------
 .../pipelines/nlp/text_generation_pipeline.py  | 12 +++----
 .../nlp/word_segmentation_pipeline.py          | 17 ++++-----
 .../nlp/zero_shot_classification_pipeline.py   | 13 +++----
 modelscope/preprocessors/nlp.py                | 24 ++++++-------
 12 files changed, 100 insertions(+), 91 deletions(-)

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index be965aaa..a8677c16 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -46,6 +46,10 @@ class Pipelines(object):
     word_segmentation = 'word-segmentation'
     text_generation = 'text-generation'
     sentiment_analysis = 'sentiment-analysis'
+    sentiment_classification = 'sentiment-classification'
+    zero_shot_classification = 'zero-shot-classification'
+    fill_mask = 'fill-mask'
+    nli = 'nli'
 
     # audio tasks
     sambert_hifigan_16k_tts = 'sambert-hifigan-16k-tts'
@@ -85,10 +89,10 @@ class Preprocessors(object):
     # nlp preprocessor
     bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
     palm_text_gen_tokenizer = 'palm-text-gen-tokenizer'
-    sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer'
-    sbert_nli_tokenizer = 'sbert-nli-tokenizer'
-    sbert_sen_cls_tokenizer = 'sbert-sen-cls-tokenizer'
-    sbert_zero_shot_cls_tokenizer = 'sbert-zero-shot-cls-tokenizer'
+    token_cls_tokenizer = 'token-cls-tokenizer'
+    nli_tokenizer = 'nli-tokenizer'
+    sen_cls_tokenizer = 'sen-cls-tokenizer'
+    zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer'
 
     # audio preprocessor
     linear_aec_fbank = 'linear-aec-fbank'
diff --git a/modelscope/models/nlp/masked_language_model.py b/modelscope/models/nlp/masked_language_model.py
index fe3918aa..4138da94 100644
--- a/modelscope/models/nlp/masked_language_model.py
+++ b/modelscope/models/nlp/masked_language_model.py
@@ -19,6 +19,12 @@ class MaskedLMModelBase(Model):
     def build_model(self):
         raise NotImplementedError()
 
+    @property
+    def config(self):
+        if hasattr(self.model, 'config'):
+            return self.model.config
+        return None
+
     def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, np.ndarray]:
         """return the result by the model
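The config property added above lets callers probe the wrapped backbone's configuration without knowing whether the concrete subclass ever set one: a missing config yields None instead of an AttributeError. A minimal sketch of the same pattern, with DummyBackbone invented here purely for illustration:

from types import SimpleNamespace


class DummyBackbone:
    # stand-in for a sofa/transformers model that carries a config
    config = SimpleNamespace(vocab_size=21128)


class LazyConfigWrapper:
    # mirrors MaskedLMModelBase.config: forward the backbone config
    # when present, return None otherwise
    def __init__(self, model):
        self.model = model

    @property
    def config(self):
        if hasattr(self.model, 'config'):
            return self.model.config
        return None


assert LazyConfigWrapper(DummyBackbone()).config.vocab_size == 21128
assert LazyConfigWrapper(object()).config is None

This is what allows FillMaskPipeline below to assert fill_mask_model.config is not None at construction time rather than failing later at the first vocab_size lookup.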
diff --git a/modelscope/models/nlp/sbert_for_nli.py b/modelscope/models/nlp/sbert_for_nli.py
index 2e854317..e41bfb91 100644
--- a/modelscope/models/nlp/sbert_for_nli.py
+++ b/modelscope/models/nlp/sbert_for_nli.py
@@ -1,4 +1,4 @@
-from modelscope.utils.constant import Tasks
+from ...utils.constant import Tasks
 from .sbert_for_sequence_classification import SbertForSequenceClassificationBase
 from ..builder import MODELS
 from ...metainfo import Models
diff --git a/modelscope/models/nlp/sbert_for_token_classification.py b/modelscope/models/nlp/sbert_for_token_classification.py
index 36cdf78c..1ec848fb 100644
--- a/modelscope/models/nlp/sbert_for_token_classification.py
+++ b/modelscope/models/nlp/sbert_for_token_classification.py
@@ -2,18 +2,17 @@ from typing import Any, Dict, Union
 
 import numpy as np
 import torch
-from sofa import SbertConfig, SbertForTokenClassification
 
 from modelscope.metainfo import Models
 from modelscope.utils.constant import Tasks
 from ..base import Model, Tensor
 from ..builder import MODELS
 
-__all__ = ['StructBertForTokenClassification']
+__all__ = ['SbertForTokenClassification']
 
 
 @MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert)
-class StructBertForTokenClassification(Model):
+class SbertForTokenClassification(Model):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """initialize the word segmentation model from the `model_dir` path.
@@ -25,6 +24,7 @@ class StructBertForTokenClassification(Model):
         """
         super().__init__(model_dir, *args, **kwargs)
         self.model_dir = model_dir
+        from sofa import SbertConfig, SbertForTokenClassification
        self.model = SbertForTokenClassification.from_pretrained(
             self.model_dir)
         self.config = SbertConfig.from_pretrained(self.model_dir)
diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py
index d7c1d456..ebf0e872 100644
--- a/modelscope/pipelines/nlp/fill_mask_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py
@@ -1,38 +1,41 @@
 from typing import Dict, Optional, Union
 
-from modelscope.models import Model
-from modelscope.models.nlp.masked_language_model import \
-    AliceMindBaseForMaskedLM
-from modelscope.preprocessors import FillMaskPreprocessor
-from modelscope.utils.constant import Tasks
+from ...models import Model
+from ...models.nlp.masked_language_model import \
+    MaskedLMModelBase
+from ...preprocessors import FillMaskPreprocessor
+from ...utils.constant import Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
+from ...metainfo import Pipelines
 
 __all__ = ['FillMaskPipeline']
 
 
-@PIPELINES.register_module(Tasks.fill_mask, module_name=r'sbert')
-@PIPELINES.register_module(Tasks.fill_mask, module_name=r'veco')
+@PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask)
class FillMaskPipeline(Pipeline):
 
     def __init__(self,
-                 model: Union[AliceMindBaseForMaskedLM, str],
+                 model: Union[MaskedLMModelBase, str],
                  preprocessor: Optional[FillMaskPreprocessor] = None,
+                 first_sequence='sentence',
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction
 
         Args:
-            model (AliceMindBaseForMaskedLM): a model instance
+            model (MaskedLMModelBase): a model instance
             preprocessor (FillMaskPreprocessor): a preprocessor instance
         """
         fill_mask_model = model if isinstance(
-            model, AliceMindBaseForMaskedLM) else Model.from_pretrained(model)
+            model, MaskedLMModelBase) else Model.from_pretrained(model)
+        assert fill_mask_model.config is not None
+
         if preprocessor is None:
             preprocessor = FillMaskPreprocessor(
                 fill_mask_model.model_dir,
-                first_sequence='sentence',
+                first_sequence=first_sequence,
                 second_sequence=None)
-        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        super().__init__(model=fill_mask_model, preprocessor=preprocessor, **kwargs)
         self.preprocessor = preprocessor
         self.tokenizer = preprocessor.tokenizer
         self.mask_id = {'veco': 250001, 'sbert': 103}
@@ -82,6 +85,7 @@ class FillMaskPipeline(Pipeline):
         pred_strings = []
         for ids in rst_ids:  # batch
+            # TODO: vocab size is not a stable way to tell models apart
             if self.model.config.vocab_size == 21128:  # zh bert
                 pred_string = self.tokenizer.convert_ids_to_tokens(ids)
                 pred_string = ''.join(pred_string)
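With registration keyed on Pipelines.fill_mask instead of the per-backbone names r'sbert' and r'veco', one registry entry now serves both model families. A usage sketch, assuming the pipeline factory forwards extra kwargs to the constructor; the model id below is a placeholder, not a guaranteed hub id:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Tasks.fill_mask resolves to FillMaskPipeline through the PIPELINES
# registry; the model id is illustrative only
fill_mask = pipeline(
    Tasks.fill_mask, model='damo/nlp_structbert_fill-mask_chinese-base')
print(fill_mask('ModelScope ships [MASK] pipelines for many NLP tasks.'))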
diff --git a/modelscope/pipelines/nlp/nli_pipeline.py b/modelscope/pipelines/nlp/nli_pipeline.py
index 135f826a..fbeb628d 100644
--- a/modelscope/pipelines/nlp/nli_pipeline.py
+++ b/modelscope/pipelines/nlp/nli_pipeline.py
@@ -1,27 +1,31 @@
-import os
-import uuid
-from typing import Any, Dict, Union
-import json
+import uuid
+from typing import Any, Dict, Union
+
 import numpy as np
 
-from modelscope.models.nlp import SbertForNLI
-from modelscope.preprocessors import NLIPreprocessor
-from modelscope.utils.constant import Tasks
-from ...models import Model
-from ..base import Input, Pipeline
+from ..base import Pipeline
 from ..builder import PIPELINES
+from ...metainfo import Pipelines
+from ...models import Model
+from ...models.nlp import SbertForNLI
+from ...preprocessors import NLIPreprocessor
+from ...utils.constant import Tasks
 
 __all__ = ['NLIPipeline']
 
 
 @PIPELINES.register_module(
-    Tasks.nli, module_name=r'nlp_structbert_nli_chinese-base')
+    Tasks.nli, module_name=Pipelines.nli)
 class NLIPipeline(Pipeline):
 
     def __init__(self,
                  model: Union[SbertForNLI, str],
                  preprocessor: NLIPreprocessor = None,
+                 first_sequence='first_sequence',
+                 second_sequence='second_sequence',
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
@@ -36,20 +40,12 @@ class NLIPipeline(Pipeline):
         if preprocessor is None:
             preprocessor = NLIPreprocessor(
                 sc_model.model_dir,
-                first_sequence='first_sequence',
-                second_sequence='second_sequence')
+                first_sequence=first_sequence,
+                second_sequence=second_sequence)
         super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
+        assert len(sc_model.id2label) > 0
 
-        self.label_path = os.path.join(sc_model.model_dir,
-                                       'label_mapping.json')
-        with open(self.label_path) as f:
-            self.label_mapping = json.load(f)
-        self.label_id_to_name = {
-            idx: name
-            for name, idx in self.label_mapping.items()
-        }
-
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+    def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
         """process the prediction results
 
         Args:
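The removed block re-read label_mapping.json from disk in every pipeline; the replacement assumes the model object exposes id2label, which the assert makes explicit. A sketch of the old behaviour next to the new expectation, with DummyNLIModel and its labels invented for illustration:

import json
import os


def load_label_mapping(model_dir):
    # old behaviour: parse label_mapping.json and invert name -> id
    with open(os.path.join(model_dir, 'label_mapping.json')) as f:
        label_mapping = json.load(f)
    return {idx: name for name, idx in label_mapping.items()}


class DummyNLIModel:
    # new behaviour: the model itself owns the id -> name mapping
    id2label = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}


scores = [0.1, 0.2, 0.7]
best = max(range(len(scores)), key=lambda i: scores[i])
assert DummyNLIModel.id2label[best] == 'entailment'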
diff --git a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py
index 95e78260..652c4bfb 100644
--- a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py
+++ b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py
@@ -20,6 +20,8 @@ class SentenceSimilarityPipeline(Pipeline):
     def __init__(self,
                  model: Union[Model, str],
                  preprocessor: SequenceClassificationPreprocessor = None,
+                 first_sequence='first_sequence',
+                 second_sequence='second_sequence',
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction
@@ -35,14 +37,14 @@ class SentenceSimilarityPipeline(Pipeline):
         if preprocessor is None:
             preprocessor = SequenceClassificationPreprocessor(
                 sc_model.model_dir,
-                first_sequence='first_sequence',
-                second_sequence='second_sequence')
+                first_sequence=first_sequence,
+                second_sequence=second_sequence)
         super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
         assert hasattr(self.model, 'id2label'), \
             'id2label map should be initialized in init function.'
 
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+    def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
         """process the prediction results
 
         Args:
diff --git a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py b/modelscope/pipelines/nlp/sentiment_classification_pipeline.py
index 818c792d..62a30f8f 100644
--- a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/sentiment_classification_pipeline.py
@@ -5,24 +5,27 @@ from typing import Any, Dict, Union
 import json
 import numpy as np
 
-from modelscope.models.nlp import SbertForSentimentClassification
-from modelscope.preprocessors import SentimentClassificationPreprocessor
-from modelscope.utils.constant import Tasks
+from ...models.nlp import SbertForSentimentClassification
+from ...preprocessors import SentimentClassificationPreprocessor
+from ...utils.constant import Tasks
 from ...models import Model
 from ..base import Input, Pipeline
 from ..builder import PIPELINES
+from ...metainfo import Pipelines
 
 __all__ = ['SentimentClassificationPipeline']
 
 
 @PIPELINES.register_module(
     Tasks.sentiment_classification,
-    module_name=r'sbert-sentiment-classification')
+    module_name=Pipelines.sentiment_classification)
 class SentimentClassificationPipeline(Pipeline):
 
     def __init__(self,
                  model: Union[SbertForSentimentClassification, str],
                  preprocessor: SentimentClassificationPreprocessor = None,
+                 first_sequence='first_sequence',
+                 second_sequence='second_sequence',
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
@@ -38,20 +41,12 @@ class SentimentClassificationPipeline(Pipeline):
         if preprocessor is None:
             preprocessor = SentimentClassificationPreprocessor(
                 sc_model.model_dir,
-                first_sequence='first_sequence',
-                second_sequence='second_sequence')
+                first_sequence=first_sequence,
+                second_sequence=second_sequence)
         super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
+        assert len(sc_model.id2label) > 0
 
-        self.label_path = os.path.join(sc_model.model_dir,
-                                       'label_mapping.json')
-        with open(self.label_path) as f:
-            self.label_mapping = json.load(f)
-        self.label_id_to_name = {
-            idx: name
-            for name, idx in self.label_mapping.items()
-        }
-
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+    def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
         """process the prediction results
 
         Args:
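Exposing first_sequence/second_sequence as constructor kwargs lets callers feed inputs whose keys follow their own data schema instead of the previously hard-coded 'first_sequence'/'second_sequence'. A sketch, assuming the pipeline factory forwards these kwargs as the constructors above expect; the model id is a placeholder:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

similarity = pipeline(
    Tasks.sentence_similarity,
    model='damo/nlp_structbert_sentence-similarity_chinese-base',  # placeholder
    first_sequence='premise',
    second_sequence='hypothesis')
# dict inputs can now be keyed the way the caller's dataset names them:
# result = similarity({'premise': '...', 'hypothesis': '...'})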
diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py
index ebd4be8e..6efc8de9 100644
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -1,10 +1,10 @@
 from typing import Dict, Optional, Union
 
-from modelscope.metainfo import Pipelines
-from modelscope.models import Model
-from modelscope.models.nlp import PalmForTextGeneration
-from modelscope.preprocessors import TextGenerationPreprocessor
-from modelscope.utils.constant import Tasks
+from ...metainfo import Pipelines
+from ...models import Model
+from ...models.nlp import PalmForTextGeneration
+from ...preprocessors import TextGenerationPreprocessor
+from ...utils.constant import Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
 
@@ -36,7 +36,7 @@ class TextGenerationPipeline(Pipeline):
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.tokenizer = model.tokenizer
 
-    def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
+    def postprocess(self, inputs: Dict[str, Tensor], **postprocess_params) -> Dict[str, str]:
         """process the prediction results
 
         Args:
diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
index a45dafc3..70fcc7aa 100644
--- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py
+++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
@@ -1,10 +1,10 @@
 from typing import Any, Dict, Optional, Union
 
-from modelscope.metainfo import Pipelines
-from modelscope.models import Model
-from modelscope.models.nlp import StructBertForTokenClassification
-from modelscope.preprocessors import TokenClassifcationPreprocessor
-from modelscope.utils.constant import Tasks
+from ...metainfo import Pipelines
+from ...models import Model
+from ...models.nlp import SbertForTokenClassification
+from ...preprocessors import TokenClassifcationPreprocessor
+from ...utils.constant import Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
 
@@ -16,7 +16,7 @@ __all__ = ['WordSegmentationPipeline']
 class WordSegmentationPipeline(Pipeline):
 
     def __init__(self,
-                 model: Union[StructBertForTokenClassification, str],
+                 model: Union[SbertForTokenClassification, str],
                  preprocessor: Optional[TokenClassifcationPreprocessor] = None,
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction
@@ -27,15 +27,16 @@ class WordSegmentationPipeline(Pipeline):
         """
         model = model if isinstance(
             model,
-            StructBertForTokenClassification) else Model.from_pretrained(model)
+            SbertForTokenClassification) else Model.from_pretrained(model)
         if preprocessor is None:
             preprocessor = TokenClassifcationPreprocessor(model.model_dir)
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.tokenizer = preprocessor.tokenizer
         self.config = model.config
+        assert len(self.config.id2label) > 0
         self.id2label = self.config.id2label
 
-    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+    def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
         """process the prediction results
 
         Args:
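WordSegmentationPipeline now takes id2label from the model config and refuses to run with an empty map. What postprocess does with such a map, reduced to a toy decode; the B/I/E/S tag scheme here is invented purely for illustration, the real label set comes from model.config.id2label:

id2label = {0: 'B', 1: 'I', 2: 'E', 3: 'S'}
tokens = ['今', '天', '天', '气']
tag_ids = [0, 2, 0, 2]

words, current = [], ''
for token, tag_id in zip(tokens, tag_ids):
    current += token
    if id2label[tag_id] in ('E', 'S'):  # E/S tags close the current word
        words.append(current)
        current = ''
print(words)  # ['今天', '天气']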
diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
index e703464a..5753324b 100644
--- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
@@ -6,10 +6,11 @@ import json
 import numpy as np
 from scipy.special import softmax
 
-from modelscope.models.nlp import SbertForZeroShotClassification
-from modelscope.preprocessors import SbertZeroShotClassificationPreprocessor
-from modelscope.utils.constant import Tasks
+from ...models.nlp import SbertForZeroShotClassification
+from ...preprocessors import ZeroShotClassificationPreprocessor
+from ...utils.constant import Tasks
 from ...models import Model
+from ...metainfo import Pipelines
 from ..base import Input, Pipeline
 from ..builder import PIPELINES
 
@@ -18,12 +19,12 @@ __all__ = ['ZeroShotClassificationPipeline']
 
 @PIPELINES.register_module(
     Tasks.zero_shot_classification,
-    module_name=r'bert-zero-shot-classification')
+    module_name=Pipelines.zero_shot_classification)
 class ZeroShotClassificationPipeline(Pipeline):
 
     def __init__(self,
                  model: Union[SbertForZeroShotClassification, str],
-                 preprocessor: SbertZeroShotClassificationPreprocessor = None,
+                 preprocessor: ZeroShotClassificationPreprocessor = None,
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
@@ -32,7 +33,7 @@ class ZeroShotClassificationPipeline(Pipeline):
             preprocessor (SentimentClassificationPreprocessor): a preprocessor instance
         """
         assert isinstance(model, str) or isinstance(model, SbertForZeroShotClassification), \
-            'model must be a single str or BertForZeroShotClassification'
+            'model must be a single str or SbertForZeroShotClassification'
         sc_model = model if isinstance(
             model,
             SbertForZeroShotClassification) else Model.from_pretrained(model)
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 26cd79d8..d19b4f20 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -14,9 +14,9 @@ from .builder import PREPROCESSORS
 
 __all__ = [
     'Tokenize', 'SequenceClassificationPreprocessor',
-    'PalmTextGenerationPreprocessor', 'SbertZeroShotClassificationPreprocessor',
-    'SbertTokenClassifcationPreprocessor', 'SbertNLIPreprocessor',
-    'SbertSentimentClassificationPreprocessor', 'FillMaskPreprocessor'
+    'TextGenerationPreprocessor', 'ZeroShotClassificationPreprocessor',
+    'TokenClassifcationPreprocessor', 'NLIPreprocessor',
+    'SentimentClassificationPreprocessor', 'FillMaskPreprocessor'
 ]
 
 
@@ -35,8 +35,8 @@ class Tokenize(Preprocessor):
 
 @PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.sbert_nli_tokenizer)
-class SbertNLIPreprocessor(Preprocessor):
+    Fields.nlp, module_name=Preprocessors.nli_tokenizer)
+class NLIPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """preprocess the data via the vocab.txt from the `model_dir` path
@@ -105,8 +105,8 @@ class SbertNLIPreprocessor(Preprocessor):
 
 @PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.sbert_sen_cls_tokenizer)
-class SbertSentimentClassificationPreprocessor(Preprocessor):
+    Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer)
+class SentimentClassificationPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """preprocess the data via the vocab.txt from the `model_dir` path
@@ -264,7 +264,7 @@ class SequenceClassificationPreprocessor(Preprocessor):
 
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer)
-class PalmTextGenerationPreprocessor(Preprocessor):
+class TextGenerationPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
         """preprocess the data using the vocab.txt from the `model_dir` path
@@ -374,8 +374,8 @@ class FillMaskPreprocessor(Preprocessor):
 
 @PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.sbert_zero_shot_cls_tokenizer)
-class SbertZeroShotClassificationPreprocessor(Preprocessor):
+    Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer)
+class ZeroShotClassificationPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """preprocess the data via the vocab.txt from the `model_dir` path
@@ -418,8 +418,8 @@ class SbertZeroShotClassificationPreprocessor(Preprocessor):
 
 @PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.sbert_token_cls_tokenizer)
-class SbertTokenClassifcationPreprocessor(Preprocessor):
+    Fields.nlp, module_name=Preprocessors.token_cls_tokenizer)
+class TokenClassifcationPreprocessor(Preprocessor):
 
     def __init__(self, model_dir: str, *args, **kwargs):
         """preprocess the data via the vocab.txt from the `model_dir` path
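All of the renames above lean on the same decorator-driven registry: a (field, module_name) key maps to a class, so dropping the Sbert prefix only changes the key string, not the lookup mechanism. A minimal stand-in for the real machinery in modelscope.utils.registry, simplified for illustration:

PREPROCESSORS = {}


def register_module(field, module_name):
    # simplified: the real Registry also guards against duplicate keys
    def decorator(cls):
        PREPROCESSORS[(field, module_name)] = cls
        return cls
    return decorator


@register_module('nlp', 'nli-tokenizer')
class NLIPreprocessor:
    pass


# configuration files can now reference the backbone-neutral key
assert PREPROCESSORS[('nlp', 'nli-tokenizer')] is NLIPreprocessor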