
[to #42322933] Redo an unmerged CR: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9427959

1. Redo the unmerged CR on top of the current code.
2. Refactor sbert's model configs.
   Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9643861
Branch: master
yuze.zyz, yingda.chen · 3 years ago
Commit: 845cc869ca
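
What the pipeline changes amount to for callers: every pipeline touched below now accepts a sequence_length keyword and forwards it to its preprocessor, instead of always falling back to the preprocessor's built-in default. A minimal usage sketch, assuming (as this CR relies on) that extra keyword arguments given to the pipeline() factory reach the pipeline constructor; the model id is only illustrative:

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # 'sequence_length' is popped by the pipeline constructor and handed to the
    # preprocessor, which maps it onto the tokenizer's max_length.
    word_seg = pipeline(
        Tasks.word_segmentation,
        model='damo/nlp_structbert_word-segmentation_chinese-base',  # illustrative
        sequence_length=64)
    print(word_seg('今天天气不错,适合出去游玩'))
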
13 changed files with 33 additions and 17 deletions:
1. modelscope/models/nlp/structbert/modeling_sbert.py (+1 -1)
2. modelscope/models/nlp/structbert/tokenization_sbert.py (+4 -2)
3. modelscope/models/nlp/structbert/tokenization_sbert_fast.py (+4 -2)
4. modelscope/pipelines/nlp/fill_mask_pipeline.py (+2 -1)
5. modelscope/pipelines/nlp/named_entity_recognition_pipeline.py (+3 -1)
6. modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py (+2 -1)
7. modelscope/pipelines/nlp/sequence_classification_pipeline.py (+2 -1)
8. modelscope/pipelines/nlp/single_sentence_classification_pipeline.py (+2 -1)
9. modelscope/pipelines/nlp/text_generation_pipeline.py (+2 -1)
10. modelscope/pipelines/nlp/word_segmentation_pipeline.py (+3 -1)
11. modelscope/pipelines/nlp/zero_shot_classification_pipeline.py (+3 -1)
12. modelscope/preprocessors/nlp.py (+4 -3)
13. tests/pipelines/test_sentiment_classification.py (+1 -1)

modelscope/models/nlp/structbert/modeling_sbert.py (+1 -1)

@@ -53,7 +53,7 @@ from .configuration_sbert import SbertConfig

logger = get_logger(__name__)

-_CHECKPOINT_FOR_DOC = 'chinese_sbert-large-std-512'
+_CHECKPOINT_FOR_DOC = 'nlp_structbert_backbone_base_std'
_CONFIG_FOR_DOC = 'SbertConfig'
_TOKENIZER_FOR_DOC = 'SbertTokenizer'



modelscope/models/nlp/structbert/tokenization_sbert.py (+4 -2)

@@ -32,8 +32,10 @@ VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'chinese_sbert-large-std-512': 512,
-    'english_sbert-large-std-512': 512,
+    'nlp_structbert_backbone_large_std': 512,
+    'nlp_structbert_backbone_base_std': 512,
+    'nlp_structbert_backbone_lite_std': 512,
+    'nlp_structbert_backbone_tiny_std': 512,
}

PRETRAINED_INIT_CONFIGURATION = {


modelscope/models/nlp/structbert/tokenization_sbert_fast.py (+4 -2)

@@ -38,8 +38,10 @@ PRETRAINED_VOCAB_FILES_MAP = {
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'chinese_sbert-large-std-512': 512,
-    'english_sbert-large-std-512': 512,
+    'nlp_structbert_backbone_large_std': 512,
+    'nlp_structbert_backbone_base_std': 512,
+    'nlp_structbert_backbone_lite_std': 512,
+    'nlp_structbert_backbone_tiny_std': 512,
}

PRETRAINED_INIT_CONFIGURATION = {
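
Both tokenizer files only rename the keys of this table to the new checkpoint names. In HuggingFace-style tokenizers such a table (exposed as max_model_input_sizes) records the longest input the positional embeddings of each named checkpoint support, and the value becomes the tokenizer's model_max_length when it is loaded by that name. A hedged illustration of how such a table is typically consumed, not the tokenizer's actual loading code:

    PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
        'nlp_structbert_backbone_large_std': 512,
        'nlp_structbert_backbone_base_std': 512,
        'nlp_structbert_backbone_lite_std': 512,
        'nlp_structbert_backbone_tiny_std': 512,
    }

    def model_max_length(checkpoint_name: str, default: int = 512) -> int:
        # Look up the positional-embedding limit for a named checkpoint,
        # falling back to a default for unknown names.
        return PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES.get(checkpoint_name, default)

    assert model_max_length('nlp_structbert_backbone_base_std') == 512
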


modelscope/pipelines/nlp/fill_mask_pipeline.py (+2 -1)

@@ -37,7 +37,8 @@ class FillMaskPipeline(Pipeline):
            preprocessor = FillMaskPreprocessor(
                fill_mask_model.model_dir,
                first_sequence=first_sequence,
-                second_sequence=None)
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 128))
        fill_mask_model.eval()
        super().__init__(
            model=fill_mask_model, preprocessor=preprocessor, **kwargs)
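
A note on the pattern used here and repeated in the pipelines below: kwargs.pop('sequence_length', 128) both reads the caller's value and removes the key from kwargs, so it is not passed along a second time when **kwargs goes to the base Pipeline.__init__. A small self-contained sketch of the pop-versus-get difference:

    def consume(**kwargs):
        # pop consumes the key, so later consumers of **kwargs never see it
        seq_len = kwargs.pop('sequence_length', 128)
        return seq_len, kwargs

    print(consume(sequence_length=256, device='cpu'))
    # (256, {'device': 'cpu'})  -> the key was consumed
    print(consume(device='cpu'))
    # (128, {'device': 'cpu'})  -> default used when the key is absent
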


modelscope/pipelines/nlp/named_entity_recognition_pipeline.py (+3 -1)

@@ -26,7 +26,9 @@ class NamedEntityRecognitionPipeline(Pipeline):
        model = model if isinstance(model,
                                    Model) else Model.from_pretrained(model)
        if preprocessor is None:
-            preprocessor = NERPreprocessor(model.model_dir)
+            preprocessor = NERPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 512))
        model.eval()
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.tokenizer = preprocessor.tokenizer


modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py (+2 -1)

@@ -33,5 +33,6 @@ class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase):
            preprocessor = PairSentenceClassificationPreprocessor(
                model.model_dir if isinstance(model, Model) else model,
                first_sequence=first_sequence,
-                second_sequence=second_sequence)
+                second_sequence=second_sequence,
+                sequence_length=kwargs.pop('sequence_length', 512))
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)

modelscope/pipelines/nlp/sequence_classification_pipeline.py (+2 -1)

@@ -37,7 +37,8 @@ class SequenceClassificationPipeline(Pipeline):
            preprocessor = SequenceClassificationPreprocessor(
                sc_model.model_dir,
                first_sequence='sentence',
-                second_sequence=None)
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 512))
        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

        assert hasattr(self.model, 'id2label'), \


modelscope/pipelines/nlp/single_sentence_classification_pipeline.py (+2 -1)

@@ -31,5 +31,6 @@ class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase):
        if preprocessor is None:
            preprocessor = SingleSentenceClassificationPreprocessor(
                model.model_dir if isinstance(model, Model) else model,
-                first_sequence=first_sequence)
+                first_sequence=first_sequence,
+                sequence_length=kwargs.pop('sequence_length', 512))
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)

modelscope/pipelines/nlp/text_generation_pipeline.py (+2 -1)

@@ -32,7 +32,8 @@ class TextGenerationPipeline(Pipeline):
            preprocessor = TextGenerationPreprocessor(
                model.model_dir,
                first_sequence='sentence',
-                second_sequence=None)
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 128))
        model.eval()
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)



modelscope/pipelines/nlp/word_segmentation_pipeline.py (+3 -1)

@@ -31,7 +31,9 @@ class WordSegmentationPipeline(Pipeline):
        model = model if isinstance(model,
                                    Model) else Model.from_pretrained(model)
        if preprocessor is None:
-            preprocessor = TokenClassificationPreprocessor(model.model_dir)
+            preprocessor = TokenClassificationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 128))
        model.eval()
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.id2label = kwargs.get('id2label')


modelscope/pipelines/nlp/zero_shot_classification_pipeline.py (+3 -1)

@@ -36,7 +36,9 @@ class ZeroShotClassificationPipeline(Pipeline):
        self.entailment_id = 0
        self.contradiction_id = 2
        if preprocessor is None:
-            preprocessor = ZeroShotClassificationPreprocessor(model.model_dir)
+            preprocessor = ZeroShotClassificationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 512))
        model.eval()
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)



modelscope/preprocessors/nlp.py (+4 -3)

@@ -216,7 +216,7 @@ class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
        kwargs['truncation'] = kwargs.get('truncation', True)
        kwargs['padding'] = kwargs.get(
-            'padding', False if mode == 'inference' else 'max_length')
+            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
        super().__init__(model_dir, pair=True, mode=mode, **kwargs)

@@ -228,7 +228,7 @@ class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
        kwargs['truncation'] = kwargs.get('truncation', True)
        kwargs['padding'] = kwargs.get(
-            'padding', False if mode == 'inference' else 'max_length')
+            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
        super().__init__(model_dir, pair=False, mode=mode, **kwargs)

@@ -309,7 +309,7 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase):
        return super().build_tokenizer(model_dir)

    def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]:
-        if self._mode == 'inference':
+        if self._mode == ModeKeys.INFERENCE:
            return super().__call__(data)
        src_txt = data['src_txt']
        tgt_txt = data['tgt_txt']

@@ -420,6 +420,7 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
        elif isinstance(data, dict):
            text_a = data.get(self.first_sequence)
            labels_list = data.get(self.label)
+            text_a = text_a.replace(' ', '').strip()
        tokenized_inputs = self.tokenizer(
            text_a,
            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
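
The preprocessor side of the same change, as the hunks above show: sequence_length is renamed onto the tokenizer's max_length, truncation is on by default, and padding defaults to off at inference but to 'max_length' in other modes; the last hunk additionally strips spaces from token-classification input text before tokenization. A simplified, self-contained sketch of that kwarg plumbing, with a stand-in ModeKeys and no real tokenizer:

    class ModeKeys:  # stand-in for modelscope.utils.constant.ModeKeys
        TRAIN = 'train'
        INFERENCE = 'inference'

    def tokenizer_kwargs(mode=ModeKeys.INFERENCE, **kwargs):
        # Mirror of the defaults above: always truncate, pad only outside inference,
        # and map the pipeline-facing 'sequence_length' onto the tokenizer 'max_length'.
        kwargs['truncation'] = kwargs.get('truncation', True)
        kwargs['padding'] = kwargs.get(
            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
        return kwargs

    print(tokenizer_kwargs(sequence_length=512))
    # {'truncation': True, 'padding': False, 'max_length': 512}
    print(tokenizer_kwargs(mode=ModeKeys.TRAIN))
    # {'truncation': True, 'padding': 'max_length', 'max_length': 128}
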


tests/pipelines/test_sentiment_classification.py (+1 -1)

@@ -12,7 +12,7 @@ from modelscope.utils.test_utils import test_level


class SentimentClassificationTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
+    model_id = 'damo/nlp_structbert_sentiment-classification_chinese-tiny'
    sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音'

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')

