diff --git a/modelscope/models/nlp/structbert/modeling_sbert.py b/modelscope/models/nlp/structbert/modeling_sbert.py
index bbac3c95..10c0821c 100755
--- a/modelscope/models/nlp/structbert/modeling_sbert.py
+++ b/modelscope/models/nlp/structbert/modeling_sbert.py
@@ -53,7 +53,7 @@ from .configuration_sbert import SbertConfig
 
 logger = get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = 'chinese_sbert-large-std-512'
+_CHECKPOINT_FOR_DOC = 'nlp_structbert_backbone_base_std'
 _CONFIG_FOR_DOC = 'SbertConfig'
 _TOKENIZER_FOR_DOC = 'SbertTokenizer'
 
diff --git a/modelscope/models/nlp/structbert/tokenization_sbert.py b/modelscope/models/nlp/structbert/tokenization_sbert.py
index 6db69509..cbf98746 100644
--- a/modelscope/models/nlp/structbert/tokenization_sbert.py
+++ b/modelscope/models/nlp/structbert/tokenization_sbert.py
@@ -32,8 +32,10 @@ VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
 PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'chinese_sbert-large-std-512': 512,
-    'english_sbert-large-std-512': 512,
+    'nlp_structbert_backbone_large_std': 512,
+    'nlp_structbert_backbone_base_std': 512,
+    'nlp_structbert_backbone_lite_std': 512,
+    'nlp_structbert_backbone_tiny_std': 512,
 }
 
 PRETRAINED_INIT_CONFIGURATION = {
diff --git a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py b/modelscope/models/nlp/structbert/tokenization_sbert_fast.py
index b02039c6..5b8d79cc 100644
--- a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py
+++ b/modelscope/models/nlp/structbert/tokenization_sbert_fast.py
@@ -38,8 +38,10 @@ PRETRAINED_VOCAB_FILES_MAP = {
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'chinese_sbert-large-std-512': 512,
-    'english_sbert-large-std-512': 512,
+    'nlp_structbert_backbone_large_std': 512,
+    'nlp_structbert_backbone_base_std': 512,
+    'nlp_structbert_backbone_lite_std': 512,
+    'nlp_structbert_backbone_tiny_std': 512,
 }
 
 PRETRAINED_INIT_CONFIGURATION = {
diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py
index e4affe40..644db597 100644
--- a/modelscope/pipelines/nlp/fill_mask_pipeline.py
+++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py
@@ -37,7 +37,8 @@ class FillMaskPipeline(Pipeline):
             preprocessor = FillMaskPreprocessor(
                 fill_mask_model.model_dir,
                 first_sequence=first_sequence,
-                second_sequence=None)
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 128))
         fill_mask_model.eval()
         super().__init__(
             model=fill_mask_model, preprocessor=preprocessor, **kwargs)
diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
index 663d59a4..4ea2f45d 100644
--- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
+++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
@@ -26,7 +26,9 @@ class NamedEntityRecognitionPipeline(Pipeline):
         model = model if isinstance(model,
                                     Model) else Model.from_pretrained(model)
         if preprocessor is None:
-            preprocessor = NERPreprocessor(model.model_dir)
+            preprocessor = NERPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 512))
         model.eval()
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.tokenizer = preprocessor.tokenizer
diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py
index 0804ec8c..d0329da8 100644
--- a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py
@@ -33,5 +33,6 @@ class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase):
             preprocessor = PairSentenceClassificationPreprocessor(
                 model.model_dir if isinstance(model, Model) else model,
                 first_sequence=first_sequence,
-                second_sequence=second_sequence)
+                second_sequence=second_sequence,
+                sequence_length=kwargs.pop('sequence_length', 512))
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py
index 5273ddc6..7fe8aace 100644
--- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/sequence_classification_pipeline.py
@@ -37,7 +37,8 @@ class SequenceClassificationPipeline(Pipeline):
             preprocessor = SequenceClassificationPreprocessor(
                 sc_model.model_dir,
                 first_sequence='sentence',
-                second_sequence=None)
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 512))
         super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
 
         assert hasattr(self.model, 'id2label'), \
diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py
index 8e0b4fe0..cc91ddf2 100644
--- a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py
@@ -31,5 +31,6 @@ class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase):
         if preprocessor is None:
             preprocessor = SingleSentenceClassificationPreprocessor(
                 model.model_dir if isinstance(model, Model) else model,
-                first_sequence=first_sequence)
+                first_sequence=first_sequence,
+                sequence_length=kwargs.pop('sequence_length', 512))
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py
index 287c98ff..8e9b36d5 100644
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -32,7 +32,8 @@ class TextGenerationPipeline(Pipeline):
             preprocessor = TextGenerationPreprocessor(
                 model.model_dir,
                 first_sequence='sentence',
-                second_sequence=None)
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 128))
         model.eval()
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
index 06e6a31c..5d80ea22 100644
--- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py
+++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py
@@ -31,7 +31,9 @@ class WordSegmentationPipeline(Pipeline):
         model = model if isinstance(model,
                                     Model) else Model.from_pretrained(model)
         if preprocessor is None:
-            preprocessor = TokenClassificationPreprocessor(model.model_dir)
+            preprocessor = TokenClassificationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 128))
         model.eval()
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.id2label = kwargs.get('id2label')
diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
index d0dd2336..56ef0da0 100644
--- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
+++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
@@ -36,7 +36,9 @@ class ZeroShotClassificationPipeline(Pipeline):
         self.entailment_id = 0
         self.contradiction_id = 2
         if preprocessor is None:
-            preprocessor = ZeroShotClassificationPreprocessor(model.model_dir)
+            preprocessor = ZeroShotClassificationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 512))
         model.eval()
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
 
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 5bd60dce..c4b73fb8 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -216,7 +216,7 @@ class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
 
     def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get(
-            'padding', False if mode == 'inference' else 'max_length')
+            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
         kwargs['max_length'] = kwargs.pop('sequence_length', 128)
         super().__init__(model_dir, pair=True, mode=mode, **kwargs)
@@ -228,7 +228,7 @@ class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
 
     def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
         kwargs['truncation'] = kwargs.get('truncation', True)
         kwargs['padding'] = kwargs.get(
-            'padding', False if mode == 'inference' else 'max_length')
+            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
         kwargs['max_length'] = kwargs.pop('sequence_length', 128)
         super().__init__(model_dir, pair=False, mode=mode, **kwargs)
@@ -309,7 +309,7 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase):
         return super().build_tokenizer(model_dir)
 
     def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]:
-        if self._mode == 'inference':
+        if self._mode == ModeKeys.INFERENCE:
             return super().__call__(data)
         src_txt = data['src_txt']
         tgt_txt = data['tgt_txt']
@@ -420,6 +420,7 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
         elif isinstance(data, dict):
             text_a = data.get(self.first_sequence)
             labels_list = data.get(self.label)
+        text_a = text_a.replace(' ', '').strip()
         tokenized_inputs = self.tokenizer(
             text_a,
             return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py
index 82c068be..a623500d 100644
--- a/tests/pipelines/test_sentiment_classification.py
+++ b/tests/pipelines/test_sentiment_classification.py
@@ -12,7 +12,7 @@ from modelscope.utils.test_utils import test_level
 
 
 class SentimentClassificationTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
+    model_id = 'damo/nlp_structbert_sentiment-classification_chinese-tiny'
     sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')