
[to #42322933] Redo an unmerged CR: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9427959

1. Redo the unmerged CR on top of the current code.
2. Refactor sbert's model configs.
   Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9643861
Branch: master
yuze.zyz, yingda.chen · 3 years ago
Commit: 845cc869ca
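
What the pipeline changes amount to for callers: every pipeline touched below now accepts a sequence_length keyword and forwards it to its preprocessor, instead of always falling back to the preprocessor's built-in default. A minimal usage sketch, assuming (as this CR relies on) that extra keyword arguments given to the pipeline() factory reach the pipeline constructor; the model id is only illustrative:

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # 'sequence_length' is popped by the pipeline constructor and handed to the
    # preprocessor, which maps it onto the tokenizer's max_length.
    word_seg = pipeline(
        Tasks.word_segmentation,
        model='damo/nlp_structbert_word-segmentation_chinese-base',  # illustrative
        sequence_length=64)
    print(word_seg('今天天气不错,适合出去游玩'))
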
13 changed files with 33 additions and 17 deletions:
1. modelscope/models/nlp/structbert/modeling_sbert.py (+1 -1)
2. modelscope/models/nlp/structbert/tokenization_sbert.py (+4 -2)
3. modelscope/models/nlp/structbert/tokenization_sbert_fast.py (+4 -2)
4. modelscope/pipelines/nlp/fill_mask_pipeline.py (+2 -1)
5. modelscope/pipelines/nlp/named_entity_recognition_pipeline.py (+3 -1)
6. modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py (+2 -1)
7. modelscope/pipelines/nlp/sequence_classification_pipeline.py (+2 -1)
8. modelscope/pipelines/nlp/single_sentence_classification_pipeline.py (+2 -1)
9. modelscope/pipelines/nlp/text_generation_pipeline.py (+2 -1)
10. modelscope/pipelines/nlp/word_segmentation_pipeline.py (+3 -1)
11. modelscope/pipelines/nlp/zero_shot_classification_pipeline.py (+3 -1)
12. modelscope/preprocessors/nlp.py (+4 -3)
13. tests/pipelines/test_sentiment_classification.py (+1 -1)

modelscope/models/nlp/structbert/modeling_sbert.py (+1 -1)

@@ -53,7 +53,7 @@ from .configuration_sbert import SbertConfig

logger = get_logger(__name__)

-_CHECKPOINT_FOR_DOC = 'chinese_sbert-large-std-512'
+_CHECKPOINT_FOR_DOC = 'nlp_structbert_backbone_base_std'
_CONFIG_FOR_DOC = 'SbertConfig'
_TOKENIZER_FOR_DOC = 'SbertTokenizer'



modelscope/models/nlp/structbert/tokenization_sbert.py (+4 -2)

@@ -32,8 +32,10 @@ VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'chinese_sbert-large-std-512': 512,
-    'english_sbert-large-std-512': 512,
+    'nlp_structbert_backbone_large_std': 512,
+    'nlp_structbert_backbone_base_std': 512,
+    'nlp_structbert_backbone_lite_std': 512,
+    'nlp_structbert_backbone_tiny_std': 512,
}

PRETRAINED_INIT_CONFIGURATION = {


modelscope/models/nlp/structbert/tokenization_sbert_fast.py (+4 -2)

@@ -38,8 +38,10 @@ PRETRAINED_VOCAB_FILES_MAP = {
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'chinese_sbert-large-std-512': 512,
-    'english_sbert-large-std-512': 512,
+    'nlp_structbert_backbone_large_std': 512,
+    'nlp_structbert_backbone_base_std': 512,
+    'nlp_structbert_backbone_lite_std': 512,
+    'nlp_structbert_backbone_tiny_std': 512,
}

PRETRAINED_INIT_CONFIGURATION = {
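
Both tokenizer files only rename the keys of this table to the new checkpoint names. In HuggingFace-style tokenizers such a table (exposed as max_model_input_sizes) records the longest input the positional embeddings of each named checkpoint support, and the value becomes the tokenizer's model_max_length when it is loaded by that name. A hedged illustration of how such a table is typically consumed, not the tokenizer's actual loading code:

    PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
        'nlp_structbert_backbone_large_std': 512,
        'nlp_structbert_backbone_base_std': 512,
        'nlp_structbert_backbone_lite_std': 512,
        'nlp_structbert_backbone_tiny_std': 512,
    }

    def model_max_length(checkpoint_name: str, default: int = 512) -> int:
        # Look up the positional-embedding limit for a named checkpoint,
        # falling back to a default for unknown names.
        return PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES.get(checkpoint_name, default)

    assert model_max_length('nlp_structbert_backbone_base_std') == 512
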


modelscope/pipelines/nlp/fill_mask_pipeline.py (+2 -1)

@@ -37,7 +37,8 @@ class FillMaskPipeline(Pipeline):
            preprocessor = FillMaskPreprocessor(
                fill_mask_model.model_dir,
                first_sequence=first_sequence,
-                second_sequence=None)
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 128))
        fill_mask_model.eval()
        super().__init__(
            model=fill_mask_model, preprocessor=preprocessor, **kwargs)
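
A note on the pattern used here and repeated in the pipelines below: kwargs.pop('sequence_length', 128) both reads the caller's value and removes the key from kwargs, so it is not passed along a second time when **kwargs goes to the base Pipeline.__init__. A small self-contained sketch of the pop-versus-get difference:

    def consume(**kwargs):
        # pop consumes the key, so later consumers of **kwargs never see it
        seq_len = kwargs.pop('sequence_length', 128)
        return seq_len, kwargs

    print(consume(sequence_length=256, device='cpu'))
    # (256, {'device': 'cpu'})  -> the key was consumed
    print(consume(device='cpu'))
    # (128, {'device': 'cpu'})  -> default used when the key is absent
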


modelscope/pipelines/nlp/named_entity_recognition_pipeline.py (+3 -1)

@@ -26,7 +26,9 @@ class NamedEntityRecognitionPipeline(Pipeline):
        model = model if isinstance(model,
                                    Model) else Model.from_pretrained(model)
        if preprocessor is None:
-            preprocessor = NERPreprocessor(model.model_dir)
+            preprocessor = NERPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 512))
        model.eval()
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.tokenizer = preprocessor.tokenizer


modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py (+2 -1)

@@ -33,5 +33,6 @@ class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase):
            preprocessor = PairSentenceClassificationPreprocessor(
                model.model_dir if isinstance(model, Model) else model,
                first_sequence=first_sequence,
-                second_sequence=second_sequence)
+                second_sequence=second_sequence,
+                sequence_length=kwargs.pop('sequence_length', 512))
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)

modelscope/pipelines/nlp/sequence_classification_pipeline.py (+2 -1)

@@ -37,7 +37,8 @@ class SequenceClassificationPipeline(Pipeline):
            preprocessor = SequenceClassificationPreprocessor(
                sc_model.model_dir,
                first_sequence='sentence',
-                second_sequence=None)
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 512))
        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

        assert hasattr(self.model, 'id2label'), \


modelscope/pipelines/nlp/single_sentence_classification_pipeline.py (+2 -1)

@@ -31,5 +31,6 @@ class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase):
        if preprocessor is None:
            preprocessor = SingleSentenceClassificationPreprocessor(
                model.model_dir if isinstance(model, Model) else model,
-                first_sequence=first_sequence)
+                first_sequence=first_sequence,
+                sequence_length=kwargs.pop('sequence_length', 512))
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)

modelscope/pipelines/nlp/text_generation_pipeline.py (+2 -1)

@@ -32,7 +32,8 @@ class TextGenerationPipeline(Pipeline):
            preprocessor = TextGenerationPreprocessor(
                model.model_dir,
                first_sequence='sentence',
-                second_sequence=None)
+                second_sequence=None,
+                sequence_length=kwargs.pop('sequence_length', 128))
        model.eval()
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)



modelscope/pipelines/nlp/word_segmentation_pipeline.py (+3 -1)

@@ -31,7 +31,9 @@ class WordSegmentationPipeline(Pipeline):
        model = model if isinstance(model,
                                    Model) else Model.from_pretrained(model)
        if preprocessor is None:
-            preprocessor = TokenClassificationPreprocessor(model.model_dir)
+            preprocessor = TokenClassificationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 128))
        model.eval()
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        self.id2label = kwargs.get('id2label')


modelscope/pipelines/nlp/zero_shot_classification_pipeline.py (+3 -1)

@@ -36,7 +36,9 @@ class ZeroShotClassificationPipeline(Pipeline):
        self.entailment_id = 0
        self.contradiction_id = 2
        if preprocessor is None:
-            preprocessor = ZeroShotClassificationPreprocessor(model.model_dir)
+            preprocessor = ZeroShotClassificationPreprocessor(
+                model.model_dir,
+                sequence_length=kwargs.pop('sequence_length', 512))
        model.eval()
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)



modelscope/preprocessors/nlp.py (+4 -3)

@@ -216,7 +216,7 @@ class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
        kwargs['truncation'] = kwargs.get('truncation', True)
        kwargs['padding'] = kwargs.get(
-            'padding', False if mode == 'inference' else 'max_length')
+            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
        super().__init__(model_dir, pair=True, mode=mode, **kwargs)

@@ -228,7 +228,7 @@ class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase):
    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
        kwargs['truncation'] = kwargs.get('truncation', True)
        kwargs['padding'] = kwargs.get(
-            'padding', False if mode == 'inference' else 'max_length')
+            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
        super().__init__(model_dir, pair=False, mode=mode, **kwargs)

@@ -309,7 +309,7 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase):
        return super().build_tokenizer(model_dir)

    def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]:
-        if self._mode == 'inference':
+        if self._mode == ModeKeys.INFERENCE:
            return super().__call__(data)
        src_txt = data['src_txt']
        tgt_txt = data['tgt_txt']

@@ -420,6 +420,7 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
        elif isinstance(data, dict):
            text_a = data.get(self.first_sequence)
            labels_list = data.get(self.label)
+            text_a = text_a.replace(' ', '').strip()
        tokenized_inputs = self.tokenizer(
            text_a,
            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
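
The preprocessor side of the same change, as the hunks above show: sequence_length is renamed onto the tokenizer's max_length, truncation is on by default, and padding defaults to off at inference but to 'max_length' in other modes; the last hunk additionally strips spaces from token-classification input text before tokenization. A simplified, self-contained sketch of that kwarg plumbing, with a stand-in ModeKeys and no real tokenizer:

    class ModeKeys:  # stand-in for modelscope.utils.constant.ModeKeys
        TRAIN = 'train'
        INFERENCE = 'inference'

    def tokenizer_kwargs(mode=ModeKeys.INFERENCE, **kwargs):
        # Mirror of the defaults above: always truncate, pad only outside inference,
        # and map the pipeline-facing 'sequence_length' onto the tokenizer 'max_length'.
        kwargs['truncation'] = kwargs.get('truncation', True)
        kwargs['padding'] = kwargs.get(
            'padding', False if mode == ModeKeys.INFERENCE else 'max_length')
        kwargs['max_length'] = kwargs.pop('sequence_length', 128)
        return kwargs

    print(tokenizer_kwargs(sequence_length=512))
    # {'truncation': True, 'padding': False, 'max_length': 512}
    print(tokenizer_kwargs(mode=ModeKeys.TRAIN))
    # {'truncation': True, 'padding': 'max_length', 'max_length': 128}
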


tests/pipelines/test_sentiment_classification.py (+1 -1)

@@ -12,7 +12,7 @@ from modelscope.utils.test_utils import test_level


class SentimentClassificationTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base'
+    model_id = 'damo/nlp_structbert_sentiment-classification_chinese-tiny'
    sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音'

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')

