@@ -483,6 +483,8 @@ class NERPreprocessor(Preprocessor):
         self.sequence_length = kwargs.pop('sequence_length', 512)
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_dir, use_fast=True)
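+        # New flag, read from the tokenizer's init kwargs: some tokenizer
+        # configs declare `is_split_into_words`; default to False so models
+        # without it keep the fast-tokenizer path in __call__.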
+        self.is_split_into_words = self.tokenizer.init_kwargs.get(
+            'is_split_into_words', False)
 
     @type_assert(object, str)
     def __call__(self, data: str) -> Dict[str, Any]:
@@ -499,29 +501,51 @@ class NERPreprocessor(Preprocessor):
 
         # preprocess the data for the model input
         text = data
-        encodings = self.tokenizer(
-            text,
-            add_special_tokens=True,
-            padding=True,
-            truncation=True,
-            max_length=self.sequence_length,
-            return_offsets_mapping=True)
-        input_ids = encodings['input_ids']
-        attention_mask = encodings['attention_mask']
-        word_ids = encodings.word_ids()
-        label_mask = []
-        offset_mapping = []
-        for i in range(len(word_ids)):
-            if word_ids[i] is None:
-                label_mask.append(0)
-            elif word_ids[i] == word_ids[i - 1]:
-                label_mask.append(0)
-                offset_mapping[-1] = (offset_mapping[-1][0],
-                                      encodings['offset_mapping'][i][1])
-            else:
-                label_mask.append(1)
-                offset_mapping.append(encodings['offset_mapping'][i])
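+        # Two encoding paths: character-split models are encoded one
+        # character at a time so each character keeps its own label slot;
+        # all other models rely on the fast tokenizer's word alignment.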
+        if self.is_split_into_words:
+            input_ids = []
+            label_mask = []
+            offset_mapping = []
+            for offset, token in enumerate(list(data)):
+                subtoken_ids = self.tokenizer.encode(
+                    token, add_special_tokens=False)
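+                # A character missing from the vocab can encode to an empty
+                # list; substitute [UNK] so every character still occupies
+                # a position in the sequence.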
+                if len(subtoken_ids) == 0:
+                    subtoken_ids = [self.tokenizer.unk_token_id]
+                input_ids.extend(subtoken_ids)
+                label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
+                offset_mapping.extend([(offset, offset + 1)]
+                                      + [(offset + 1, offset + 1)]
+                                      * (len(subtoken_ids) - 1))
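+            # Truncate to leave room for the [CLS] and [SEP] tokens
+            # appended below.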
+            if len(input_ids) >= self.sequence_length - 2:
+                input_ids = input_ids[:self.sequence_length - 2]
+                label_mask = label_mask[:self.sequence_length - 2]
+                offset_mapping = offset_mapping[:self.sequence_length - 2]
+            input_ids = [self.tokenizer.cls_token_id
+                         ] + input_ids + [self.tokenizer.sep_token_id]
+            label_mask = [0] + label_mask + [0]
+            attention_mask = [1] * len(input_ids)
+        else:
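+            # Fast-tokenizer path: keep a label slot only for the first
+            # subtoken of each word; continuation subtokens fold their
+            # character offsets into the previous entry.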
+            encodings = self.tokenizer(
+                text,
+                add_special_tokens=True,
+                padding=True,
+                truncation=True,
+                max_length=self.sequence_length,
+                return_offsets_mapping=True)
+            input_ids = encodings['input_ids']
+            attention_mask = encodings['attention_mask']
+            word_ids = encodings.word_ids()
+            label_mask = []
+            offset_mapping = []
+            for i in range(len(word_ids)):
+                if word_ids[i] is None:
+                    label_mask.append(0)
+                elif word_ids[i] == word_ids[i - 1]:
+                    label_mask.append(0)
+                    offset_mapping[-1] = (offset_mapping[-1][0],
+                                          encodings['offset_mapping'][i][1])
+                else:
+                    label_mask.append(1)
+                    offset_mapping.append(encodings['offset_mapping'][i])
 
         return {
             'text': text,
             'input_ids': input_ids,