From 55dfa0a8568913fb18b3eed5861391b075e71a58 Mon Sep 17 00:00:00 2001
From: pangda
Date: Thu, 4 Aug 2022 16:11:22 +0800
Subject: [PATCH] [to #42322933] update ner default model & fix tokenizer bug

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9627744
---
 modelscope/pipelines/builder.py               |  2 +-
 .../nlp/named_entity_recognition_pipeline.py  |  2 +-
 modelscope/preprocessors/nlp.py               | 70 +++++++++++++------
 .../test_named_entity_recognition.py          |  6 +-
 4 files changed, 52 insertions(+), 28 deletions(-)

diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 21bdd36c..28dd190a 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -22,7 +22,7 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/nlp_structbert_word-segmentation_chinese-base'),
     Tasks.named_entity_recognition:
     (Pipelines.named_entity_recognition,
-     'damo/nlp_transformercrf_named-entity-recognition_chinese-base-news'),
+     'damo/nlp_raner_named-entity-recognition_chinese-base-news'),
     Tasks.sentence_similarity:
     (Pipelines.sentence_similarity,
      'damo/nlp_structbert_sentence-similarity_chinese-base'),
diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
index 29c439fc..663d59a4 100644
--- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
+++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
@@ -42,7 +42,7 @@ class NamedEntityRecognitionPipeline(Pipeline):
     def postprocess(self, inputs: Dict[str, Any],
                     **postprocess_params) -> Dict[str, str]:
         text = inputs['text']
-        offset_mapping = inputs['offset_mapping']
+        offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']]
         labels = [self.id2label[x] for x in inputs['predicts']]
         entities = []
         entity = {}
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 58ad3dbe..5bd60dce 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -483,6 +483,8 @@ class NERPreprocessor(Preprocessor):
         self.sequence_length = kwargs.pop('sequence_length', 512)
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_dir, use_fast=True)
+        self.is_split_into_words = self.tokenizer.init_kwargs.get(
+            'is_split_into_words', False)
 
     @type_assert(object, str)
     def __call__(self, data: str) -> Dict[str, Any]:
@@ -499,29 +501,51 @@ class NERPreprocessor(Preprocessor):
 
         # preprocess the data for the model input
         text = data
-        encodings = self.tokenizer(
-            text,
-            add_special_tokens=True,
-            padding=True,
-            truncation=True,
-            max_length=self.sequence_length,
-            return_offsets_mapping=True)
-        input_ids = encodings['input_ids']
-        attention_mask = encodings['attention_mask']
-        word_ids = encodings.word_ids()
-        label_mask = []
-        offset_mapping = []
-        for i in range(len(word_ids)):
-            if word_ids[i] is None:
-                label_mask.append(0)
-            elif word_ids[i] == word_ids[i - 1]:
-                label_mask.append(0)
-                offset_mapping[-1] = (offset_mapping[-1][0],
-                                      encodings['offset_mapping'][i][1])
-            else:
-                label_mask.append(1)
-                offset_mapping.append(encodings['offset_mapping'][i])
-
+        if self.is_split_into_words:
+            input_ids = []
+            label_mask = []
+            offset_mapping = []
+            for offset, token in enumerate(list(data)):
+                subtoken_ids = self.tokenizer.encode(
+                    token, add_special_tokens=False)
+                if len(subtoken_ids) == 0:
+                    subtoken_ids = [self.tokenizer.unk_token_id]
+                input_ids.extend(subtoken_ids)
+                label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
+                offset_mapping.extend([(offset, offset + 1)]
+                                      + [(offset + 1, offset + 1)]
+                                      * (len(subtoken_ids) - 1))
+            if len(input_ids) >= self.sequence_length - 2:
+                input_ids = input_ids[:self.sequence_length - 2]
+                label_mask = label_mask[:self.sequence_length - 2]
+                offset_mapping = offset_mapping[:self.sequence_length - 2]
+            input_ids = [self.tokenizer.cls_token_id
+                         ] + input_ids + [self.tokenizer.sep_token_id]
+            label_mask = [0] + label_mask + [0]
+            attention_mask = [1] * len(input_ids)
+        else:
+            encodings = self.tokenizer(
+                text,
+                add_special_tokens=True,
+                padding=True,
+                truncation=True,
+                max_length=self.sequence_length,
+                return_offsets_mapping=True)
+            input_ids = encodings['input_ids']
+            attention_mask = encodings['attention_mask']
+            word_ids = encodings.word_ids()
+            label_mask = []
+            offset_mapping = []
+            for i in range(len(word_ids)):
+                if word_ids[i] is None:
+                    label_mask.append(0)
+                elif word_ids[i] == word_ids[i - 1]:
+                    label_mask.append(0)
+                    offset_mapping[-1] = (offset_mapping[-1][0],
+                                          encodings['offset_mapping'][i][1])
+                else:
+                    label_mask.append(1)
+                    offset_mapping.append(encodings['offset_mapping'][i])
         return {
             'text': text,
             'input_ids': input_ids,
diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py
index 17708afe..21a62d80 100644
--- a/tests/pipelines/test_named_entity_recognition.py
+++ b/tests/pipelines/test_named_entity_recognition.py
@@ -12,7 +12,7 @@ from modelscope.utils.test_utils import test_level
 
 
 class NamedEntityRecognitionTest(unittest.TestCase):
-    model_id = 'damo/nlp_transformercrf_named-entity-recognition_chinese-base-news'
+    model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
     sentence = '这与温岭市新河镇的一个神秘的传说有关。'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -32,7 +32,7 @@ class NamedEntityRecognitionTest(unittest.TestCase):
         print()
         print(f'pipeline2: {pipeline2(input=self.sentence)}')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
         tokenizer = NERPreprocessor(model.model_dir)
@@ -42,7 +42,7 @@ class NamedEntityRecognitionTest(unittest.TestCase):
             preprocessor=tokenizer)
         print(pipeline_ins(input=self.sentence))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.named_entity_recognition, model=self.model_id)
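
Note for reviewers: the new is_split_into_words branch in NERPreprocessor encodes the input character by character, so that each label position maps back to an exact character span even when the tokenizer splits a character into several sub-tokens. Below is a minimal standalone sketch of that logic, not part of the patch; it assumes the transformers library is installed and uses 'bert-base-chinese' as a stand-in for the model's actual tokenizer.

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
    sequence_length = 512

    text = '这与温岭市新河镇的一个神秘的传说有关。'
    input_ids, label_mask, offset_mapping = [], [], []
    for offset, char in enumerate(text):
        # Encode each character on its own; fall back to [UNK] for
        # characters the vocabulary cannot represent.
        subtoken_ids = tokenizer.encode(char, add_special_tokens=False)
        if len(subtoken_ids) == 0:
            subtoken_ids = [tokenizer.unk_token_id]
        input_ids.extend(subtoken_ids)
        # Only the first sub-token of a character carries a label.
        label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
        # Trailing sub-tokens get a zero-width span so they never
        # contribute characters to a decoded entity.
        offset_mapping.extend([(offset, offset + 1)]
                              + [(offset + 1, offset + 1)]
                              * (len(subtoken_ids) - 1))

    # Truncate to leave room for [CLS]/[SEP], then add them.
    input_ids = input_ids[:sequence_length - 2]
    label_mask = label_mask[:sequence_length - 2]
    offset_mapping = offset_mapping[:sequence_length - 2]
    input_ids = ([tokenizer.cls_token_id] + input_ids
                 + [tokenizer.sep_token_id])
    label_mask = [0] + label_mask + [0]
    attention_mask = [1] * len(input_ids)

Positions where label_mask is 1 correspond one-to-one to characters of the original text via offset_mapping, which is what the postprocess change in named_entity_recognition_pipeline.py relies on when it converts the mapping back to plain Python lists.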