From 55dfa0a8568913fb18b3eed5861391b075e71a58 Mon Sep 17 00:00:00 2001
From: pangda
Date: Thu, 4 Aug 2022 16:11:22 +0800
Subject: [PATCH] [to #42322933] update ner default model & fix tokenizer bug

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9627744
---
 modelscope/pipelines/builder.py               |  2 +-
 .../nlp/named_entity_recognition_pipeline.py  |  2 +-
 modelscope/preprocessors/nlp.py               | 70 +++++++++++++------
 .../test_named_entity_recognition.py          |  6 +-
 4 files changed, 52 insertions(+), 28 deletions(-)

diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 21bdd36c..28dd190a 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -22,7 +22,7 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/nlp_structbert_word-segmentation_chinese-base'),
     Tasks.named_entity_recognition:
     (Pipelines.named_entity_recognition,
-     'damo/nlp_transformercrf_named-entity-recognition_chinese-base-news'),
+     'damo/nlp_raner_named-entity-recognition_chinese-base-news'),
     Tasks.sentence_similarity:
     (Pipelines.sentence_similarity,
      'damo/nlp_structbert_sentence-similarity_chinese-base'),
diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
index 29c439fc..663d59a4 100644
--- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
+++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
@@ -42,7 +42,7 @@ class NamedEntityRecognitionPipeline(Pipeline):
     def postprocess(self, inputs: Dict[str, Any],
                     **postprocess_params) -> Dict[str, str]:
         text = inputs['text']
-        offset_mapping = inputs['offset_mapping']
+        offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']]
         labels = [self.id2label[x] for x in inputs['predicts']]
         entities = []
         entity = {}
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 58ad3dbe..5bd60dce 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -483,6 +483,8 @@ class NERPreprocessor(Preprocessor):
         self.sequence_length = kwargs.pop('sequence_length', 512)
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_dir, use_fast=True)
+        self.is_split_into_words = self.tokenizer.init_kwargs.get(
+            'is_split_into_words', False)
 
     @type_assert(object, str)
     def __call__(self, data: str) -> Dict[str, Any]:
@@ -499,29 +501,51 @@ class NERPreprocessor(Preprocessor):
 
         # preprocess the data for the model input
         text = data
-        encodings = self.tokenizer(
-            text,
-            add_special_tokens=True,
-            padding=True,
-            truncation=True,
-            max_length=self.sequence_length,
-            return_offsets_mapping=True)
-        input_ids = encodings['input_ids']
-        attention_mask = encodings['attention_mask']
-        word_ids = encodings.word_ids()
-        label_mask = []
-        offset_mapping = []
-        for i in range(len(word_ids)):
-            if word_ids[i] is None:
-                label_mask.append(0)
-            elif word_ids[i] == word_ids[i - 1]:
-                label_mask.append(0)
-                offset_mapping[-1] = (offset_mapping[-1][0],
-                                      encodings['offset_mapping'][i][1])
-            else:
-                label_mask.append(1)
-                offset_mapping.append(encodings['offset_mapping'][i])
-
+        if self.is_split_into_words:
+            input_ids = []
+            label_mask = []
+            offset_mapping = []
+            for offset, token in enumerate(list(data)):
+                subtoken_ids = self.tokenizer.encode(
+                    token, add_special_tokens=False)
+                if len(subtoken_ids) == 0:
+                    subtoken_ids = [self.tokenizer.unk_token_id]
+                input_ids.extend(subtoken_ids)
+                label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
+                offset_mapping.extend([(offset, offset + 1)]
+                                      + [(offset + 1, offset + 1)]
+                                      * (len(subtoken_ids) - 1))
+            if len(input_ids) >= self.sequence_length - 2:
+                input_ids = input_ids[:self.sequence_length - 2]
+                label_mask = label_mask[:self.sequence_length - 2]
+                offset_mapping = offset_mapping[:self.sequence_length - 2]
+            input_ids = [self.tokenizer.cls_token_id
+                         ] + input_ids + [self.tokenizer.sep_token_id]
+            label_mask = [0] + label_mask + [0]
+            attention_mask = [1] * len(input_ids)
+        else:
+            encodings = self.tokenizer(
+                text,
+                add_special_tokens=True,
+                padding=True,
+                truncation=True,
+                max_length=self.sequence_length,
+                return_offsets_mapping=True)
+            input_ids = encodings['input_ids']
+            attention_mask = encodings['attention_mask']
+            word_ids = encodings.word_ids()
+            label_mask = []
+            offset_mapping = []
+            for i in range(len(word_ids)):
+                if word_ids[i] is None:
+                    label_mask.append(0)
+                elif word_ids[i] == word_ids[i - 1]:
+                    label_mask.append(0)
+                    offset_mapping[-1] = (offset_mapping[-1][0],
+                                          encodings['offset_mapping'][i][1])
+                else:
+                    label_mask.append(1)
+                    offset_mapping.append(encodings['offset_mapping'][i])
         return {
             'text': text,
             'input_ids': input_ids,
diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py
index 17708afe..21a62d80 100644
--- a/tests/pipelines/test_named_entity_recognition.py
+++ b/tests/pipelines/test_named_entity_recognition.py
@@ -12,7 +12,7 @@ from modelscope.utils.test_utils import test_level
 
 
 class NamedEntityRecognitionTest(unittest.TestCase):
-    model_id = 'damo/nlp_transformercrf_named-entity-recognition_chinese-base-news'
+    model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
     sentence = '这与温岭市新河镇的一个神秘的传说有关。'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -32,7 +32,7 @@ class NamedEntityRecognitionTest(unittest.TestCase):
         print()
         print(f'pipeline2: {pipeline2(input=self.sentence)}')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
         tokenizer = NERPreprocessor(model.model_dir)
@@ -42,7 +42,7 @@ class NamedEntityRecognitionTest(unittest.TestCase):
             preprocessor=tokenizer)
         print(pipeline_ins(input=self.sentence))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.named_entity_recognition, model=self.model_id)
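
Note for reviewers: the new is_split_into_words branch in NERPreprocessor encodes the input character by character, so that each label position maps back to an exact character span even when the tokenizer splits a character into several sub-tokens. Below is a minimal standalone sketch of that logic, not part of the patch; it assumes the transformers library is installed and uses 'bert-base-chinese' as a stand-in for the model's actual tokenizer.

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
    sequence_length = 512

    text = '这与温岭市新河镇的一个神秘的传说有关。'
    input_ids, label_mask, offset_mapping = [], [], []
    for offset, char in enumerate(text):
        # Encode each character on its own; fall back to [UNK] for
        # characters the vocabulary cannot represent.
        subtoken_ids = tokenizer.encode(char, add_special_tokens=False)
        if len(subtoken_ids) == 0:
            subtoken_ids = [tokenizer.unk_token_id]
        input_ids.extend(subtoken_ids)
        # Only the first sub-token of a character carries a label.
        label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
        # Trailing sub-tokens get a zero-width span so they never
        # contribute characters to a decoded entity.
        offset_mapping.extend([(offset, offset + 1)]
                              + [(offset + 1, offset + 1)]
                              * (len(subtoken_ids) - 1))

    # Truncate to leave room for [CLS]/[SEP], then add them.
    input_ids = input_ids[:sequence_length - 2]
    label_mask = label_mask[:sequence_length - 2]
    offset_mapping = offset_mapping[:sequence_length - 2]
    input_ids = ([tokenizer.cls_token_id] + input_ids
                 + [tokenizer.sep_token_id])
    label_mask = [0] + label_mask + [0]
    attention_mask = [1] * len(input_ids)

Positions where label_mask is 1 correspond one-to-one to characters of the original text via offset_mapping, which is what the postprocess change in named_entity_recognition_pipeline.py relies on when it converts the mapping back to plain Python lists.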