
[to #42322933] update ner default model & fix tokenizer bug

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9627744
master
pangda yingda.chen 3 years ago
commit 55dfa0a856
4 changed files with 52 additions and 28 deletions
  1. modelscope/pipelines/builder.py (+1 / -1)
  2. modelscope/pipelines/nlp/named_entity_recognition_pipeline.py (+1 / -1)
  3. modelscope/preprocessors/nlp.py (+47 / -23)
  4. tests/pipelines/test_named_entity_recognition.py (+3 / -3)

modelscope/pipelines/builder.py (+1 / -1)

@@ -22,7 +22,7 @@ DEFAULT_MODEL_FOR_PIPELINE = {
         'damo/nlp_structbert_word-segmentation_chinese-base'),
     Tasks.named_entity_recognition:
     (Pipelines.named_entity_recognition,
-     'damo/nlp_transformercrf_named-entity-recognition_chinese-base-news'),
+     'damo/nlp_raner_named-entity-recognition_chinese-base-news'),
     Tasks.sentence_similarity:
     (Pipelines.sentence_similarity,
     'damo/nlp_structbert_sentence-similarity_chinese-base'),
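
With this mapping updated, a NER pipeline built without an explicit model should now resolve to the raner checkpoint. A minimal usage sketch (the import paths are the usual modelscope ones and are not part of this diff):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# No model argument: the builder falls back to DEFAULT_MODEL_FOR_PIPELINE,
# which after this commit maps NER to the raner checkpoint above.
ner = pipeline(task=Tasks.named_entity_recognition)
print(ner(input='这与温岭市新河镇的一个神秘的传说有关。'))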


modelscope/pipelines/nlp/named_entity_recognition_pipeline.py (+1 / -1)

@@ -42,7 +42,7 @@ class NamedEntityRecognitionPipeline(Pipeline):
     def postprocess(self, inputs: Dict[str, Any],
                     **postprocess_params) -> Dict[str, str]:
         text = inputs['text']
-        offset_mapping = inputs['offset_mapping']
+        offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']]
         labels = [self.id2label[x] for x in inputs['predicts']]
         entities = []
         entity = {}
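
The offset spans apparently reach postprocess as torch tensors (e.g. after collation), so they are moved to CPU and converted to plain Python lists before being used to cut entity strings out of the text. An illustration of the conversion on dummy data (not repo code):

import torch

# One (start, end) tensor per kept token, as it might arrive from the forward pass.
offset_mapping = [torch.tensor([0, 1]), torch.tensor([1, 2]), torch.tensor([2, 3])]
offset_mapping = [x.cpu().tolist() for x in offset_mapping]
text = '温岭市'
print([text[start:end] for start, end in offset_mapping])  # ['温', '岭', '市']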


modelscope/preprocessors/nlp.py (+47 / -23)

@@ -483,6 +483,8 @@ class NERPreprocessor(Preprocessor):
         self.sequence_length = kwargs.pop('sequence_length', 512)
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_dir, use_fast=True)
+        self.is_split_into_words = self.tokenizer.init_kwargs.get(
+            'is_split_into_words', False)
 
     @type_assert(object, str)
     def __call__(self, data: str) -> Dict[str, Any]:
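
The flag is read from the tokenizer's saved init kwargs, i.e. whatever extra keys the checkpoint's tokenizer_config.json carries. A quick way to inspect it outside the preprocessor (the path below is a placeholder, not a real checkpoint):

from transformers import AutoTokenizer

# 'path/to/model_dir' stands in for a local checkpoint whose tokenizer_config.json
# may set "is_split_into_words": true.
tokenizer = AutoTokenizer.from_pretrained('path/to/model_dir', use_fast=True)
print(tokenizer.init_kwargs.get('is_split_into_words', False))
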
@@ -499,29 +501,51 @@
 
         # preprocess the data for the model input
         text = data
-        encodings = self.tokenizer(
-            text,
-            add_special_tokens=True,
-            padding=True,
-            truncation=True,
-            max_length=self.sequence_length,
-            return_offsets_mapping=True)
-        input_ids = encodings['input_ids']
-        attention_mask = encodings['attention_mask']
-        word_ids = encodings.word_ids()
-        label_mask = []
-        offset_mapping = []
-        for i in range(len(word_ids)):
-            if word_ids[i] is None:
-                label_mask.append(0)
-            elif word_ids[i] == word_ids[i - 1]:
-                label_mask.append(0)
-                offset_mapping[-1] = (offset_mapping[-1][0],
-                                      encodings['offset_mapping'][i][1])
-            else:
-                label_mask.append(1)
-                offset_mapping.append(encodings['offset_mapping'][i])
-
+        if self.is_split_into_words:
+            input_ids = []
+            label_mask = []
+            offset_mapping = []
+            for offset, token in enumerate(list(data)):
+                subtoken_ids = self.tokenizer.encode(
+                    token, add_special_tokens=False)
+                if len(subtoken_ids) == 0:
+                    subtoken_ids = [self.tokenizer.unk_token_id]
+                input_ids.extend(subtoken_ids)
+                label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
+                offset_mapping.extend([(offset, offset + 1)]
+                                      + [(offset + 1, offset + 1)]
+                                      * (len(subtoken_ids) - 1))
+            if len(input_ids) >= self.sequence_length - 2:
+                input_ids = input_ids[:self.sequence_length - 2]
+                label_mask = label_mask[:self.sequence_length - 2]
+                offset_mapping = offset_mapping[:self.sequence_length - 2]
+            input_ids = [self.tokenizer.cls_token_id
+                         ] + input_ids + [self.tokenizer.sep_token_id]
+            label_mask = [0] + label_mask + [0]
+            attention_mask = [1] * len(input_ids)
+        else:
+            encodings = self.tokenizer(
+                text,
+                add_special_tokens=True,
+                padding=True,
+                truncation=True,
+                max_length=self.sequence_length,
+                return_offsets_mapping=True)
+            input_ids = encodings['input_ids']
+            attention_mask = encodings['attention_mask']
+            word_ids = encodings.word_ids()
+            label_mask = []
+            offset_mapping = []
+            for i in range(len(word_ids)):
+                if word_ids[i] is None:
+                    label_mask.append(0)
+                elif word_ids[i] == word_ids[i - 1]:
+                    label_mask.append(0)
+                    offset_mapping[-1] = (offset_mapping[-1][0],
+                                          encodings['offset_mapping'][i][1])
+                else:
+                    label_mask.append(1)
+                    offset_mapping.append(encodings['offset_mapping'][i])
         return {
             'text': text,
             'input_ids': input_ids,
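
The new is_split_into_words branch tokenizes the input character by character, keeps a label_mask bit only for the first subtoken of each character, truncates to sequence_length - 2, and wraps the result in [CLS]/[SEP]. A standalone sketch of what that branch produces, using bert-base-chinese purely as a stand-in tokenizer (not the repo's model):

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
data = '温岭市'
input_ids, label_mask, offset_mapping = [], [], []
for offset, token in enumerate(list(data)):
    subtoken_ids = tokenizer.encode(token, add_special_tokens=False)
    if len(subtoken_ids) == 0:
        subtoken_ids = [tokenizer.unk_token_id]
    input_ids.extend(subtoken_ids)
    # only the first subtoken of a character carries a label
    label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
    offset_mapping.extend([(offset, offset + 1)]
                          + [(offset + 1, offset + 1)] * (len(subtoken_ids) - 1))
input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
label_mask = [0] + label_mask + [0]
print(label_mask)      # [0, 1, 1, 1, 0]
print(offset_mapping)  # [(0, 1), (1, 2), (2, 3)]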


tests/pipelines/test_named_entity_recognition.py (+3 / -3)

@@ -12,7 +12,7 @@ from modelscope.utils.test_utils import test_level
 
 
 class NamedEntityRecognitionTest(unittest.TestCase):
-    model_id = 'damo/nlp_transformercrf_named-entity-recognition_chinese-base-news'
+    model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
     sentence = '这与温岭市新河镇的一个神秘的传说有关。'
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -32,7 +32,7 @@ class NamedEntityRecognitionTest(unittest.TestCase):
         print()
         print(f'pipeline2: {pipeline2(input=self.sentence)}')
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
         tokenizer = NERPreprocessor(model.model_dir)
@@ -42,7 +42,7 @@ class NamedEntityRecognitionTest(unittest.TestCase):
             preprocessor=tokenizer)
         print(pipeline_ins(input=self.sentence))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.named_entity_recognition, model=self.model_id)

