Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9627744 (branch: master)
@@ -22,7 +22,7 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/nlp_structbert_word-segmentation_chinese-base'),
     Tasks.named_entity_recognition:
     (Pipelines.named_entity_recognition,
-     'damo/nlp_transformercrf_named-entity-recognition_chinese-base-news'),
+     'damo/nlp_raner_named-entity-recognition_chinese-base-news'),
     Tasks.sentence_similarity:
     (Pipelines.sentence_similarity,
      'damo/nlp_structbert_sentence-similarity_chinese-base'),
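This hunk swaps the default checkpoint for Tasks.named_entity_recognition from the TransformerCRF model to RANER. The change only affects callers that omit the model argument; a minimal sketch, assuming the usual modelscope entry points:

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # With no `model=` argument, the task key is looked up in
    # DEFAULT_MODEL_FOR_PIPELINE and now resolves to the RANER checkpoint.
    ner = pipeline(task=Tasks.named_entity_recognition)
    print(ner(input='这与温岭市新河镇的一个神秘的传说有关。'))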
@@ -42,7 +42,7 @@ class NamedEntityRecognitionPipeline(Pipeline):
     def postprocess(self, inputs: Dict[str, Any],
                     **postprocess_params) -> Dict[str, str]:
         text = inputs['text']
-        offset_mapping = inputs['offset_mapping']
+        offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']]
         labels = [self.id2label[x] for x in inputs['predicts']]
         entities = []
         entity = {}
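The one-line postprocess fix handles offset_mapping arriving as a sequence of torch tensors (as it does after default collation of the preprocessor output) rather than plain (start, end) pairs; moving them to CPU lists yields plain ints that can slice the original string. A hypothetical illustration:

    import torch

    # One tensor per kept token, as collated into the model inputs.
    offset_mapping = [torch.tensor([0, 1]), torch.tensor([1, 3])]
    spans = [x.cpu().tolist() for x in offset_mapping]  # [[0, 1], [1, 3]]

    text = '温岭市'
    start, end = spans[1]
    print(text[start:end])  # '岭市': plain ints index the source text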
@@ -483,6 +483,8 @@ class NERPreprocessor(Preprocessor):
         self.sequence_length = kwargs.pop('sequence_length', 512)
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_dir, use_fast=True)
+        self.is_split_into_words = self.tokenizer.init_kwargs.get(
+            'is_split_into_words', False)

     @type_assert(object, str)
     def __call__(self, data: str) -> Dict[str, Any]:
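The new attribute pulls a per-checkpoint flag out of the tokenizer configuration: extra keys in tokenizer_config.json are preserved on the loaded HuggingFace tokenizer as init_kwargs, so a model can opt into the character-level path added below without any code change. A sketch, with 'path/to/model_dir' as a stand-in for a real checkpoint directory:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('path/to/model_dir', use_fast=True)
    # Unrecognized keys from tokenizer_config.json land in init_kwargs;
    # checkpoints without the flag keep the old behaviour (False).
    is_split_into_words = tokenizer.init_kwargs.get('is_split_into_words', False)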
@@ -499,29 +501,51 @@ class NERPreprocessor(Preprocessor):
         # preprocess the data for the model input
         text = data
-        encodings = self.tokenizer(
-            text,
-            add_special_tokens=True,
-            padding=True,
-            truncation=True,
-            max_length=self.sequence_length,
-            return_offsets_mapping=True)
-        input_ids = encodings['input_ids']
-        attention_mask = encodings['attention_mask']
-        word_ids = encodings.word_ids()
-        label_mask = []
-        offset_mapping = []
-        for i in range(len(word_ids)):
-            if word_ids[i] is None:
-                label_mask.append(0)
-            elif word_ids[i] == word_ids[i - 1]:
-                label_mask.append(0)
-                offset_mapping[-1] = (offset_mapping[-1][0],
-                                      encodings['offset_mapping'][i][1])
-            else:
-                label_mask.append(1)
-                offset_mapping.append(encodings['offset_mapping'][i])
+        if self.is_split_into_words:
+            input_ids = []
+            label_mask = []
+            offset_mapping = []
+            for offset, token in enumerate(list(data)):
+                subtoken_ids = self.tokenizer.encode(
+                    token, add_special_tokens=False)
+                if len(subtoken_ids) == 0:
+                    subtoken_ids = [self.tokenizer.unk_token_id]
+                input_ids.extend(subtoken_ids)
+                label_mask.extend([1] + [0] * (len(subtoken_ids) - 1))
+                offset_mapping.extend([(offset, offset + 1)]
+                                      + [(offset + 1, offset + 1)]
+                                      * (len(subtoken_ids) - 1))
+            if len(input_ids) >= self.sequence_length - 2:
+                input_ids = input_ids[:self.sequence_length - 2]
+                label_mask = label_mask[:self.sequence_length - 2]
+                offset_mapping = offset_mapping[:self.sequence_length - 2]
+            input_ids = [self.tokenizer.cls_token_id
+                         ] + input_ids + [self.tokenizer.sep_token_id]
+            label_mask = [0] + label_mask + [0]
+            attention_mask = [1] * len(input_ids)
+        else:
+            encodings = self.tokenizer(
+                text,
+                add_special_tokens=True,
+                padding=True,
+                truncation=True,
+                max_length=self.sequence_length,
+                return_offsets_mapping=True)
+            input_ids = encodings['input_ids']
+            attention_mask = encodings['attention_mask']
+            word_ids = encodings.word_ids()
+            label_mask = []
+            offset_mapping = []
+            for i in range(len(word_ids)):
+                if word_ids[i] is None:
+                    label_mask.append(0)
+                elif word_ids[i] == word_ids[i - 1]:
+                    label_mask.append(0)
+                    offset_mapping[-1] = (offset_mapping[-1][0],
+                                          encodings['offset_mapping'][i][1])
+                else:
+                    label_mask.append(1)
+                    offset_mapping.append(encodings['offset_mapping'][i])
         return {
             'text': text,
             'input_ids': input_ids,
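The rewrite keeps the original fast-tokenizer path intact in the else branch (subword offsets merged via word_ids()) and adds a character-level path for checkpoints flagged with is_split_into_words: each character is encoded on its own, only its first subtoken is labelled in label_mask, continuation subtokens get a zero-width offset so postprocess skips them, and the sequence is truncated to sequence_length - 2 before [CLS]/[SEP] are attached with label_mask 0. The len(subtoken_ids) == 0 guard matters because whitespace-like characters encode to no subtokens at all; a hypothetical trace with a BERT-style tokenizer:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese', use_fast=True)
    for offset, ch in enumerate(list('温 岭')):
        ids = tokenizer.encode(ch, add_special_tokens=False)
        if len(ids) == 0:
            # Whitespace yields no subtokens; substituting [UNK] keeps
            # input_ids aligned one-to-one with label_mask/offset_mapping.
            ids = [tokenizer.unk_token_id]
        print(offset, repr(ch), ids)
    # e.g. 0 '温' [...], 1 ' ' [100], 2 '岭' [...]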
@@ -12,7 +12,7 @@ from modelscope.utils.test_utils import test_level
 class NamedEntityRecognitionTest(unittest.TestCase):
-    model_id = 'damo/nlp_transformercrf_named-entity-recognition_chinese-base-news'
+    model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
     sentence = '这与温岭市新河镇的一个神秘的传说有关。'

     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@@ -32,7 +32,7 @@ class NamedEntityRecognitionTest(unittest.TestCase):
         print()
         print(f'pipeline2: {pipeline2(input=self.sentence)}')

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
         tokenizer = NERPreprocessor(model.model_dir)
@@ -42,7 +42,7 @@ class NamedEntityRecognitionTest(unittest.TestCase):
             preprocessor=tokenizer)
         print(pipeline_ins(input=self.sentence))

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_model_name(self):
         pipeline_ins = pipeline(
             task=Tasks.named_entity_recognition, model=self.model_id)
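The last two hunks lower the skip threshold from test level 2 to 1, so both modelhub tests now run in the broader (level >= 1) suites instead of only the full one; test_level() in modelscope's test utilities reads the configured level (typically from an environment variable). The gating pattern, as a small sketch:

    import unittest
    from modelscope.utils.test_utils import test_level

    class Example(unittest.TestCase):

        @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
        def test_included_from_level_one(self):
            self.assertTrue(True)  # executes whenever the level is >= 1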