
[to #42322933]feat: add nlp-chinese-bert-fill-mask-pipeline to maas_lib

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9155437
Branch: master
Author: suluyan.sly, 3 years ago
Commit: a7c1cd0fc9
4 changed files with 94 additions and 37 deletions:

  1. modelscope/models/nlp/masked_language_model.py (+31 -17)
  2. modelscope/pipelines/nlp/fill_mask_pipeline.py (+21 -15)
  3. modelscope/preprocessors/nlp.py (+7 -4)
  4. tests/pipelines/test_fill_mask.py (+35 -1)

modelscope/models/nlp/masked_language_model.py (+31 -17)

@@ -2,24 +2,28 @@ from typing import Any, Dict, Optional, Union
 
 import numpy as np
 
-from modelscope.metainfo import Models
-from modelscope.utils.constant import Tasks
+from ...metainfo import Models
+from ...utils.constant import Tasks
 from ..base import Model, Tensor
 from ..builder import MODELS
 
-__all__ = ['StructBertForMaskedLM', 'VecoForMaskedLM']
+__all__ = ['BertForMaskedLM', 'StructBertForMaskedLM', 'VecoForMaskedLM']
 
 
-class AliceMindBaseForMaskedLM(Model):
+class MaskedLanguageModelBase(Model):
 
     def __init__(self, model_dir: str, *args, **kwargs):
-        from sofa.utils.backend import AutoConfig, AutoModelForMaskedLM
         self.model_dir = model_dir
         super().__init__(model_dir, *args, **kwargs)
+        self.model = self.build_model()
 
-        self.config = AutoConfig.from_pretrained(model_dir)
-        self.model = AutoModelForMaskedLM.from_pretrained(
-            model_dir, config=self.config)
+    def build_model(self):
+        raise NotImplementedError()
+
+    @property
+    def config(self):
+        if hasattr(self.model, 'config'):
+            return self.model.config
+        return None
 
     def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, np.ndarray]:
         """return the result by the model
@@ -38,14 +42,24 @@ class AliceMindBaseForMaskedLM(Model):
 
 
 @MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert)
-class StructBertForMaskedLM(AliceMindBaseForMaskedLM):
-    # The StructBert for MaskedLM uses the same underlying model structure
-    # as the base model class.
-    pass
+class StructBertForMaskedLM(MaskedLanguageModelBase):
+
+    def build_model(self):
+        from sofa import SbertForMaskedLM
+        return SbertForMaskedLM.from_pretrained(self.model_dir)
 
 
 @MODELS.register_module(Tasks.fill_mask, module_name=Models.veco)
-class VecoForMaskedLM(AliceMindBaseForMaskedLM):
-    # The Veco for MaskedLM uses the same underlying model structure
-    # as the base model class.
-    pass
+class VecoForMaskedLM(MaskedLanguageModelBase):
+
+    def build_model(self):
+        from sofa import VecoForMaskedLM
+        return VecoForMaskedLM.from_pretrained(self.model_dir)
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=Models.bert)
+class BertForMaskedLM(MaskedLanguageModelBase):
+
+    def build_model(self):
+        from transformers import BertForMaskedLM
+        return BertForMaskedLM.from_pretrained(self.model_dir)

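The refactor above replaces the sofa-only AutoModel loading with a template-method hook: the base class owns __init__ and calls build_model(), which each registered subclass overrides with its own backbone loader. Below is a minimal, self-contained sketch of that pattern; MODELS and register_module here are hypothetical stand-ins for modelscope's real registry, and only the shape of the pattern matches the diff.

# Sketch of the template-method + registry pattern this commit introduces.
MODELS = {}  # hypothetical stand-in for modelscope's MODELS registry


def register_module(task, module_name):
    def decorator(cls):
        MODELS[(task, module_name)] = cls
        return cls
    return decorator


class MaskedLanguageModelBase:

    def __init__(self, model_dir: str):
        self.model_dir = model_dir
        self.model = self.build_model()  # subclasses supply the backbone

    def build_model(self):
        raise NotImplementedError()

    @property
    def config(self):
        # Mirror the diff: expose the backbone's config when it has one.
        return self.model.config if hasattr(self.model, 'config') else None


@register_module('fill-mask', 'bert')
class BertForMaskedLM(MaskedLanguageModelBase):

    def build_model(self):
        # The real implementation returns
        # transformers.BertForMaskedLM.from_pretrained(self.model_dir).
        return f'<bert backbone from {self.model_dir}>'


print(MODELS[('fill-mask', 'bert')]('some/model/dir').model)
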
modelscope/pipelines/nlp/fill_mask_pipeline.py (+21 -15)

@@ -1,32 +1,34 @@
+import os
 from typing import Dict, Optional, Union
 
-from modelscope.metainfo import Pipelines
-from modelscope.models import Model
-from modelscope.models.nlp.masked_language_model import \
-    AliceMindBaseForMaskedLM
-from modelscope.preprocessors import FillMaskPreprocessor
-from modelscope.utils.constant import Tasks
+from ...metainfo import Pipelines
+from ...models import Model
+from ...models.nlp.masked_language_model import MaskedLanguageModelBase
+from ...preprocessors import FillMaskPreprocessor
+from ...utils.config import Config
+from ...utils.constant import ModelFile, Tasks
 from ..base import Pipeline, Tensor
 from ..builder import PIPELINES
 
 __all__ = ['FillMaskPipeline']
+_type_map = {'veco': 'roberta', 'sbert': 'bert'}
 
 
 @PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask)
 class FillMaskPipeline(Pipeline):
 
     def __init__(self,
-                 model: Union[AliceMindBaseForMaskedLM, str],
+                 model: Union[MaskedLanguageModelBase, str],
                  preprocessor: Optional[FillMaskPreprocessor] = None,
                  **kwargs):
         """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction
 
         Args:
-            model (AliceMindBaseForMaskedLM): a model instance
+            model (MaskedLanguageModelBase): a model instance
             preprocessor (FillMaskPreprocessor): a preprocessor instance
         """
         fill_mask_model = model if isinstance(
-            model, AliceMindBaseForMaskedLM) else Model.from_pretrained(model)
+            model, MaskedLanguageModelBase) else Model.from_pretrained(model)
         if preprocessor is None:
             preprocessor = FillMaskPreprocessor(
                 fill_mask_model.model_dir,
@@ -34,11 +36,13 @@ class FillMaskPipeline(Pipeline):
             second_sequence=None)
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
         self.preprocessor = preprocessor
+        self.config = Config.from_file(
+            os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION))
         self.tokenizer = preprocessor.tokenizer
-        self.mask_id = {'veco': 250001, 'sbert': 103}
+        self.mask_id = {'roberta': 250001, 'bert': 103}
 
         self.rep_map = {
-            'sbert': {
+            'bert': {
                 '[unused0]': '',
                 '[PAD]': '',
                 '[unused1]': '',
@@ -48,7 +52,7 @@ class FillMaskPipeline(Pipeline):
                 '[CLS]': '',
                 '[UNK]': ''
             },
-            'veco': {
+            'roberta': {
                 r' +': ' ',
                 '<mask>': '<q>',
                 '<pad>': '',
@@ -72,7 +76,9 @@ class FillMaskPipeline(Pipeline):
         input_ids = inputs['input_ids'].detach().numpy()
         pred_ids = np.argmax(logits, axis=-1)
         model_type = self.model.config.model_type
-        rst_ids = np.where(input_ids == self.mask_id[model_type], pred_ids,
+        process_type = model_type if model_type in self.mask_id else _type_map[
+            model_type]
+        rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids,
                            input_ids)
 
         def rep_tokens(string, rep_map):
@@ -82,12 +88,12 @@ class FillMaskPipeline(Pipeline):
 
         pred_strings = []
         for ids in rst_ids:  # batch
-            if self.model.config.vocab_size == 21128:  # zh bert
+            if 'language' in self.config.model and self.config.model.language == 'zh':
                 pred_string = self.tokenizer.convert_ids_to_tokens(ids)
                 pred_string = ''.join(pred_string)
             else:
                 pred_string = self.tokenizer.decode(ids)
-            pred_string = rep_tokens(pred_string, self.rep_map[model_type])
+            pred_string = rep_tokens(pred_string, self.rep_map[process_type])
             pred_strings.append(pred_string)
 
         return {'text': pred_strings}

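The pipeline now keys mask ids and replacement maps by tokenizer family ('bert'/'roberta') instead of model name, with _type_map translating model_type values that are not already keys. A toy numpy walk-through of the postprocess step, using the same mask ids as the diff and hand-written tensors in place of real model output:

import numpy as np

_type_map = {'veco': 'roberta', 'sbert': 'bert'}
mask_id = {'roberta': 250001, 'bert': 103}

# 'bert' is already a key in mask_id; 'veco' falls through _type_map.
model_type = 'veco'
process_type = model_type if model_type in mask_id else _type_map[model_type]
assert process_type == 'roberta'

# One batch row: positions equal to the mask id take the model's argmax
# prediction; every other position keeps the original input token.
input_ids = np.array([[0, 7, 250001, 9, 2]])
pred_ids = np.array([[0, 7, 4242, 9, 2]])  # hypothetical argmax output
rst_ids = np.where(input_ids == mask_id[process_type], pred_ids, input_ids)
print(rst_ids)  # [[   0    7 4242    9    2]]
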
modelscope/preprocessors/nlp.py (+7 -4)

@@ -192,14 +192,17 @@ class FillMaskPreprocessor(Preprocessor):
             model_dir (str): model path
         """
         super().__init__(*args, **kwargs)
-        from sofa.utils.backend import AutoTokenizer
         self.model_dir = model_dir
         self.first_sequence: str = kwargs.pop('first_sequence',
                                               'first_sequence')
         self.sequence_length = kwargs.pop('sequence_length', 128)
 
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            model_dir, use_fast=False)
+        try:
+            from transformers import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
+        except KeyError:
+            from sofa.utils.backend import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_dir, use_fast=False)
 
     @type_assert(object, str)
     def __call__(self, data: str) -> Dict[str, Any]:


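The preprocessor change is a tokenizer fallback: try the stock transformers AutoTokenizer first, and only fall back to the in-house sofa backend when transformers cannot resolve the model type (which surfaces as a KeyError from its config mapping). A standalone sketch of the same pattern, assuming sofa is importable only in environments that ship it:

def load_tokenizer(model_dir: str):
    try:
        from transformers import AutoTokenizer
        return AutoTokenizer.from_pretrained(model_dir)
    except KeyError:
        # Model types unknown to transformers (e.g. the sbert/veco variants
        # shipped via sofa) are handled by the in-house backend.
        from sofa.utils.backend import AutoTokenizer
        return AutoTokenizer.from_pretrained(model_dir, use_fast=False)
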
tests/pipelines/test_fill_mask.py (+35 -1)

@@ -3,7 +3,8 @@ import unittest
 
 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import StructBertForMaskedLM, VecoForMaskedLM
+from modelscope.models.nlp import (BertForMaskedLM, StructBertForMaskedLM,
+                                   VecoForMaskedLM)
 from modelscope.pipelines import FillMaskPipeline, pipeline
 from modelscope.preprocessors import FillMaskPreprocessor
 from modelscope.utils.constant import Tasks
@@ -16,6 +17,7 @@ class FillMaskTest(unittest.TestCase):
         'en': 'damo/nlp_structbert_fill-mask_english-large'
     }
     model_id_veco = 'damo/nlp_veco_fill-mask-large'
+    model_id_bert = 'damo/nlp_bert_fill-mask_chinese-base'
 
     ori_texts = {
         'zh':
@@ -69,6 +71,20 @@ class FillMaskTest(unittest.TestCase):
             f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n'
         )
 
+        # zh bert
+        language = 'zh'
+        model_dir = snapshot_download(self.model_id_bert)
+        preprocessor = FillMaskPreprocessor(
+            model_dir, first_sequence='sentence', second_sequence=None)
+        model = BertForMaskedLM(model_dir)
+        pipeline1 = FillMaskPipeline(model, preprocessor)
+        pipeline2 = pipeline(
+            Tasks.fill_mask, model=model, preprocessor=preprocessor)
+        ori_text = self.ori_texts[language]
+        test_input = self.test_inputs[language]
+        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: '
+              f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n')
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         # sbert
@@ -97,6 +113,18 @@ class FillMaskTest(unittest.TestCase):
         print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
               f'{pipeline_ins(test_input)}\n')
 
+        # zh bert
+        model = Model.from_pretrained(self.model_id_bert)
+        preprocessor = FillMaskPreprocessor(
+            model.model_dir, first_sequence='sentence', second_sequence=None)
+        pipeline_ins = pipeline(
+            Tasks.fill_mask, model=model, preprocessor=preprocessor)
+        language = 'zh'
+        ori_text = self.ori_texts[language]
+        test_input = self.test_inputs[language]
+        print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: '
+              f'{pipeline_ins(test_input)}\n')
+
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_with_model_name(self):
         # veco
@@ -115,6 +143,12 @@ class FillMaskTest(unittest.TestCase):
             f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
             f'{pipeline_ins(self.test_inputs[language])}\n')
 
+        # bert
+        pipeline_ins = pipeline(task=Tasks.fill_mask, model=self.model_id_bert)
+        print(
+            f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: '
+            f'{pipeline_ins(self.test_inputs[language])}\n')
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
         pipeline_ins = pipeline(task=Tasks.fill_mask)


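With this commit in place, the shortest way to exercise the new Chinese BERT fill-mask model mirrors test_run_with_model_name above; the pipeline returns a dict of the form {'text': [...]}. The masked sentence below is a hypothetical input, since the test's actual ori_texts/test_inputs are not shown in this diff:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

pipe = pipeline(task=Tasks.fill_mask,
                model='damo/nlp_bert_fill-mask_chinese-base')
print(pipe('生活的真谛是[MASK]。'))  # hypothetical masked input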