Browse Source

unfinished

master
雨泓 3 years ago
parent
commit
31c774936b
12 changed files with 100 additions and 91 deletions
  1. +8
    -4
      modelscope/metainfo.py
  2. +6
    -0
      modelscope/models/nlp/masked_language_model.py
  3. +1
    -1
      modelscope/models/nlp/sbert_for_nli.py
  4. +3
    -3
      modelscope/models/nlp/sbert_for_token_classification.py
  5. +16
    -12
      modelscope/pipelines/nlp/fill_mask_pipeline.py
  6. +16
    -20
      modelscope/pipelines/nlp/nli_pipeline.py
  7. +5
    -3
      modelscope/pipelines/nlp/sentence_similarity_pipeline.py
  8. +11
    -16
      modelscope/pipelines/nlp/sentiment_classification_pipeline.py
  9. +6
    -6
      modelscope/pipelines/nlp/text_generation_pipeline.py
  10. +9
    -8
      modelscope/pipelines/nlp/word_segmentation_pipeline.py
  11. +7
    -6
      modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
  12. +12
    -12
      modelscope/preprocessors/nlp.py

+ 8
- 4
modelscope/metainfo.py View File

@@ -46,6 +46,10 @@ class Pipelines(object):
word_segmentation = 'word-segmentation'
text_generation = 'text-generation'
sentiment_analysis = 'sentiment-analysis'
sentiment_classification = "sentiment-classification"
zero_shot_classification = "zero-shot-classification"
fill_mask = "fill-mask"
nli = "nli"

# audio tasks
sambert_hifigan_16k_tts = 'sambert-hifigan-16k-tts'
@@ -85,10 +89,10 @@ class Preprocessors(object):
# nlp preprocessor
bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
palm_text_gen_tokenizer = 'palm-text-gen-tokenizer'
sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer'
sbert_nli_tokenizer = 'sbert-nli-tokenizer'
sbert_sen_cls_tokenizer = 'sbert-sen-cls-tokenizer'
sbert_zero_shot_cls_tokenizer = 'sbert-zero-shot-cls-tokenizer'
token_cls_tokenizer = 'token-cls-tokenizer'
nli_tokenizer = 'nli-tokenizer'
sen_cls_tokenizer = 'sen-cls-tokenizer'
zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer'

# audio preprocessor
linear_aec_fbank = 'linear-aec-fbank'


+ 6
- 0
modelscope/models/nlp/masked_language_model.py View File

@@ -19,6 +19,12 @@ class MaskedLMModelBase(Model):
def build_model(self):
raise NotImplementedError()

@property
def config(self):
    """Return the wrapped model's ``config`` attribute, or ``None``.

    Returns:
        ``self.model.config`` when ``self.model`` exposes a ``config``
        attribute, otherwise ``None``.

    Raises:
        AttributeError: if ``self.model`` itself is not set; no default is
            supplied for that lookup.
    """
    # One lookup with a default replaces hasattr() plus a second access.
    return getattr(self.model, "config", None)

def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, np.ndarray]:
"""return the result by the model



+ 1
- 1
modelscope/models/nlp/sbert_for_nli.py View File

@@ -1,4 +1,4 @@
from modelscope.utils.constant import Tasks
from ...utils.constant import Tasks
from .sbert_for_sequence_classification import SbertForSequenceClassificationBase
from ..builder import MODELS
from ...metainfo import Models


+ 3
- 3
modelscope/models/nlp/sbert_for_token_classification.py View File

@@ -2,18 +2,17 @@ from typing import Any, Dict, Union

import numpy as np
import torch
from sofa import SbertConfig, SbertForTokenClassification

from modelscope.metainfo import Models
from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['StructBertForTokenClassification']
__all__ = ['SbertForTokenClassification']


@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert)
class StructBertForTokenClassification(Model):
class SbertForTokenClassification(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the word segmentation model from the `model_dir` path.
@@ -25,6 +24,7 @@ class StructBertForTokenClassification(Model):
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir
from sofa import SbertConfig, SbertForTokenClassification
self.model = SbertForTokenClassification.from_pretrained(
self.model_dir)
self.config = SbertConfig.from_pretrained(self.model_dir)


+ 16
- 12
modelscope/pipelines/nlp/fill_mask_pipeline.py View File

@@ -1,38 +1,41 @@
from typing import Dict, Optional, Union

from modelscope.models import Model
from modelscope.models.nlp.masked_language_model import \
AliceMindBaseForMaskedLM
from modelscope.preprocessors import FillMaskPreprocessor
from modelscope.utils.constant import Tasks
from ...models import Model
from ...models.nlp.masked_language_model import \
MaskedLMModelBase
from ...preprocessors import FillMaskPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES
from ...metainfo import Pipelines

__all__ = ['FillMaskPipeline']


@PIPELINES.register_module(Tasks.fill_mask, module_name=r'sbert')
@PIPELINES.register_module(Tasks.fill_mask, module_name=r'veco')
@PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask)
class FillMaskPipeline(Pipeline):

def __init__(self,
model: Union[AliceMindBaseForMaskedLM, str],
model: Union[MaskedLMModelBase, str],
preprocessor: Optional[FillMaskPreprocessor] = None,
first_sequence="sentense",
**kwargs):
"""use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction

Args:
model (AliceMindBaseForMaskedLM): a model instance
model (MaskedLMModelBase): a model instance
preprocessor (FillMaskPreprocessor): a preprocessor instance
"""
fill_mask_model = model if isinstance(
model, AliceMindBaseForMaskedLM) else Model.from_pretrained(model)
model, MaskedLMModelBase) else Model.from_pretrained(model)
assert fill_mask_model.config is not None

if preprocessor is None:
preprocessor = FillMaskPreprocessor(
fill_mask_model.model_dir,
first_sequence='sentence',
first_sequence=first_sequence,
second_sequence=None)
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
super().__init__(model=fill_mask_model, preprocessor=preprocessor, **kwargs)
self.preprocessor = preprocessor
self.tokenizer = preprocessor.tokenizer
self.mask_id = {'veco': 250001, 'sbert': 103}
@@ -82,6 +85,7 @@ class FillMaskPipeline(Pipeline):

pred_strings = []
for ids in rst_ids: # batch
# TODO vocab size is not stable
if self.model.config.vocab_size == 21128: # zh bert
pred_string = self.tokenizer.convert_ids_to_tokens(ids)
pred_string = ''.join(pred_string)


+ 16
- 20
modelscope/pipelines/nlp/nli_pipeline.py View File

@@ -1,27 +1,31 @@
import os
import uuid
from typing import Any, Dict, Union

import json
import uuid
from typing import Any, Dict, Union

import numpy as np

from modelscope.models.nlp import SbertForNLI
from modelscope.preprocessors import NLIPreprocessor
from modelscope.utils.constant import Tasks
from ...models import Model
from ..base import Input, Pipeline
from ..base import Pipeline
from ..builder import PIPELINES
from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import SbertForNLI
from ...preprocessors import NLIPreprocessor
from ...utils.constant import Tasks

__all__ = ['NLIPipeline']


@PIPELINES.register_module(
Tasks.nli, module_name=r'nlp_structbert_nli_chinese-base')
Tasks.nli, module_name=Pipelines.nli)
class NLIPipeline(Pipeline):

def __init__(self,
model: Union[SbertForNLI, str],
preprocessor: NLIPreprocessor = None,
first_sequence="first_sequence",
second_sequence="second_sequence",
**kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

@@ -36,20 +40,12 @@ class NLIPipeline(Pipeline):
if preprocessor is None:
preprocessor = NLIPreprocessor(
sc_model.model_dir,
first_sequence='first_sequence',
second_sequence='second_sequence')
first_sequence=first_sequence,
second_sequence=second_sequence)
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
assert len(sc_model.id2label) > 0

self.label_path = os.path.join(sc_model.model_dir,
'label_mapping.json')
with open(self.label_path) as f:
self.label_mapping = json.load(f)
self.label_id_to_name = {
idx: name
for name, idx in self.label_mapping.items()
}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
"""process the prediction results

Args:


+ 5
- 3
modelscope/pipelines/nlp/sentence_similarity_pipeline.py View File

@@ -20,6 +20,8 @@ class SentenceSimilarityPipeline(Pipeline):
def __init__(self,
model: Union[Model, str],
preprocessor: SequenceClassificationPreprocessor = None,
first_sequence="first_sequence",
second_sequence="second_sequence",
**kwargs):
"""use `model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction

@@ -35,14 +37,14 @@ class SentenceSimilarityPipeline(Pipeline):
if preprocessor is None:
preprocessor = SequenceClassificationPreprocessor(
sc_model.model_dir,
first_sequence='first_sequence',
second_sequence='second_sequence')
first_sequence=first_sequence,
second_sequence=second_sequence)
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

assert hasattr(self.model, 'id2label'), \
'id2label map should be initalizaed in init function.'

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
"""process the prediction results

Args:


+ 11
- 16
modelscope/pipelines/nlp/sentiment_classification_pipeline.py View File

@@ -5,24 +5,27 @@ from typing import Any, Dict, Union
import json
import numpy as np

from modelscope.models.nlp import SbertForSentimentClassification
from modelscope.preprocessors import SentimentClassificationPreprocessor
from modelscope.utils.constant import Tasks
from ...models.nlp import SbertForSentimentClassification
from ...preprocessors import SentimentClassificationPreprocessor
from ...utils.constant import Tasks
from ...models import Model
from ..base import Input, Pipeline
from ..builder import PIPELINES
from ...metainfo import Pipelines

__all__ = ['SentimentClassificationPipeline']


@PIPELINES.register_module(
Tasks.sentiment_classification,
module_name=r'sbert-sentiment-classification')
module_name=Pipelines.sentiment_classification)
class SentimentClassificationPipeline(Pipeline):

def __init__(self,
model: Union[SbertForSentimentClassification, str],
preprocessor: SentimentClassificationPreprocessor = None,
first_sequence="first_sequence",
second_sequence="second_sequence",
**kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

@@ -38,20 +41,12 @@ class SentimentClassificationPipeline(Pipeline):
if preprocessor is None:
preprocessor = SentimentClassificationPreprocessor(
sc_model.model_dir,
first_sequence='first_sequence',
second_sequence='second_sequence')
first_sequence=first_sequence,
second_sequence=second_sequence)
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
assert len(sc_model.id2label) > 0

self.label_path = os.path.join(sc_model.model_dir,
'label_mapping.json')
with open(self.label_path) as f:
self.label_mapping = json.load(f)
self.label_id_to_name = {
idx: name
for name, idx in self.label_mapping.items()
}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
"""process the prediction results

Args:


+ 6
- 6
modelscope/pipelines/nlp/text_generation_pipeline.py View File

@@ -1,10 +1,10 @@
from typing import Dict, Optional, Union

from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.models.nlp import PalmForTextGeneration
from modelscope.preprocessors import TextGenerationPreprocessor
from modelscope.utils.constant import Tasks
from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import PalmForTextGeneration
from ...preprocessors import TextGenerationPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

@@ -36,7 +36,7 @@ class TextGenerationPipeline(Pipeline):
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = model.tokenizer

def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
def postprocess(self, inputs: Dict[str, Tensor], **postprocess_params) -> Dict[str, str]:
"""process the prediction results

Args:


+ 9
- 8
modelscope/pipelines/nlp/word_segmentation_pipeline.py View File

@@ -1,10 +1,10 @@
from typing import Any, Dict, Optional, Union

from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import SbertForTokenClassification
from ...preprocessors import TokenClassifcationPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

@@ -16,7 +16,7 @@ __all__ = ['WordSegmentationPipeline']
class WordSegmentationPipeline(Pipeline):

def __init__(self,
model: Union[StructBertForTokenClassification, str],
model: Union[SbertForTokenClassification, str],
preprocessor: Optional[TokenClassifcationPreprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction
@@ -27,15 +27,16 @@ class WordSegmentationPipeline(Pipeline):
"""
model = model if isinstance(
model,
StructBertForTokenClassification) else Model.from_pretrained(model)
SbertForTokenClassification) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = TokenClassifcationPreprocessor(model.model_dir)
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = preprocessor.tokenizer
self.config = model.config
assert len(self.config.id2label) > 0
self.id2label = self.config.id2label

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]:
"""process the prediction results

Args:


+ 7
- 6
modelscope/pipelines/nlp/zero_shot_classification_pipeline.py View File

@@ -6,10 +6,11 @@ import json
import numpy as np
from scipy.special import softmax

from modelscope.models.nlp import SbertForZeroShotClassification
from modelscope.preprocessors import SbertZeroShotClassificationPreprocessor
from modelscope.utils.constant import Tasks
from ...models.nlp import SbertForZeroShotClassification
from ...preprocessors import ZeroShotClassificationPreprocessor
from ...utils.constant import Tasks
from ...models import Model
from ...metainfo import Pipelines
from ..base import Input, Pipeline
from ..builder import PIPELINES

@@ -18,12 +19,12 @@ __all__ = ['ZeroShotClassificationPipeline']

@PIPELINES.register_module(
Tasks.zero_shot_classification,
module_name=r'bert-zero-shot-classification')
module_name=Pipelines.zero_shot_classification)
class ZeroShotClassificationPipeline(Pipeline):

def __init__(self,
model: Union[SbertForZeroShotClassification, str],
preprocessor: SbertZeroShotClassificationPreprocessor = None,
preprocessor: ZeroShotClassificationPreprocessor = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

@@ -32,7 +33,7 @@ class ZeroShotClassificationPipeline(Pipeline):
preprocessor (SentimentClassificationPreprocessor): a preprocessor instance
"""
assert isinstance(model, str) or isinstance(model, SbertForZeroShotClassification), \
'model must be a single str or BertForZeroShotClassification'
'model must be a single str or SbertForZeroShotClassification'
sc_model = model if isinstance(
model,
SbertForZeroShotClassification) else Model.from_pretrained(model)


+ 12
- 12
modelscope/preprocessors/nlp.py View File

@@ -14,9 +14,9 @@ from .builder import PREPROCESSORS

__all__ = [
'Tokenize', 'SequenceClassificationPreprocessor',
'PalmTextGenerationPreprocessor', 'SbertZeroShotClassificationPreprocessor',
'SbertTokenClassifcationPreprocessor', 'SbertNLIPreprocessor',
'SbertSentimentClassificationPreprocessor', 'FillMaskPreprocessor'
'TextGenerationPreprocessor', 'ZeroShotClassificationPreprocessor',
'TokenClassifcationPreprocessor', 'NLIPreprocessor',
'SentimentClassificationPreprocessor', 'FillMaskPreprocessor'
]


@@ -35,8 +35,8 @@ class Tokenize(Preprocessor):


@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.sbert_nli_tokenizer)
class SbertNLIPreprocessor(Preprocessor):
Fields.nlp, module_name=Preprocessors.nli_tokenizer)
class NLIPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
@@ -105,8 +105,8 @@ class SbertNLIPreprocessor(Preprocessor):


@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.sbert_sen_cls_tokenizer)
class SbertSentimentClassificationPreprocessor(Preprocessor):
Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer)
class SentimentClassificationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
@@ -264,7 +264,7 @@ class SequenceClassificationPreprocessor(Preprocessor):

@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer)
class PalmTextGenerationPreprocessor(Preprocessor):
class TextGenerationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
"""preprocess the data using the vocab.txt from the `model_dir` path
@@ -374,8 +374,8 @@ class FillMaskPreprocessor(Preprocessor):


@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.sbert_zero_shot_cls_tokenizer)
class SbertZeroShotClassificationPreprocessor(Preprocessor):
Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer)
class ZeroShotClassificationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
@@ -418,8 +418,8 @@ class SbertZeroShotClassificationPreprocessor(Preprocessor):


@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.sbert_token_cls_tokenizer)
class SbertTokenClassifcationPreprocessor(Preprocessor):
Fields.nlp, module_name=Preprocessors.token_cls_tokenizer)
class TokenClassifcationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path


Loading…
Cancel
Save