From 08e03ca9b071338d958e24f8d639f0ef71fea2d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=99=BA=E4=B8=9E?=
Date: Thu, 23 Jun 2022 19:53:29 +0800
Subject: [PATCH] remove zeroshot

---
 modelscope/metainfo.py                        |  2 -
 modelscope/models/__init__.py                 |  4 +-
 modelscope/models/nlp/__init__.py             |  1 -
 .../nlp/sbert_for_zero_shot_classification.py | 50 ----------
 modelscope/pipelines/builder.py               |  3 -
 modelscope/pipelines/nlp/__init__.py          |  1 -
 .../nlp/zero_shot_classification_pipeline.py  | 98 -------------------
 modelscope/preprocessors/nlp.py               | 50 +---------
 modelscope/utils/constant.py                  |  1 -
 .../test_zero_shot_classification.py          | 64 ------------
 10 files changed, 5 insertions(+), 269 deletions(-)
 delete mode 100644 modelscope/models/nlp/sbert_for_zero_shot_classification.py
 delete mode 100644 modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
 delete mode 100644 tests/pipelines/test_zero_shot_classification.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 388d8397..13028278 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -48,7 +48,6 @@ class Pipelines(object):
     text_generation = 'text-generation'
     sentiment_analysis = 'sentiment-analysis'
     sentiment_classification = 'sentiment-classification'
-    zero_shot_classification = 'zero-shot-classification'
     fill_mask = 'fill-mask'
     nli = 'nli'
     dialog_intent_prediction = 'dialog-intent-prediction'
@@ -95,7 +94,6 @@ class Preprocessors(object):
     token_cls_tokenizer = 'token-cls-tokenizer'
     nli_tokenizer = 'nli-tokenizer'
     sen_cls_tokenizer = 'sen-cls-tokenizer'
-    zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer'
 
     # audio preprocessor
     linear_aec_fbank = 'linear-aec-fbank'
diff --git a/modelscope/models/__init__.py b/modelscope/models/__init__.py
index 629f2270..6e769d8d 100644
--- a/modelscope/models/__init__.py
+++ b/modelscope/models/__init__.py
@@ -7,5 +7,5 @@ from .builder import MODELS, build_model
 from .multi_model import OfaForImageCaptioning
 from .nlp import (BertForSequenceClassification, SbertForNLI,
                   SbertForSentenceSimilarity, SbertForSentimentClassification,
-                  SbertForTokenClassification, SbertForZeroShotClassification,
-                  StructBertForMaskedLM, VecoForMaskedLM)
+                  SbertForTokenClassification, StructBertForMaskedLM,
+                  VecoForMaskedLM)
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 78b087e6..399aa63f 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -5,6 +5,5 @@ from .sbert_for_nli import *  # noqa F403
 from .sbert_for_sentence_similarity import *  # noqa F403
 from .sbert_for_sentiment_classification import *  # noqa F403
 from .sbert_for_token_classification import *  # noqa F403
-from .sbert_for_zero_shot_classification import *  # noqa F403
 from .space.dialog_intent_prediction_model import *  # noqa F403
 from .space.dialog_modeling_model import *  # noqa F403
diff --git a/modelscope/models/nlp/sbert_for_zero_shot_classification.py b/modelscope/models/nlp/sbert_for_zero_shot_classification.py
deleted file mode 100644
index 837bb41e..00000000
--- a/modelscope/models/nlp/sbert_for_zero_shot_classification.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from typing import Any, Dict
-
-import numpy as np
-
-from modelscope.utils.constant import Tasks
-from ...metainfo import Models
-from ..base import Model
-from ..builder import MODELS
-
-__all__ = ['SbertForZeroShotClassification']
-
-
-@MODELS.register_module(
-    Tasks.zero_shot_classification, module_name=Models.structbert)
-class SbertForZeroShotClassification(Model):
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        """initialize the zero shot classification model from the `model_dir` path.
-
-        Args:
-            model_dir (str): the model path.
-        """
-
-        super().__init__(model_dir, *args, **kwargs)
-        from sofa import SbertForSequenceClassification
-        self.model = SbertForSequenceClassification.from_pretrained(model_dir)
-
-    def train(self):
-        return self.model.train()
-
-    def eval(self):
-        return self.model.eval()
-
-    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
-        """return the result by the model
-
-        Args:
-            input (Dict[str, Any]): the preprocessed data
-
-        Returns:
-            Dict[str, np.ndarray]: results
-                Example:
-                    {
-                        'logits': array([[-0.53860897,  1.5029076 ]], dtype=float32)  # true value
-                    }
-        """
-        outputs = self.model(**input)
-        logits = outputs['logits'].numpy()
-        res = {'logits': logits}
-        return res
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index b0f2955c..ebbdf01b 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -31,9 +31,6 @@ DEFAULT_MODEL_FOR_PIPELINE = {
      'damo/nlp_structbert_sentiment-classification_chinese-base'),
     Tasks.text_classification: ('bert-sentiment-analysis',
                                 'damo/bert-base-sst2'),
-    Tasks.zero_shot_classification:
-    (Pipelines.zero_shot_classification,
-     'damo/nlp_structbert_zero-shot-classification_chinese-base'),
     Tasks.image_matting: (Pipelines.image_matting,
                           'damo/cv_unet_image-matting'),
     Tasks.text_classification: (Pipelines.sentiment_analysis,
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index 7aebf3e8..76e0ff4e 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -7,4 +7,3 @@ from .sentiment_classification_pipeline import *  # noqa F403
 from .sequence_classification_pipeline import *  # noqa F403
 from .text_generation_pipeline import *  # noqa F403
 from .word_segmentation_pipeline import *  # noqa F403
-from .zero_shot_classification_pipeline import *  # noqa F403
diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
deleted file mode 100644
index 13ac5d52..00000000
--- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import os
-import uuid
-from typing import Any, Dict, Union
-
-import json
-import numpy as np
-import torch
-from scipy.special import softmax
-
-from ...metainfo import Pipelines
-from ...models import Model
-from ...models.nlp import SbertForZeroShotClassification
-from ...preprocessors import ZeroShotClassificationPreprocessor
-from ...utils.constant import Tasks
-from ..base import Input, Pipeline
-from ..builder import PIPELINES
-
-__all__ = ['ZeroShotClassificationPipeline']
-
-
-@PIPELINES.register_module(
-    Tasks.zero_shot_classification,
-    module_name=Pipelines.zero_shot_classification)
-class ZeroShotClassificationPipeline(Pipeline):
-
-    def __init__(self,
-                 model: Union[SbertForZeroShotClassification, str],
-                 preprocessor: ZeroShotClassificationPreprocessor = None,
-                 **kwargs):
-        """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction
-
-        Args:
-            model (SbertForSentimentClassification): a model instance
-            preprocessor (SentimentClassificationPreprocessor): a preprocessor instance
-        """
-        assert isinstance(model, str) or isinstance(model, SbertForZeroShotClassification), \
-            'model must be a single str or SbertForZeroShotClassification'
-        sc_model = model if isinstance(
-            model,
-            SbertForZeroShotClassification) else Model.from_pretrained(model)
-
-        self.entailment_id = 0
-        self.contradiction_id = 2
-
-        if preprocessor is None:
-            preprocessor = ZeroShotClassificationPreprocessor(
-                sc_model.model_dir)
-        sc_model.eval()
-        super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)
-
-    def _sanitize_parameters(self, **kwargs):
-        preprocess_params = {}
-        postprocess_params = {}
-
-        if 'candidate_labels' in kwargs:
-            candidate_labels = kwargs.pop('candidate_labels')
-            preprocess_params['candidate_labels'] = candidate_labels
-            postprocess_params['candidate_labels'] = candidate_labels
-        else:
-            raise ValueError('You must include at least one label.')
-        preprocess_params['hypothesis_template'] = kwargs.pop(
-            'hypothesis_template', '{}')
-
-        postprocess_params['multi_label'] = kwargs.pop('multi_label', False)
-        return preprocess_params, {}, postprocess_params
-
-    def forward(self, inputs: Dict[str, Any],
-                **forward_params) -> Dict[str, Any]:
-        with torch.no_grad():
-            return super().forward(inputs, **forward_params)
-
-    def postprocess(self,
-                    inputs: Dict[str, Any],
-                    candidate_labels,
-                    multi_label=False) -> Dict[str, Any]:
-        """process the prediction results
-
-        Args:
-            inputs (Dict[str, Any]): _description_
-
-        Returns:
-            Dict[str, Any]: the prediction results
-        """
-
-        logits = inputs['logits']
-        if multi_label or len(candidate_labels) == 1:
-            logits = logits[..., [self.contradiction_id, self.entailment_id]]
-            scores = softmax(logits, axis=-1)[..., 1]
-        else:
-            logits = logits[..., self.entailment_id]
-            scores = softmax(logits, axis=-1)
-
-        reversed_index = list(reversed(scores.argsort()))
-        result = {
-            'labels': [candidate_labels[i] for i in reversed_index],
-            'scores': [scores[i].item() for i in reversed_index],
-        }
-        return result
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 8346402c..5cd9463d 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -13,9 +13,9 @@ from .builder import PREPROCESSORS
 
 __all__ = [
     'Tokenize', 'SequenceClassificationPreprocessor',
-    'TextGenerationPreprocessor', 'ZeroShotClassificationPreprocessor',
-    'TokenClassifcationPreprocessor', 'NLIPreprocessor',
-    'SentimentClassificationPreprocessor', 'FillMaskPreprocessor'
+    'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor',
+    'NLIPreprocessor', 'SentimentClassificationPreprocessor',
+    'FillMaskPreprocessor'
 ]
 
 
@@ -372,50 +372,6 @@ class FillMaskPreprocessor(Preprocessor):
         return {k: torch.tensor(v) for k, v in rst.items()}
 
 
-@PREPROCESSORS.register_module(
-    Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer)
-class ZeroShotClassificationPreprocessor(Preprocessor):
-
-    def __init__(self, model_dir: str, *args, **kwargs):
-        """preprocess the data via the vocab.txt from the `model_dir` path
-
-        Args:
-            model_dir (str): model path
-        """
-
-        super().__init__(*args, **kwargs)
-
-        from sofa import SbertTokenizer
-        self.model_dir: str = model_dir
-        self.sequence_length = kwargs.pop('sequence_length', 512)
-        self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)
-
-    @type_assert(object, str)
-    def __call__(self, data: str, hypothesis_template: str,
-                 candidate_labels: list) -> Dict[str, Any]:
-        """process the raw input data
-
-        Args:
-            data (str): a sentence
-                Example:
-                    'you are so handsome.'
-
-        Returns:
-            Dict[str, Any]: the preprocessed data
-        """
-        pairs = [[data, hypothesis_template.format(label)]
-                 for label in candidate_labels]
-
-        features = self.tokenizer(
-            pairs,
-            padding=True,
-            truncation=True,
-            max_length=self.sequence_length,
-            return_tensors='pt',
-            truncation_strategy='only_first')
-        return features
-
-
 @PREPROCESSORS.register_module(
     Fields.nlp, module_name=Preprocessors.token_cls_tokenizer)
 class TokenClassifcationPreprocessor(Preprocessor):
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 75e2d04d..85559917 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -32,7 +32,6 @@ class Tasks(object):
     action_recognition = 'action-recognition'
 
     # nlp tasks
-    zero_shot_classification = 'zero-shot-classification'
     word_segmentation = 'word-segmentation'
     nli = 'nli'
     sentiment_classification = 'sentiment-classification'
diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py
deleted file mode 100644
index 236013aa..00000000
--- a/tests/pipelines/test_zero_shot_classification.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import unittest
-
-from modelscope.hub.snapshot_download import snapshot_download
-from modelscope.models import Model
-from modelscope.models.nlp import SbertForZeroShotClassification
-from modelscope.pipelines import ZeroShotClassificationPipeline, pipeline
-from modelscope.preprocessors import ZeroShotClassificationPreprocessor
-from modelscope.utils.constant import Tasks
-from modelscope.utils.test_utils import test_level
-
-
-class ZeroShotClassificationTest(unittest.TestCase):
-    model_id = 'damo/nlp_structbert_zero-shot-classification_chinese-base'
-    sentence = '全新突破 解放军运20版空中加油机曝光'
-    labels = ['文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事']
-    template = '这篇文章的标题是{}'
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_with_direct_file_download(self):
-        cache_path = snapshot_download(self.model_id)
-        tokenizer = ZeroShotClassificationPreprocessor(cache_path)
-        model = SbertForZeroShotClassification(cache_path, tokenizer=tokenizer)
-        pipeline1 = ZeroShotClassificationPipeline(
-            model, preprocessor=tokenizer)
-        pipeline2 = pipeline(
-            Tasks.zero_shot_classification,
-            model=model,
-            preprocessor=tokenizer)
-
-        print(
-            f'sentence: {self.sentence}\n'
-            f'pipeline1:{pipeline1(input=self.sentence,candidate_labels=self.labels)}'
-        )
-        print()
-        print(
-            f'sentence: {self.sentence}\n'
-            f'pipeline2: {pipeline2(self.sentence,candidate_labels=self.labels,hypothesis_template=self.template)}'
-        )
-
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id)
-        tokenizer = ZeroShotClassificationPreprocessor(model.model_dir)
-        pipeline_ins = pipeline(
-            task=Tasks.zero_shot_classification,
-            model=model,
-            preprocessor=tokenizer)
-        print(pipeline_ins(input=self.sentence, candidate_labels=self.labels))
-
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_run_with_model_name(self):
-        pipeline_ins = pipeline(
-            task=Tasks.zero_shot_classification, model=self.model_id)
-        print(pipeline_ins(input=self.sentence, candidate_labels=self.labels))
-
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_run_with_default_model(self):
-        pipeline_ins = pipeline(task=Tasks.zero_shot_classification)
-        print(pipeline_ins(input=self.sentence, candidate_labels=self.labels))
-
-
-if __name__ == '__main__':
-    unittest.main()
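Context for reviewers: the deleted ZeroShotClassificationPipeline implemented NLI-based zero-shot classification. Each candidate label is formatted into a hypothesis via hypothesis_template, paired with the input sentence, and scored through the NLI head; labels are then ranked by entailment probability. The sketch below mirrors only the deleted postprocess step in isolation, using fabricated logits for illustration; the entailment/contradiction indices (0 and 2) follow the deleted code, and score_labels is a hypothetical helper name, not part of the modelscope API.

    # Minimal standalone sketch of the removed label-scoring step.
    import numpy as np
    from scipy.special import softmax

    ENTAILMENT_ID = 0      # index of the entailment logit (as in the deleted code)
    CONTRADICTION_ID = 2   # index of the contradiction logit

    def score_labels(logits, candidate_labels, multi_label=False):
        """Rank candidate labels from per-pair NLI logits of shape (n_labels, 3)."""
        if multi_label or len(candidate_labels) == 1:
            # Score each label independently: softmax over
            # [contradiction, entailment], keep the entailment probability.
            pair = logits[..., [CONTRADICTION_ID, ENTAILMENT_ID]]
            scores = softmax(pair, axis=-1)[..., 1]
        else:
            # Single-label case: softmax of the entailment logits across labels.
            scores = softmax(logits[..., ENTAILMENT_ID], axis=-1)
        order = list(reversed(scores.argsort()))  # highest score first
        return {
            'labels': [candidate_labels[i] for i in order],
            'scores': [scores[i].item() for i in order],
        }

    # Fabricated logits for three (sentence, hypothesis) pairs, for demonstration.
    logits = np.array([[2.1, 0.3, -1.0], [0.2, 0.1, 1.5], [-0.5, 0.0, 0.9]])
    print(score_labels(logits, ['sports', 'finance', 'tech']))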