From f9e12669baaa78e32ea1d552e6f68010f41fb56e Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Thu, 27 Oct 2022 09:33:19 +0800
Subject: [PATCH] [to #42322933] add default mapping for preprocessors

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10536603
---
 .../nlp/faq_question_answering_pipeline.py |   4 -
 modelscope/preprocessors/base.py           | 157 ++++++++++++++++--
 2 files changed, 143 insertions(+), 18 deletions(-)

diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
index fd614e91..3917f20c 100644
--- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
+++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py
@@ -26,10 +26,6 @@ class FaqQuestionAnsweringPipeline(Pipeline):
         if preprocessor is None:
             preprocessor = Preprocessor.from_pretrained(
                 model.model_dir, **kwargs)
-        if preprocessor is None:
-            from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor
-            preprocessor = FaqQuestionAnsweringPreprocessor(
-                model.model_dir, **kwargs)
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
 
     def _sanitize_parameters(self, **pipeline_parameters):
diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py
index c2716a13..db14ba47 100644
--- a/modelscope/preprocessors/base.py
+++ b/modelscope/preprocessors/base.py
@@ -4,7 +4,8 @@ from abc import ABC, abstractmethod
 from copy import deepcopy
 from typing import Any, Dict, Optional, Sequence
 
-from modelscope.utils.config import Config
+from modelscope.metainfo import Models, Preprocessors
+from modelscope.utils.config import Config, ConfigDict
 from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModeKeys, Tasks
 from modelscope.utils.hub import read_config, snapshot_download
 from modelscope.utils.logger import get_logger
@@ -12,6 +13,112 @@ from .builder import build_preprocessor
 
 logger = get_logger(__name__)
 
+PREPROCESSOR_MAP = {
+    # nlp
+    # bart
+    (Models.bart, Tasks.text_error_correction):
+    Preprocessors.text_error_correction,
+
+    # bert
+    (Models.bert, Tasks.backbone):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.bert, Tasks.document_segmentation):
+    Preprocessors.document_segmentation,
+    (Models.bert, Tasks.fill_mask):
+    Preprocessors.fill_mask,
+    (Models.bert, Tasks.sentence_embedding):
+    Preprocessors.sentence_embedding,
+    (Models.bert, Tasks.text_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.bert, Tasks.nli):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.bert, Tasks.sentiment_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.bert, Tasks.sentence_similarity):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.bert, Tasks.zero_shot_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.bert, Tasks.text_ranking):
+    Preprocessors.text_ranking,
+    (Models.bert, Tasks.part_of_speech):
+    Preprocessors.token_cls_tokenizer,
+    (Models.bert, Tasks.token_classification):
+    Preprocessors.token_cls_tokenizer,
+    (Models.bert, Tasks.word_segmentation):
+    Preprocessors.token_cls_tokenizer,
+
+    # bloom
+    (Models.bloom, Tasks.backbone):
+    Preprocessors.text_gen_tokenizer,
+
+    # gpt_neo
+    # gpt_neo may have different preprocessors, but currently only one is used
+    (Models.gpt_neo, Tasks.backbone):
+    Preprocessors.sentence_piece,
+
+    # gpt3 uses different preprocessors depending on model size, so it is not listed here.
+
+    # palm_v2
+    (Models.palm, Tasks.backbone):
+    Preprocessors.text_gen_tokenizer,
+
+    # T5
+    (Models.T5, Tasks.backbone):
+    Preprocessors.text2text_gen_preprocessor,
+    (Models.T5, Tasks.text2text_generation):
+    Preprocessors.text2text_gen_preprocessor,
+
+    # deberta_v2
+    (Models.deberta_v2, Tasks.backbone):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.deberta_v2, Tasks.fill_mask):
+    Preprocessors.fill_mask,
+
+    # ponet
+    (Models.ponet, Tasks.fill_mask):
+    Preprocessors.fill_mask_ponet,
+
+    # structbert
+    (Models.structbert, Tasks.backbone):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.structbert, Tasks.fill_mask):
+    Preprocessors.fill_mask,
+    (Models.structbert, Tasks.faq_question_answering):
+    Preprocessors.faq_question_answering_preprocessor,
+    (Models.structbert, Tasks.text_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.structbert, Tasks.nli):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.structbert, Tasks.sentiment_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.structbert, Tasks.sentence_similarity):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.structbert, Tasks.zero_shot_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.structbert, Tasks.part_of_speech):
+    Preprocessors.token_cls_tokenizer,
+    (Models.structbert, Tasks.token_classification):
+    Preprocessors.token_cls_tokenizer,
+    (Models.structbert, Tasks.word_segmentation):
+    Preprocessors.token_cls_tokenizer,
+
+    # veco
+    (Models.veco, Tasks.backbone):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.veco, Tasks.fill_mask):
+    Preprocessors.fill_mask,
+    (Models.veco, Tasks.text_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.veco, Tasks.nli):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.veco, Tasks.sentiment_classification):
+    Preprocessors.sen_cls_tokenizer,
+    (Models.veco, Tasks.sentence_similarity):
+    Preprocessors.sen_cls_tokenizer,
+
+    # space
+}
+
 
 class Preprocessor(ABC):
 
@@ -56,37 +163,59 @@ class Preprocessor(ABC):
         if 'task' in kwargs:
             task = kwargs.pop('task')
         field_name = Tasks.find_field_by_task(task)
+        sub_key = 'train' if preprocessor_mode == ModeKeys.TRAIN else 'val'
+
         if not hasattr(cfg, 'preprocessor'):
             logger.error('No preprocessor field found in cfg.')
-            return None
-
-        sub_key = 'train' if preprocessor_mode == ModeKeys.TRAIN else 'val'
+            preprocessor_cfg = ConfigDict()
+        else:
+            preprocessor_cfg = cfg.preprocessor
 
-        if 'type' not in cfg.preprocessor:
-            if sub_key in cfg.preprocessor:
-                sub_cfg = getattr(cfg.preprocessor, sub_key)
+        if 'type' not in preprocessor_cfg:
+            if sub_key in preprocessor_cfg:
+                sub_cfg = getattr(preprocessor_cfg, sub_key)
             else:
                 logger.error(
                     f'No {sub_key} key and type key found in '
                     f'preprocessor domain of configuration.json file.')
-                return None
+                sub_cfg = preprocessor_cfg
         else:
-            sub_cfg = cfg.preprocessor
+            sub_cfg = preprocessor_cfg
 
-        if len(sub_cfg):
+        sub_cfg.update({'model_dir': model_dir})
+        sub_cfg.update(kwargs)
+        if 'type' in sub_cfg:
             if isinstance(sub_cfg, Sequence):
                 # TODO: for Sequence, need adapt to `mode` and `mode_dir` args,
                 # and add mode for Compose or other plans
                 raise NotImplementedError('Not supported yet!')
             sub_cfg = deepcopy(sub_cfg)
-            sub_cfg.update({'model_dir': model_dir})
-            sub_cfg.update(kwargs)
+
             preprocessor = build_preprocessor(sub_cfg, field_name)
         else:
             logger.error(
                 f'Cannot find available config to build preprocessor at mode {preprocessor_mode}, '
-                f'please check the preprocessor field in the configuration.json file.'
+                f'current config: {sub_cfg}. Trying to build the preprocessor from task and model information.'
             )
-            return None
+            model_cfg = getattr(cfg, 'model', ConfigDict())
+            model_type = model_cfg.type if hasattr(
+                model_cfg, 'type') else getattr(model_cfg, 'model_type', None)
+            if task is None or model_type is None:
+                logger.error(
+                    f'Found task: {task}, model type: {model_type}. '
+                    f'Insufficient information to build the preprocessor, skipping.'
+                )
+                return None
+            if (model_type, task) not in PREPROCESSOR_MAP:
+                logger.error(
+                    f'No preprocessor key {(model_type, task)} found in PREPROCESSOR_MAP, '
+                    f'skip building the preprocessor.')
+                return None
+
+            sub_cfg = ConfigDict({
+                'type': PREPROCESSOR_MAP[(model_type, task)],
+                **sub_cfg
+            })
+            preprocessor = build_preprocessor(sub_cfg, field_name)
 
         preprocessor.mode = preprocessor_mode
         return preprocessor
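
A minimal usage sketch of the fallback introduced by this patch, assuming a locally downloaded model directory whose configuration.json declares a model type and task but no usable preprocessor type; the directory path and the explicitly passed task value are placeholders for illustration, not part of the patch.

    from modelscope.preprocessors.base import Preprocessor

    # Hypothetical local checkout of a structbert sentence-similarity model whose
    # configuration.json has `model.type` but no `preprocessor.type`.
    model_dir = '/path/to/structbert_sentence_similarity_model'

    # from_pretrained first tries preprocessor.type (or the train/val sub-config)
    # in configuration.json; if no type is found, the new fallback looks up
    # (model.type, task) in PREPROCESSOR_MAP, e.g.
    # (structbert, sentence-similarity) -> sen_cls_tokenizer, and builds that.
    # Passing `task` explicitly is only needed when the config does not carry it.
    preprocessor = Preprocessor.from_pretrained(model_dir, task='sentence-similarity')

    if preprocessor is None:
        # Returned when neither the config nor PREPROCESSOR_MAP can resolve a preprocessor.
        print('no preprocessor available for this (model, task) pair')

This mapping is also why the explicit FaqQuestionAnsweringPreprocessor fallback is removed from FaqQuestionAnsweringPipeline in the first hunk: the (structbert, faq-question-answering) pair is now covered by PREPROCESSOR_MAP.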