From 21fa71baf0c2a06d4ba54a3ec745a3d0a5cb9be3 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Wed, 3 Aug 2022 18:38:41 +0800
Subject: [PATCH] [to #42322933] add/refactor nlp models source code and finetuning

1. add sbert, veco, palm, space source code
2. support sbert sequence classification and token classification finetuning
3. support veco sequence classification finetuning
4. support palm nlg finetuning; evaluation results: https://sheet.alibaba-inc.com/#/sheet/f7fdcc7f22bd5105 (sheet: Maas)
5. add unit tests for the finetunes
6. add veco's taskdataset processor
7. add a common trainer for nlp, and a specific trainer for veco
8. merge duplicated code across models, preprocessors and pipelines

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9574105

* add basic class of hooks & metrics
* pre-commit passed
* change some comments
* pre-commit passed
* 1. remove accuracy's groups 2. remove useless hooks 3. simplify priorities
* pre-commit passed
* fix a comment
* Merge branch 'master' into finetune_hooks_metrics
  # Conflicts:
  #   modelscope/metainfo.py
* pre-commit passed
* add basic class of hooks & metrics
* pre-commit passed
* change some comments
* pre-commit passed
* 1. remove accuracy's groups 2. remove useless hooks 3. simplify priorities
* pre-commit passed
* fix a comment
* Merge branch 'feat/finetune' of gitlab.alibaba-inc.com:Ali-MaaS/MaaS-lib into feat/finetune
* move hooks-related code to modelscope/trainers/hooks
* move priority back
* add torch model base and test
* update hooks, trainer, import_util
* add torch epoch-based trainer and dist utils
* add hooks
* fix warmup
* format code style, fix warmup and add warmup unittest
* fix impls
* pre-commit check passed
* update hook and add EpochBasedTrainer
* add trainer unittest
* Merge branch 'feat/add_hooks' into feat/add_task
  # Conflicts:
  #   modelscope/models/base_torch.py
  #   modelscope/trainers/hooks/hook.py
  #   modelscope/trainers/trainer.py
* update unittest name
* rewrite taskdataset to trainer
* fix trainer and add unittest
* add unittest
* code: run to forward
* run through... but ugly code
* arrange some classes
* fix some errors
* revert some mistakes
* initial check-in
* Merge branch 'feat/add_hooks' into feat/add_task
  # Conflicts:
  #   modelscope/trainers/trainer.py
* test with bigger epoch and size
* add the default metrics class
* move build-metrics code to a method
* merge add_task
* merge origin add_task
* add device initialization
* remove preprocessor arg for bool
* add task models
* move metric collection logic to the metrics class
* pre-commit passed
* fix CR comments
* pre-commit passed
* add task models
* Merge remote-tracking branch 'origin/feat/add_task' into feat/backbone_head
* add comment
* change comment formats.
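For reviewers, a minimal usage sketch of the finetune flow this PR adds (the squashed commit log continues below). The trainer key 'nlp-base-trainer', the model id and the CLUE/AFQMC dataset are illustrative assumptions, not values taken from this patch:

    # Hypothetical sketch of finetuning an sbert sequence-classification model
    # with the new common NLP trainer; all ids below are placeholders.
    from modelscope.msdatasets import MsDataset
    from modelscope.trainers import build_trainer

    # A pair-sentence classification dataset, assumed to be available on the hub.
    train_dataset = MsDataset.load('clue', subset_name='afqmc', split='train')
    eval_dataset = MsDataset.load('clue', subset_name='afqmc', split='validation')

    kwargs = dict(
        model='damo/nlp_structbert_sentence-similarity_chinese-base',  # placeholder model id
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        work_dir='/tmp/sbert_seq_cls_finetune')

    # 'nlp-base-trainer' is assumed to be the registry key of the common NLP trainer added here.
    trainer = build_trainer(name='nlp-base-trainer', default_args=kwargs)
    trainer.train()
    print(trainer.evaluate())

trainer.evaluate() should return the metrics registered as the task default in modelscope/metrics/builder.py (seq-cls metrics for sentence-similarity/nli/sentiment-classification in this patch).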
* fix comments
* fix ut bug
* fix comments
* add wrapper check
* fix comments
* pre-commit passed
* fix CR comments
* solve a loop import problem
* fix ut bug
* fix ut errors
* change dummydataset to msdataset
* pre-commit passed
* merge add task
* backbone-head is built, model is not correctly loaded
* model load states matched
* result matched
* lint
* add veco/palm_v2 code
* merge master
* merge master, running successfully
* add repr model name level
* Merge branch 'feat/veco_palm' into feat/finetune_sbert_veco
* model test for training
* add token-classification metric, add formal ut
* fix running bug
* finetune and pipeline are working with backbone-head
* add nli
* add missing code
* finetune and pipeline are working with backbone-head
* Merge branch 'feat/backbone_head' of http://gitlab.alibaba-inc.com/Ali-MaaS/MaaS-lib into feat/backbone_head
* add a test repo for pr
* remove merge conflicted file
* remove merge conflicted file 1
* lint check
* import error
* NoneType bug fix
* forward input unpacking or dict bug
* move head into models, add build_backbone with registry, no base method
* merge master
* feat: 1. add interleave dataset method 2. support multiple datasets in trainer.build_dataset 3. support 3 sub-tasks in sequence_classification task
* unfinished
* update the task model structure in NLP field
* merge master
* update by comments
* keep the default model id as current on production
* unfinished
* unfinished
* veco can run
* Merge remote-tracking branch 'origin/master' into feat/backbone_head
* add taskmodel for module management
* remove forward_input_is_dict
* unfinished
* token classification started
* update base model structure
* move space to backbone
* remove 'type' in build_from_cfg method
* test update
* bug fix
* on testing, messy code
* Merge branch 'feat/backbone_head' into feat/refactor_nlp_730
  # Conflicts:
  #   modelscope/metrics/builder.py
  #   modelscope/models/__init__.py
  #   modelscope/models/nlp/__init__.py
  #   modelscope/preprocessors/nlp.py
  #   modelscope/trainers/trainer.py
  #   requirements/multi-modal.txt
* add missing merge
* add sofa source code
* refactor
* add veco task dataset
* add veco task dataset
* pre-commit passed
* fix bug of log
* add some features
* merge master
* bug fix
* refine nlp models
* fix the training error
* unfinished
* refactor pipeline
* Merge branch 'feat/backbone_head' into feat/refactor_nlp_730
  # Conflicts:
  #   modelscope/metrics/builder.py
  #   modelscope/models/nlp/__init__.py
  #   modelscope/models/nlp/backbones/structbert/modeling_sbert.py
  #   modelscope/models/nlp/palm_v2/palm_for_text_generation.py
  #   modelscope/preprocessors/base.py
  #   modelscope/preprocessors/nlp.py
  #   modelscope/trainers/trainer.py
* Merge commit 'ab04ceafc5453ce7daa9aa09e37a55f703072a10' into feat/refactor_nlp_730
  # Conflicts:
  #   modelscope/metainfo.py
  #   modelscope/metrics/builder.py
  #   modelscope/models/__init__.py
  #   modelscope/models/base/base_torch_model.py
  #   modelscope/models/nlp/__init__.py
  #   modelscope/models/nlp/backbones/space/model/intent_unified_transformer.py
  #   modelscope/models/nlp/backbones/space/model/model_base.py
  #   modelscope/models/nlp/palm_v2/palm_for_text_generation.py
  #   modelscope/models/nlp/sbert_for_sequence_classification.py
  #   modelscope/models/nlp/sequence_classification.py
  #   modelscope/models/nlp/space/__init__.py
  #   modelscope/models/nlp/space_for_dialog_intent_prediction.py
  #   modelscope/models/nlp/space_for_dialog_modeling.py
  #   modelscope/models/nlp/space_for_dialog_state_tracking.py
  #   modelscope/models/nlp/task_model.py
  #   modelscope/pipelines/nlp/sentiment_classification_pipeline.py
  #   modelscope/preprocessors/base.py
  #   modelscope/preprocessors/nlp.py
  #   modelscope/trainers/trainer.py
* revert changes
* unify sentence classification postprocess
* revert some changes, move some model files
* pipeline first case run through
* ws pipeline passed
* Merge branch 'feat/refactor_nlp_730' into feat/finetune_sbert_veco
* finetune
* revert code
* revert some code
* ws finetune started, only the accuracy is weird
* Merge branch 'feat/veco_taskdataset' into feat/finetune_sbert_veco
  # Conflicts:
  #   modelscope/task_datasets/veco_dataset.py
  #   tests/taskdataset/test_veco_dataset.py
* veco+nli finetune started
* Merge branch 'master' into feat/finetune_sbert_veco
  # Conflicts:
  #   modelscope/models/nlp/sbert_for_sequence_classification.py
  #   modelscope/models/nlp/sbert_for_token_classification.py
  #   modelscope/models/nlp/sbert_for_zero_shot_classification.py
  #   modelscope/models/nlp/space/space_for_dialog_intent_prediction.py
  #   modelscope/models/nlp/space/space_for_dialog_modeling.py
  #   modelscope/trainers/trainer.py
* add trainer for nlp
* trainer: dataset params passed into preprocessor
* test passed by nlptrainer
* fix some bugs
* fix some bugs
* add backbone/head subclass
* fix regression bugs
* fix bug in token-cls finetune
* support cfg modification
* fix bug
* fix bug
* update requirements
* add some comments and fix some t
* add some comments and revert an argument
* split into two test files
* revert code
* fix bug in preprocessor (cherry picked from commit 7a648d096ef8500c694d3255dabe29e6f4bfc3e5)
* fix ut bug
* support sbert models
* unfinished
* Merge branch 'feat/finetune_sbert_veco' into sly_tmp_veco_finetune
  # Conflicts:
  #   tests/trainers/test_finetune_sequence_classification.py
* fix bug in veco
* fix bug
* fix bug
* correct running params
* remove useless files
* add palm finetuning with cnn_dailymail dataset
* copy space model from sofa
* Merge branch 'feat/finetune_sbert_veco' of gitlab.alibaba-inc.com:Ali-MaaS/MaaS-lib into feat/finetune_sbert_veco
* Merge branch 'master' into feat/finetune_sbert_veco
  # Conflicts:
  #   modelscope/metrics/__init__.py
  #   modelscope/models/__init__.py
  #   modelscope/models/nlp/__init__.py
  #   modelscope/models/nlp/backbones/__init__.py
  #   modelscope/models/nlp/backbones/structbert/modeling_sbert.py
  #   modelscope/models/nlp/heads/__init__.py
  #   modelscope/models/nlp/masked_language.py
  #   modelscope/models/nlp/palm_v2/palm_for_text_generation.py
  #   modelscope/models/nlp/sbert_for_nli.py
  #   modelscope/models/nlp/sbert_for_sentence_similarity.py
  #   modelscope/models/nlp/sbert_for_sentiment_classification.py
  #   modelscope/models/nlp/sbert_for_sequence_classification.py
  #   modelscope/models/nlp/sbert_for_token_classification.py
  #   modelscope/models/nlp/sbert_for_zero_shot_classification.py
  #   modelscope/models/nlp/sequence_classification.py
  #   modelscope/models/nlp/space/space_for_dialog_intent_prediction.py
  #   modelscope/models/nlp/space/space_for_dialog_modeling.py
  #   modelscope/models/nlp/space/space_for_dialog_state_tracking.py
  #   modelscope/models/nlp/structbert/adv_utils.py
  #   modelscope/models/nlp/structbert/configuration_sbert.py
  #   modelscope/models/nlp/task_models/task_model.py
  #   modelscope/pipelines/__init__.py
  #   modelscope/pipelines/nlp/__init__.py
  #   modelscope/pipelines/nlp/fill_mask_pipeline.py
  #   modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
  #   modelscope/pipelines/nlp/nli_pipeline.py
  #   modelscope/pipelines/nlp/sentence_similarity_pipeline.py
  #
modelscope/pipelines/nlp/sentiment_classification_pipeline.py # modelscope/pipelines/nlp/text_generation_pipeline.py # modelscope/pipelines/nlp/word_segmentation_pipeline.py # modelscope/pipelines/nlp/zero_shot_classification_pipeline.py # modelscope/preprocessors/nlp.py # modelscope/task_datasets/__init__.py # modelscope/trainers/trainer.py # modelscope/trainers/utils/inference.py # modelscope/utils/file_utils.py # requirements/nlp.txt # tests/pipelines/test_nli.py # tests/pipelines/test_sentence_similarity.py # tests/pipelines/test_sentiment_classification.py * fix imports * mark backbone in their own modeling * pre-commit check passed * pre-commit passed, remove roberta model * fix a bug in ast import * skip all finetune uts * fix bugs * pre-commit passed * bug fixed * bug fixed * bug fixed * bug fixed * fix ut bug * fix bug * fix ut bug * fix bug * fix bug * fixbugs * fixbug * revert veco * revert veco because of core dump * fix palm bug * revert veco * revert mistaken code * add a test print * pre-commit check * test exception * add test code * for test * fix bug and test * remove test code * remove useless file * 1. fix some bugs 2. add backbone ut * Merge branch 'master' into feat/finetune_refactor_730 # Conflicts: # modelscope/metainfo.py # modelscope/metrics/sequence_classification_metric.py # modelscope/models/nlp/__init__.py # modelscope/models/nlp/task_models/task_model.py # modelscope/preprocessors/__init__.py # modelscope/preprocessors/nlp.py # modelscope/trainers/trainer.py # modelscope/trainers/utils/inference.py # modelscope/utils/file_utils.py # tests/trainers/test_trainer_with_nlp.py * pre-commit passed * revert files * increase test level * unregister models * fix bugs * fix cr comments * fix bug in backbone-head * add sbert backbone * fix bug * add test for token-cls-metric * pre-commit passed * fix ut comments * revert normal tokenizer to fast tokenizer * Merge branch 'master' into feat/finetune_refactor_730 # Conflicts: # modelscope/models/nlp/__init__.py # modelscope/models/nlp/backbones/__init__.py # modelscope/models/nlp/backbones/structbert/__init__.py # modelscope/models/nlp/masked_language.py # modelscope/models/nlp/palm_v2/palm_for_text_generation.py # modelscope/models/nlp/sbert_for_sequence_classification.py # modelscope/models/nlp/sbert_for_token_classification.py # modelscope/models/nlp/sbert_for_zero_shot_classification.py # modelscope/pipelines/nlp/text_generation_pipeline.py # modelscope/preprocessors/nlp.py # modelscope/trainers/trainer.py # modelscope/trainers/utils/inference.py * fix merge bugs * pre commit passed * fix bug * fix bug * fix bug * fix bug from master * add print * fix ut bug * fix bug * Merge branch 'master' into feat/finetune_refactor_730 * skip task model test --- configs/nlp/sbert_sentence_similarity.json | 2 +- modelscope/hub/utils/utils.py | 2 +- modelscope/metainfo.py | 10 +- modelscope/metrics/__init__.py | 2 + modelscope/metrics/base.py | 3 + modelscope/metrics/builder.py | 2 + .../metrics/sequence_classification_metric.py | 8 +- .../metrics/token_classification_metric.py | 123 ++ modelscope/models/base/base_model.py | 17 +- modelscope/models/base/base_torch_model.py | 11 +- modelscope/models/nlp/__init__.py | 48 +- modelscope/models/nlp/backbones/__init__.py | 4 - .../models/nlp/backbones/space/__init__.py | 2 - .../nlp/backbones/space/model/__init__.py | 3 - modelscope/models/nlp/backbones/structbert.py | 54 + .../nlp/backbones/structbert/__init__.py | 19 - .../backbones/structbert/modeling_sbert.py | 815 ------- 
.../nlp/{backbones => }/gpt3/__init__.py | 4 +- .../gpt3/configuration_gpt3.py | 0 .../{ => gpt3}/gpt3_for_text_generation.py | 2 +- .../nlp/{backbones => }/gpt3/modeling_gpt3.py | 0 modelscope/models/nlp/heads/__init__.py | 4 +- .../nlp/heads/sequence_classification_head.py | 3 +- .../models/nlp/heads/torch_pretrain_head.py | 26 + modelscope/models/nlp/masked_language.py | 157 +- modelscope/models/nlp/palm_v2/__init__.py | 43 + .../models/nlp/palm_v2/configuration_palm.py | 116 + .../models/nlp/palm_v2/dureader_eval.py | 872 ++++++++ .../models/nlp/palm_v2/modeling_palm.py | 1332 +++++++++++ .../{ => palm_v2}/palm_for_text_generation.py | 4 +- modelscope/models/nlp/sbert_for_nli.py | 23 - .../nlp/sbert_for_sentence_similarity.py | 25 - .../nlp/sbert_for_sentiment_classification.py | 22 - .../nlp/sbert_for_sequence_classification.py | 82 - .../nlp/sbert_for_token_classification.py | 64 - .../nlp/sbert_for_zero_shot_classification.py | 50 - .../models/nlp/sequence_classification.py | 221 +- modelscope/models/nlp/space/__init__.py | 28 + modelscope/models/nlp/space/model/__init__.py | 10 + .../nlp/space/model/configuration_space.py | 32 + .../space/model/gen_unified_transformer.py | 0 .../{backbones => }/space/model/generator.py | 0 .../space/model/intent_unified_transformer.py | 0 .../{backbones => }/space/model/model_base.py | 0 .../models/nlp/space/model/modeling_space.py | 268 +++ .../nlp/space/model/tokenization_space.py | 29 + .../space/model/unified_transformer.py | 7 +- .../{backbones => }/space/modules/__init__.py | 0 .../{backbones => }/space/modules/embedder.py | 0 .../space/modules/feedforward.py | 0 .../space/modules/functions.py | 0 .../space/modules/multihead_attention.py | 0 .../space/modules/transformer_block.py | 0 .../space_for_dialog_intent_prediction.py | 2 +- .../{ => space}/space_for_dialog_modeling.py | 2 +- .../space_for_dialog_state_tracking.py | 2 +- modelscope/models/nlp/structbert/__init__.py | 45 + .../{backbones => }/structbert/adv_utils.py | 6 +- .../structbert/configuration_sbert.py | 11 +- .../models/nlp/structbert/modeling_sbert.py | 1964 +++++++++++++++++ .../nlp/structbert/tokenization_sbert.py | 516 +++++ .../nlp/structbert/tokenization_sbert_fast.py | 200 ++ modelscope/models/nlp/task_models/__init__.py | 0 .../task_models/sequence_classification.py | 86 + .../nlp/{ => task_models}/task_model.py | 11 +- modelscope/models/nlp/token_classification.py | 147 ++ modelscope/models/nlp/veco/__init__.py | 43 + .../models/nlp/veco/configuration_veco.py | 33 + modelscope/models/nlp/veco/modeling_veco.py | 143 ++ .../models/nlp/veco/tokenization_veco.py | 321 +++ .../models/nlp/veco/tokenization_veco_fast.py | 213 ++ modelscope/msdatasets/ms_dataset.py | 7 + modelscope/outputs.py | 1 + modelscope/pipelines/nlp/__init__.py | 13 +- .../pipelines/nlp/fill_mask_pipeline.py | 21 +- .../nlp/named_entity_recognition_pipeline.py | 12 +- modelscope/pipelines/nlp/nli_pipeline.py | 73 - .../pair_sentence_classification_pipeline.py | 37 + .../nlp/sentence_similarity_pipeline.py | 73 - .../nlp/sentiment_classification_pipeline.py | 74 - .../sequence_classification_pipeline_base.py | 60 + ...single_sentence_classification_pipeline.py | 35 + .../pipelines/nlp/text_generation_pipeline.py | 8 +- .../pipelines/nlp/translation_pipeline.py | 4 +- .../nlp/word_segmentation_pipeline.py | 36 +- .../nlp/zero_shot_classification_pipeline.py | 27 +- modelscope/preprocessors/__init__.py | 14 +- modelscope/preprocessors/base.py | 4 +- modelscope/preprocessors/nlp.py | 506 +++-- 
.../dialog_state_tracking_preprocessor.py | 2 +- modelscope/task_datasets/__init__.py | 2 + modelscope/task_datasets/base.py | 6 +- .../task_datasets/torch_base_dataset.py | 6 +- modelscope/task_datasets/veco_dataset.py | 76 + modelscope/trainers/__init__.py | 1 + modelscope/trainers/hooks/evaluation_hook.py | 1 + .../trainers/hooks/lr_scheduler_hook.py | 8 +- modelscope/trainers/nlp_trainer.py | 192 ++ modelscope/trainers/trainer.py | 58 +- modelscope/trainers/utils/inference.py | 31 +- modelscope/utils/ast_utils.py | 2 +- modelscope/utils/{utils.py => file_utils.py} | 2 +- modelscope/utils/hub.py | 10 + modelscope/utils/tensor_utils.py | 2 +- requirements/nlp.txt | 2 +- requirements/runtime.txt | 2 +- tests/metrics/__init__.py | 0 .../test_token_classification_metrics.py | 44 + tests/models/test_base_torch.py | 8 +- tests/pipelines/test_csanmt_translation.py | 2 +- tests/pipelines/test_fill_mask.py | 6 +- tests/pipelines/test_nli.py | 15 +- tests/pipelines/test_sentence_similarity.py | 15 +- .../test_sentiment_classification.py | 39 +- ...est_sentiment_classification_task_model.py | 70 + tests/pipelines/test_text_generation.py | 2 +- tests/pipelines/test_word_segmentation.py | 2 +- .../test_zero_shot_classification.py | 4 +- tests/taskdataset/test_veco_dataset.py | 35 + .../trainers/hooks/test_lr_scheduler_hook.py | 1 + .../test_finetune_sequence_classification.py | 244 ++ .../test_finetune_token_classificatin.py | 200 ++ .../trainers/test_text_generation_trainer.py | 55 +- tests/trainers/test_trainer_with_nlp.py | 6 +- 124 files changed, 8548 insertions(+), 1902 deletions(-) create mode 100644 modelscope/metrics/token_classification_metric.py delete mode 100644 modelscope/models/nlp/backbones/space/__init__.py delete mode 100644 modelscope/models/nlp/backbones/space/model/__init__.py create mode 100644 modelscope/models/nlp/backbones/structbert.py delete mode 100644 modelscope/models/nlp/backbones/structbert/__init__.py delete mode 100644 modelscope/models/nlp/backbones/structbert/modeling_sbert.py rename modelscope/models/nlp/{backbones => }/gpt3/__init__.py (76%) rename modelscope/models/nlp/{backbones => }/gpt3/configuration_gpt3.py (100%) rename modelscope/models/nlp/{ => gpt3}/gpt3_for_text_generation.py (97%) rename modelscope/models/nlp/{backbones => }/gpt3/modeling_gpt3.py (100%) create mode 100644 modelscope/models/nlp/heads/torch_pretrain_head.py create mode 100644 modelscope/models/nlp/palm_v2/__init__.py create mode 100644 modelscope/models/nlp/palm_v2/configuration_palm.py create mode 100644 modelscope/models/nlp/palm_v2/dureader_eval.py create mode 100644 modelscope/models/nlp/palm_v2/modeling_palm.py rename modelscope/models/nlp/{ => palm_v2}/palm_for_text_generation.py (96%) delete mode 100644 modelscope/models/nlp/sbert_for_nli.py delete mode 100644 modelscope/models/nlp/sbert_for_sentence_similarity.py delete mode 100644 modelscope/models/nlp/sbert_for_sentiment_classification.py delete mode 100644 modelscope/models/nlp/sbert_for_sequence_classification.py delete mode 100644 modelscope/models/nlp/sbert_for_token_classification.py delete mode 100644 modelscope/models/nlp/sbert_for_zero_shot_classification.py create mode 100644 modelscope/models/nlp/space/__init__.py create mode 100644 modelscope/models/nlp/space/model/__init__.py create mode 100644 modelscope/models/nlp/space/model/configuration_space.py rename modelscope/models/nlp/{backbones => }/space/model/gen_unified_transformer.py (100%) rename modelscope/models/nlp/{backbones => }/space/model/generator.py (100%) 
rename modelscope/models/nlp/{backbones => }/space/model/intent_unified_transformer.py (100%) rename modelscope/models/nlp/{backbones => }/space/model/model_base.py (100%) create mode 100644 modelscope/models/nlp/space/model/modeling_space.py create mode 100644 modelscope/models/nlp/space/model/tokenization_space.py rename modelscope/models/nlp/{backbones => }/space/model/unified_transformer.py (97%) rename modelscope/models/nlp/{backbones => }/space/modules/__init__.py (100%) rename modelscope/models/nlp/{backbones => }/space/modules/embedder.py (100%) rename modelscope/models/nlp/{backbones => }/space/modules/feedforward.py (100%) rename modelscope/models/nlp/{backbones => }/space/modules/functions.py (100%) rename modelscope/models/nlp/{backbones => }/space/modules/multihead_attention.py (100%) rename modelscope/models/nlp/{backbones => }/space/modules/transformer_block.py (100%) rename modelscope/models/nlp/{ => space}/space_for_dialog_intent_prediction.py (97%) rename modelscope/models/nlp/{ => space}/space_for_dialog_modeling.py (97%) rename modelscope/models/nlp/{ => space}/space_for_dialog_state_tracking.py (97%) create mode 100644 modelscope/models/nlp/structbert/__init__.py rename modelscope/models/nlp/{backbones => }/structbert/adv_utils.py (96%) rename modelscope/models/nlp/{backbones => }/structbert/configuration_sbert.py (94%) create mode 100755 modelscope/models/nlp/structbert/modeling_sbert.py create mode 100644 modelscope/models/nlp/structbert/tokenization_sbert.py create mode 100644 modelscope/models/nlp/structbert/tokenization_sbert_fast.py create mode 100644 modelscope/models/nlp/task_models/__init__.py create mode 100644 modelscope/models/nlp/task_models/sequence_classification.py rename modelscope/models/nlp/{ => task_models}/task_model.py (98%) create mode 100644 modelscope/models/nlp/token_classification.py create mode 100644 modelscope/models/nlp/veco/__init__.py create mode 100644 modelscope/models/nlp/veco/configuration_veco.py create mode 100644 modelscope/models/nlp/veco/modeling_veco.py create mode 100644 modelscope/models/nlp/veco/tokenization_veco.py create mode 100644 modelscope/models/nlp/veco/tokenization_veco_fast.py delete mode 100644 modelscope/pipelines/nlp/nli_pipeline.py create mode 100644 modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py delete mode 100644 modelscope/pipelines/nlp/sentence_similarity_pipeline.py delete mode 100644 modelscope/pipelines/nlp/sentiment_classification_pipeline.py create mode 100644 modelscope/pipelines/nlp/sequence_classification_pipeline_base.py create mode 100644 modelscope/pipelines/nlp/single_sentence_classification_pipeline.py create mode 100644 modelscope/task_datasets/veco_dataset.py create mode 100644 modelscope/trainers/nlp_trainer.py rename modelscope/utils/{utils.py => file_utils.py} (96%) create mode 100644 tests/metrics/__init__.py create mode 100644 tests/metrics/test_token_classification_metrics.py create mode 100644 tests/pipelines/test_sentiment_classification_task_model.py create mode 100644 tests/taskdataset/test_veco_dataset.py create mode 100644 tests/trainers/test_finetune_sequence_classification.py create mode 100644 tests/trainers/test_finetune_token_classificatin.py diff --git a/configs/nlp/sbert_sentence_similarity.json b/configs/nlp/sbert_sentence_similarity.json index 1e2bdef5..9320e0d7 100644 --- a/configs/nlp/sbert_sentence_similarity.json +++ b/configs/nlp/sbert_sentence_similarity.json @@ -2,7 +2,7 @@ "framework": "pytorch", "task": "sentence-similarity", "preprocessor": { 
- "type": "bert-seq-cls-tokenizer-finetune", + "type": "sen-sim-tokenizer", "first_sequence": "sentence1", "second_sequence": "sentence2" }, diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 8f6e7483..fff88cca 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -4,7 +4,7 @@ from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR, MODELSCOPE_URL_SCHEME) -from modelscope.utils.utils import get_default_cache_dir +from modelscope.utils.file_utils import get_default_cache_dir def model_id_to_group_owner_name(model_id): diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 215233fe..e0326baa 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -53,6 +53,10 @@ class TaskModels(object): class Heads(object): # nlp heads text_classification = 'text-classification' + # mlm + bert_mlm = 'bert-mlm' + # roberta mlm + roberta_mlm = 'roberta-mlm' class Pipelines(object): @@ -137,7 +141,7 @@ class Trainers(object): Holds the standard trainer name to use for identifying different trainer. This should be used to register trainers. - For a general Trainer, you can use easynlp-trainer/ofa-trainer/sofa-trainer. + For a general Trainer, you can use easynlp-trainer/ofa-trainer. For a model specific Trainer, you can use ${ModelName}-${Task}-trainer. """ @@ -179,6 +183,8 @@ class Preprocessors(object): sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer' zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer' text_error_correction = 'text-error-correction' + word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor' + fill_mask = 'fill-mask' # audio preprocessor linear_aec_fbank = 'linear-aec-fbank' @@ -204,7 +210,7 @@ class Metrics(object): # metric for image instance segmentation task image_ins_seg_coco_metric = 'image-ins-seg-coco-metric' # metrics for sequence classification task - seq_cls_metric = 'seq_cls_metric' + seq_cls_metric = 'seq-cls-metric' # metrics for token-classification task token_cls_metric = 'token-cls-metric' # metrics for text-generation task diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py index c632a9bd..37f9bfec 100644 --- a/modelscope/metrics/__init__.py +++ b/modelscope/metrics/__init__.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from .image_portrait_enhancement_metric import ImagePortraitEnhancementMetric from .sequence_classification_metric import SequenceClassificationMetric from .text_generation_metric import TextGenerationMetric + from .token_classification_metric import TokenClassificationMetric else: _import_structure = { @@ -26,6 +27,7 @@ else: ['ImagePortraitEnhancementMetric'], 'sequence_classification_metric': ['SequenceClassificationMetric'], 'text_generation_metric': ['TextGenerationMetric'], + 'token_classification_metric': ['TokenClassificationMetric'], } import sys diff --git a/modelscope/metrics/base.py b/modelscope/metrics/base.py index 1b9db825..3a9d810f 100644 --- a/modelscope/metrics/base.py +++ b/modelscope/metrics/base.py @@ -10,6 +10,9 @@ class Metric(ABC): complex metrics for a specific task with or without other Metric subclasses. """ + def __init__(self, trainer=None, *args, **kwargs): + self.trainer = trainer + @abstractmethod def add(self, outputs: Dict, inputs: Dict): """ Append logits and labels within an eval loop. 
diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 4df856f2..bd20d37b 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -20,7 +20,9 @@ class MetricKeys(object): task_default_metrics = { Tasks.image_segmentation: [Metrics.image_ins_seg_coco_metric], Tasks.sentence_similarity: [Metrics.seq_cls_metric], + Tasks.nli: [Metrics.seq_cls_metric], Tasks.sentiment_classification: [Metrics.seq_cls_metric], + Tasks.token_classification: [Metrics.token_cls_metric], Tasks.text_generation: [Metrics.text_gen_metric], Tasks.image_denoising: [Metrics.image_denoise_metric], Tasks.image_color_enhancement: [Metrics.image_color_enhance_metric], diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py index dabdb725..04b0ee81 100644 --- a/modelscope/metrics/sequence_classification_metric.py +++ b/modelscope/metrics/sequence_classification_metric.py @@ -17,14 +17,14 @@ class SequenceClassificationMetric(Metric): """The metric computation class for sequence classification classes. """ - label_name = 'labels' - - def __init__(self): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) self.preds = [] self.labels = [] def add(self, outputs: Dict, inputs: Dict): - ground_truths = inputs[self.label_name] + label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS + ground_truths = inputs[label_name] eval_results = outputs[OutputKeys.LOGITS] self.preds.append( torch_nested_numpify(torch_nested_detach(eval_results))) diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py new file mode 100644 index 00000000..8606148e --- /dev/null +++ b/modelscope/metrics/token_classification_metric.py @@ -0,0 +1,123 @@ +import importlib +from typing import Dict, List, Optional, Union + +import numpy as np + +from modelscope.outputs import OutputKeys +from ..metainfo import Metrics +from ..utils.registry import default_group +from ..utils.tensor_utils import torch_nested_detach, torch_nested_numpify +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.token_cls_metric) +class TokenClassificationMetric(Metric): + """ + The metric computation class for token-classification task. + Args: + return_entity_level_metrics (bool, *optional*): + Whether to return every label's detail metrics, default False. 
+ """ + + def add(self, outputs: Dict, inputs: Dict): + label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS + ground_truths = inputs[label_name] + eval_results = outputs[OutputKeys.LOGITS] + self.preds.append( + torch_nested_numpify(torch_nested_detach(eval_results))) + self.labels.append( + torch_nested_numpify(torch_nested_detach(ground_truths))) + + def __init__(self, return_entity_level_metrics=False, *args, **kwargs): + super().__init__(*args, **kwargs) + self.return_entity_level_metrics = return_entity_level_metrics + self.preds = [] + self.labels = [] + + def evaluate(self): + self.id2label = { + id: label + for label, id in self.trainer.label2id.items() + } + self.preds = np.concatenate(self.preds, axis=0) + self.labels = np.concatenate(self.labels, axis=0) + predictions = np.argmax(self.preds, axis=-1) + + true_predictions = [[ + self.id2label[p] for (p, lb) in zip(prediction, label) + if lb != -100 + ] for prediction, label in zip(predictions, self.labels)] + true_labels = [[ + self.id2label[lb] for (p, lb) in zip(prediction, label) + if lb != -100 + ] for prediction, label in zip(predictions, self.labels)] + + results = self._compute( + predictions=true_predictions, references=true_labels) + if self.return_entity_level_metrics: + final_results = {} + for key, value in results.items(): + if isinstance(value, dict): + for n, v in value.items(): + final_results[f'{key}_{n}'] = v + else: + final_results[key] = value + return final_results + else: + return { + MetricKeys.PRECISION: results[MetricKeys.PRECISION], + MetricKeys.RECALL: results[MetricKeys.RECALL], + MetricKeys.F1: results[MetricKeys.F1], + MetricKeys.ACCURACY: results[MetricKeys.ACCURACY], + } + + @staticmethod + def _compute( + predictions, + references, + suffix: bool = False, + scheme: Optional[str] = None, + mode: Optional[str] = None, + sample_weight: Optional[List[int]] = None, + zero_division: Union[str, int] = 'warn', + ): + from seqeval.metrics import accuracy_score, classification_report + if scheme is not None: + try: + scheme_module = importlib.import_module('seqeval.scheme') + scheme = getattr(scheme_module, scheme) + except AttributeError: + raise ValueError( + f'Scheme should be one of [IOB1, IOB2, IOE1, IOE2, IOBES, BILOU], got {scheme}' + ) + report = classification_report( + y_true=references, + y_pred=predictions, + suffix=suffix, + output_dict=True, + scheme=scheme, + mode=mode, + sample_weight=sample_weight, + zero_division=zero_division, + ) + report.pop('macro avg') + report.pop('weighted avg') + overall_score = report.pop('micro avg') + + scores = { + type_name: { + MetricKeys.PRECISION: score['precision'], + MetricKeys.RECALL: score['recall'], + MetricKeys.F1: score['f1-score'], + 'number': score['support'], + } + for type_name, score in report.items() + } + scores[MetricKeys.PRECISION] = overall_score['precision'] + scores[MetricKeys.RECALL] = overall_score['recall'] + scores[MetricKeys.F1] = overall_score['f1-score'] + scores[MetricKeys.ACCURACY] = accuracy_score( + y_true=references, y_pred=predictions) + return scores diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index fd556dd4..3b596769 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -10,6 +10,8 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.builder import build_model from modelscope.utils.config import Config from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile 
+from modelscope.utils.file_utils import func_receive_dict_inputs +from modelscope.utils.hub import parse_label_mapping from modelscope.utils.logger import get_logger logger = get_logger() @@ -69,6 +71,7 @@ class Model(ABC): def from_pretrained(cls, model_name_or_path: str, revision: Optional[str] = DEFAULT_MODEL_REVISION, + cfg_dict: Config = None, *model_args, **kwargs): """ Instantiate a model from local directory or remote model repo. Note @@ -87,25 +90,25 @@ class Model(ABC): ) local_model_dir = snapshot_download(model_name_or_path, revision) logger.info(f'initialize model from {local_model_dir}') - cfg = Config.from_file( - osp.join(local_model_dir, ModelFile.CONFIGURATION)) + if cfg_dict is not None: + cfg = cfg_dict + else: + cfg = Config.from_file( + osp.join(local_model_dir, ModelFile.CONFIGURATION)) task_name = cfg.task model_cfg = cfg.model - assert hasattr( - cfg, 'pipeline'), 'pipeline config is missing from config file.' - pipeline_cfg = cfg.pipeline # TODO @wenmeng.zwm may should manually initialize model after model building if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): model_cfg.type = model_cfg.model_type model_cfg.model_dir = local_model_dir - for k, v in kwargs.items(): model_cfg[k] = v model = build_model( model_cfg, task_name=task_name, default_args=kwargs) # dynamically add pipeline info to model for pipeline inference - model.pipeline = pipeline_cfg + if hasattr(cfg, 'pipeline'): + model.pipeline = cfg.pipeline return model diff --git a/modelscope/models/base/base_torch_model.py b/modelscope/models/base/base_torch_model.py index 52d4460c..cfc88721 100644 --- a/modelscope/models/base/base_torch_model.py +++ b/modelscope/models/base/base_torch_model.py @@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Union import torch from torch import nn +from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from .base_model import Model @@ -20,6 +21,13 @@ class TorchModel(Model, torch.nn.Module): super().__init__(model_dir, *args, **kwargs) torch.nn.Module.__init__(self) + def __call__(self, input: Dict[str, + torch.Tensor]) -> Dict[str, torch.Tensor]: + if func_receive_dict_inputs(self.forward): + return self.postprocess(self.forward(input)) + else: + return self.postprocess(self.forward(**input)) + def forward(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: raise NotImplementedError @@ -50,6 +58,3 @@ class TorchModel(Model, torch.nn.Module): elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - - def compute_loss(self, outputs: Dict[str, Any], labels): - raise NotImplementedError() diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index f2219b0e..24e65ef1 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -4,32 +4,26 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .backbones import (SbertModel, SpaceGenerator, SpaceModelBase, - GPT3Model) + from .backbones import SbertModel from .heads import SequenceClassificationHead from .bert_for_sequence_classification import BertForSequenceClassification from .csanmt_for_translation import CsanmtForTranslation from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM, BertForMaskedLM) from .nncrf_for_named_entity_recognition import TransformerCRFForNamedEntityRecognition - from .palm_for_text_generation import PalmForTextGeneration - from 
.sbert_for_nli import SbertForNLI - from .sbert_for_sentence_similarity import SbertForSentenceSimilarity - from .sbert_for_sentiment_classification import SbertForSentimentClassification - from .sbert_for_token_classification import SbertForTokenClassification - from .sbert_for_zero_shot_classification import SbertForZeroShotClassification - from .sequence_classification import SequenceClassificationModel - from .space_for_dialog_intent_prediction import SpaceForDialogIntent - from .space_for_dialog_modeling import SpaceForDialogModeling - from .space_for_dialog_state_tracking import SpaceForDialogStateTracking - from .task_model import SingleBackboneTaskModelBase + from .palm_v2 import PalmForTextGeneration + from .token_classification import SbertForTokenClassification + from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification + from .space import SpaceForDialogIntent + from .space import SpaceForDialogModeling + from .space import SpaceForDialogStateTracking + from .task_models.task_model import SingleBackboneTaskModelBase from .bart_for_text_error_correction import BartForTextErrorCorrection - from .gpt3_for_text_generation import GPT3ForTextGeneration + from .gpt3 import GPT3ForTextGeneration else: _import_structure = { - 'backbones': - ['SbertModel', 'SpaceGenerator', 'SpaceModelBase', 'GPT3Model'], + 'backbones': ['SbertModel'], 'heads': ['SequenceClassificationHead'], 'csanmt_for_translation': ['CsanmtForTranslation'], 'bert_for_sequence_classification': ['BertForSequenceClassification'], @@ -37,21 +31,17 @@ else: ['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'], 'nncrf_for_named_entity_recognition': ['TransformerCRFForNamedEntityRecognition'], - 'palm_for_text_generation': ['PalmForTextGeneration'], - 'sbert_for_nli': ['SbertForNLI'], - 'sbert_for_sentence_similarity': ['SbertForSentenceSimilarity'], - 'sbert_for_sentiment_classification': - ['SbertForSentimentClassification'], - 'sbert_for_token_classification': ['SbertForTokenClassification'], - 'sbert_for_zero_shot_classification': - ['SbertForZeroShotClassification'], - 'sequence_classification': ['SequenceClassificationModel'], - 'space_for_dialog_intent_prediction': ['SpaceForDialogIntent'], - 'space_for_dialog_modeling': ['SpaceForDialogModeling'], - 'space_for_dialog_state_tracking': ['SpaceForDialogStateTracking'], + 'palm_v2': ['PalmForTextGeneration'], + 'token_classification': ['SbertForTokenClassification'], + 'sequence_classification': + ['VecoForSequenceClassification', 'SbertForSequenceClassification'], + 'space': [ + 'SpaceForDialogIntent', 'SpaceForDialogModeling', + 'SpaceForDialogStateTracking' + ], 'task_model': ['SingleBackboneTaskModelBase'], 'bart_for_text_error_correction': ['BartForTextErrorCorrection'], - 'gpt3_for_text_generation': ['GPT3ForTextGeneration'], + 'gpt3': ['GPT3ForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/backbones/__init__.py b/modelscope/models/nlp/backbones/__init__.py index ffe8ac05..749cf995 100644 --- a/modelscope/models/nlp/backbones/__init__.py +++ b/modelscope/models/nlp/backbones/__init__.py @@ -4,14 +4,10 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .space import SpaceGenerator, SpaceModelBase from .structbert import SbertModel - from .gpt3 import GPT3Model else: _import_structure = { - 'space': ['SpaceGenerator', 'SpaceModelBase'], 'structbert': ['SbertModel'], - 'gpt3': ['GPT3Model'] } import sys diff --git 
a/modelscope/models/nlp/backbones/space/__init__.py b/modelscope/models/nlp/backbones/space/__init__.py deleted file mode 100644 index a2be83ef..00000000 --- a/modelscope/models/nlp/backbones/space/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .model.generator import Generator as SpaceGenerator -from .model.model_base import SpaceModelBase diff --git a/modelscope/models/nlp/backbones/space/model/__init__.py b/modelscope/models/nlp/backbones/space/model/__init__.py deleted file mode 100644 index 7e1b5264..00000000 --- a/modelscope/models/nlp/backbones/space/model/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .gen_unified_transformer import GenUnifiedTransformer -from .intent_unified_transformer import IntentUnifiedTransformer -from .unified_transformer import UnifiedTransformer diff --git a/modelscope/models/nlp/backbones/structbert.py b/modelscope/models/nlp/backbones/structbert.py new file mode 100644 index 00000000..125db040 --- /dev/null +++ b/modelscope/models/nlp/backbones/structbert.py @@ -0,0 +1,54 @@ +from transformers import PreTrainedModel + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import BACKBONES +from modelscope.models.nlp.structbert import SbertConfig +from modelscope.models.nlp.structbert import SbertModel as SbertModelTransform +from modelscope.utils.constant import Fields +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) + + +@BACKBONES.register_module(Fields.nlp, module_name=Models.structbert) +class SbertModel(TorchModel, SbertModelTransform): + + def __init__(self, model_dir=None, add_pooling_layer=True, **config): + """ + Args: + model_dir (str, optional): The model checkpoint directory. Defaults to None. + add_pooling_layer (bool, optional): to decide if pool the output from hidden layer. Defaults to True. + """ + config = SbertConfig(**config) + super().__init__(model_dir) + self.config = config + SbertModelTransform.__init__(self, config, add_pooling_layer) + + def extract_sequence_outputs(self, outputs): + return outputs['last_hidden_state'] + + def extract_pooled_outputs(self, outputs): + return outputs['pooler_output'] + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return SbertModelTransform.forward( + self, input_ids, attention_mask, token_type_ids, position_ids, + head_mask, inputs_embeds, encoder_hidden_states, + encoder_attention_mask, past_key_values, use_cache, + output_attentions, output_hidden_states, return_dict) diff --git a/modelscope/models/nlp/backbones/structbert/__init__.py b/modelscope/models/nlp/backbones/structbert/__init__.py deleted file mode 100644 index 1d147730..00000000 --- a/modelscope/models/nlp/backbones/structbert/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import TYPE_CHECKING - -from modelscope.utils.import_utils import LazyImportModule - -if TYPE_CHECKING: - from .modeling_sbert import SbertModel -else: - _import_structure = {'modeling_sbert': ['SbertModel']} - - import sys - - sys.modules[__name__] = LazyImportModule( - __name__, - globals()['__file__'], - _import_structure, - module_spec=__spec__, - extra_objects={}, - ) diff --git a/modelscope/models/nlp/backbones/structbert/modeling_sbert.py b/modelscope/models/nlp/backbones/structbert/modeling_sbert.py deleted file mode 100644 index 2e67a652..00000000 --- a/modelscope/models/nlp/backbones/structbert/modeling_sbert.py +++ /dev/null @@ -1,815 +0,0 @@ -import math -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from packaging import version -from torch import nn -from transformers import PreTrainedModel -from transformers.activations import ACT2FN -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, ModelOutput) -from transformers.modeling_utils import (apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer) - -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import BACKBONES -from modelscope.utils.constant import Fields -from modelscope.utils.logger import get_logger -from .configuration_sbert import SbertConfig - -logger = get_logger(__name__) - - -@BACKBONES.register_module(Fields.nlp, module_name=Models.structbert) -class SbertModel(TorchModel, PreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of - cross-attention is added between the self-attention layers, following the architecture described in `Attention is - all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, - Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration - set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` - argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an - input to the forward pass. - """ - - def __init__(self, model_dir=None, add_pooling_layer=True, **config): - """ - Args: - model_dir (str, optional): The model checkpoint directory. Defaults to None. - add_pooling_layer (bool, optional): to decide if pool the output from hidden layer. Defaults to True. - """ - config = SbertConfig(**config) - super().__init__(model_dir) - self.config = config - - self.embeddings = SbertEmbeddings(config) - self.encoder = SbertEncoder(config) - - self.pooler = SbertPooler(config) if add_pooling_layer else None - self.init_weights() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)` - , `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` - with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, - sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time' - ) - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError( - 'You have to specify either input_ids or inputs_embeds') - - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[ - 2] if past_key_values is not None else 0 - - if attention_mask is None: - attention_mask = torch.ones( - ((batch_size, seq_length + past_key_values_length)), - device=device) - - if token_type_ids is None: - if hasattr(self.embeddings, 'token_type_ids'): - buffered_token_type_ids = self.embeddings.token_type_ids[:, : - seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, device) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( - ) - encoder_hidden_shape = (encoder_batch_size, - encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones( - encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, - self.config.num_hidden_layers) - - embedding_output, orignal_embeds = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - return_inputs_embeds=True, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler( - sequence_output) if self.pooler is not None else None - - if not return_dict: - return (sequence_output, - pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) - - return BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - embedding_output=orignal_embeds) - - def extract_sequence_outputs(self, outputs): - return outputs['last_hidden_state'] - - def extract_pooled_outputs(self, outputs): - return outputs['pooler_output'] - - -class SbertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings.""" - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding( - config.vocab_size, - config.hidden_size, - padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - self.register_buffer( - 'position_ids', - 
torch.arange(config.max_position_embeddings).expand((1, -1))) - if version.parse(torch.__version__) > version.parse('1.6.0'): - self.register_buffer( - 'token_type_ids', - torch.zeros( - self.position_ids.size(), - dtype=torch.long, - device=self.position_ids.device), - persistent=False, - ) - - def forward(self, - input_ids=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - past_key_values_length=0, - return_inputs_embeds=False): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = self.position_ids[:, - past_key_values_length:seq_length - + past_key_values_length] - - # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids - # issue #5664 - if token_type_ids is None: - if hasattr(self, 'token_type_ids'): - buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, - dtype=torch.long, - device=self.position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == 'absolute': - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - if not return_inputs_embeds: - return embeddings - else: - return embeddings, inputs_embeds - - -class SbertSelfAttention(nn.Module): - - def __init__(self, config): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, 'embedding_size'): - raise ValueError( - f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' - f'heads ({config.num_attention_heads})') - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size - / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) - - self.is_decoder = config.is_decoder - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - mixed_query_layer = 
self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, - key_layer.transpose(-1, -2)) - - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to( - dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == 'relative_key': - relative_position_scores = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == 'relative_key_query': - relative_position_scores_query = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - relative_position_scores_key = torch.einsum( - 'bhrd,lrd->bhlr', key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in SbertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. 
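For the relative_key / relative_key_query variants above, the extra score term is looked up from a table with 2 * max_position_embeddings - 1 rows, indexed by the shifted pairwise distance i - j. A small sketch of how that index matrix is built (values chosen only for illustration):

import torch

seq_length, max_position_embeddings = 5, 512
position_ids_l = torch.arange(seq_length).view(-1, 1)
position_ids_r = torch.arange(seq_length).view(1, -1)
# distance[i, j] = i - j; adding max_position_embeddings - 1 shifts it into a valid row index
distance = position_ids_l - position_ids_r
index = distance + max_position_embeddings - 1
print(index[0, :3])  # tensor([511, 510, 509])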
- attention_probs = nn.Softmax(dim=-1)(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + ( - self.all_head_size, ) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, - attention_probs) if output_attentions else (context_layer, ) - - if self.is_decoder: - outputs = outputs + (past_key_value, ) - return outputs - - -class SbertSelfOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class SbertAttention(nn.Module): - - def __init__(self, config): - super().__init__() - self.self = SbertSelfAttention(config) - self.output = SbertSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, - self.self.attention_head_size, self.pruned_heads) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len( - heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them - return outputs - - -class SbertIntermediate(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class SbertOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, 
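Putting the pieces of SbertSelfAttention together, the per-head computation is standard scaled dot-product attention with an additive mask and dropout applied to the attention probabilities. A self-contained summary under those assumptions (the function name is ours, and the exact ordering of the relative-position terms is omitted):

import math
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v, additive_mask=None, dropout_p=0.0):
    # q, k, v: [batch, heads, seq_len, head_dim]; additive_mask broadcasts to the score shape
    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(q.size(-1))
    if additive_mask is not None:
        scores = scores + additive_mask
    probs = F.dropout(torch.softmax(scores, dim=-1), p=dropout_p)
    return torch.matmul(probs, v), probs

q = k = v = torch.randn(1, 12, 8, 64)
context, probs = scaled_dot_product_attention(q, k, v)
print(context.shape, probs.shape)  # torch.Size([1, 12, 8, 64]) torch.Size([1, 12, 8, 8])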
eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class SbertLayer(nn.Module): - - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = SbertAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - if not self.is_decoder: - raise ValueError( - f'{self} should be used as a decoder model if cross attention is added' - ) - self.crossattention = SbertAttention(config) - self.intermediate = SbertIntermediate(config) - self.output = SbertOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[: - 2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] - - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[ - 1:] # add self attentions if we output attention weights - - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - if not hasattr(self, 'crossattention'): - raise ValueError( - f'If `encoder_hidden_states` are passed, {self} has to be instantiated' - f'with cross-attention layers by setting `config.add_cross_attention=True`' - ) - - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[ - -2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[ - 1:-1] # add cross attentions if we output attention weights - - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value - - layer_output = apply_chunking_to_forward(self.feed_forward_chunk, - self.chunk_size_feed_forward, - self.seq_len_dim, - attention_output) - outputs = (layer_output, ) + outputs - - # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value, ) - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -class SbertEncoder(nn.Module): - - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList( - [SbertLayer(config) for _ in 
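apply_chunking_to_forward in SbertLayer runs the position-wise feed-forward slice by slice along the sequence axis so peak activation memory stays bounded; because the FFN acts on each position independently, the result equals one full pass. A minimal equivalent, assuming a simple two-layer FFN stand-in:

import torch

def chunked_feed_forward(ff, hidden_states, chunk_size, seq_len_dim=1):
    # run ff on slices along the sequence dimension and concatenate the results
    if chunk_size == 0:
        return ff(hidden_states)
    chunks = hidden_states.split(chunk_size, dim=seq_len_dim)
    return torch.cat([ff(chunk) for chunk in chunks], dim=seq_len_dim)

ff = torch.nn.Sequential(torch.nn.Linear(16, 64), torch.nn.GELU(), torch.nn.Linear(64, 16))
x = torch.randn(2, 10, 16)
assert torch.allclose(chunked_feed_forward(ff, x, chunk_size=4), ff(x), atol=1e-6)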
range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = ( - ) if output_attentions and self.config.add_cross_attention else None - - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) - use_cache = False - - def create_custom_forward(module): - - def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1], ) - if output_attentions: - all_self_attentions = all_self_attentions + ( - layer_outputs[1], ) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - if not return_dict: - return tuple(v for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class SbertPooler(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -@dataclass -class SbertForPreTrainingOutput(ModelOutput): - """ - Output type of :class:`~structbert.utils.BertForPreTraining`. - - Args: - loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
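When gradient_checkpointing is enabled in SbertEncoder, each layer's activations are recomputed during the backward pass instead of being stored, trading compute for memory (and, as the warning in the code notes, this is incompatible with use_cache). The core mechanism in isolation, using a single linear layer as a stand-in:

import torch
from torch.utils.checkpoint import checkpoint

layer = torch.nn.Linear(16, 16)
x = torch.randn(4, 16, requires_grad=True)

# forward under checkpointing: activations inside `layer` are not kept,
# they are recomputed when backward() reaches this segment
y = checkpoint(layer, x)
y.sum().backward()
print(x.grad.shape)  # torch.Size([4, 16])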
- seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when - ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, - sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( - BaseModelOutputWithPoolingAndCrossAttentions): - embedding_output: torch.FloatTensor = None - logits: Optional[Union[tuple, torch.FloatTensor]] = None - kwargs: dict = None diff --git a/modelscope/models/nlp/backbones/gpt3/__init__.py b/modelscope/models/nlp/gpt3/__init__.py similarity index 76% rename from modelscope/models/nlp/backbones/gpt3/__init__.py rename to modelscope/models/nlp/gpt3/__init__.py index b0739c22..076a0c6b 100644 --- a/modelscope/models/nlp/backbones/gpt3/__init__.py +++ b/modelscope/models/nlp/gpt3/__init__.py @@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .configuration_gpt3 import GPT3Config from .modeling_gpt3 import GPT3Model + from .gpt3_for_text_generation import GPT3ForTextGeneration else: _import_structure = { 'configuration_gpt3': ['GPT3Config'], - 'modeling_gpt3': ['GPT3Model'] + 'modeling_gpt3': ['GPT3Model'], + 'gpt3_for_text_generation': ['GPT3ForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/backbones/gpt3/configuration_gpt3.py b/modelscope/models/nlp/gpt3/configuration_gpt3.py similarity index 100% rename from modelscope/models/nlp/backbones/gpt3/configuration_gpt3.py rename to modelscope/models/nlp/gpt3/configuration_gpt3.py diff --git a/modelscope/models/nlp/gpt3_for_text_generation.py b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py similarity index 97% rename from modelscope/models/nlp/gpt3_for_text_generation.py rename to modelscope/models/nlp/gpt3/gpt3_for_text_generation.py index 22a6458d..6bdcb431 100644 --- a/modelscope/models/nlp/gpt3_for_text_generation.py +++ b/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py @@ -20,7 +20,7 @@ class GPT3ForTextGeneration(TorchModel): """ super().__init__(model_dir, *args, **kwargs) - from modelscope.models.nlp import GPT3Model + from modelscope.models.nlp.gpt3 import GPT3Model from transformers import BertTokenizer self.model = GPT3Model.from_pretrained(model_dir) diff --git a/modelscope/models/nlp/backbones/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py similarity index 100% rename from 
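BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding only adds the raw token embeddings (plus optional logits/kwargs) to the standard transformers output so downstream heads can reach the pre-encoder embeddings. A toy re-declaration showing how such an extended output is consumed (the class name WithEmbedding and the zero tensors are illustrative only):

from dataclasses import dataclass

import torch
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions

@dataclass
class WithEmbedding(BaseModelOutputWithPoolingAndCrossAttentions):
    embedding_output: torch.FloatTensor = None

out = WithEmbedding(last_hidden_state=torch.zeros(1, 4, 8), embedding_output=torch.zeros(1, 4, 8))
print(out.embedding_output.shape, out['last_hidden_state'].shape)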
modelscope/models/nlp/backbones/gpt3/modeling_gpt3.py rename to modelscope/models/nlp/gpt3/modeling_gpt3.py diff --git a/modelscope/models/nlp/heads/__init__.py b/modelscope/models/nlp/heads/__init__.py index 6ae43f6d..19194d3a 100644 --- a/modelscope/models/nlp/heads/__init__.py +++ b/modelscope/models/nlp/heads/__init__.py @@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .sequence_classification_head import SequenceClassificationHead + from .torch_pretrain_head import BertMLMHead, RobertaMLMHead else: _import_structure = { - 'sequence_classification_head': ['SequenceClassificationHead'] + 'sequence_classification_head': ['SequenceClassificationHead'], + 'torch_pretrain_head': ['BertMLMHead', 'RobertaMLMHead'], } import sys diff --git a/modelscope/models/nlp/heads/sequence_classification_head.py b/modelscope/models/nlp/heads/sequence_classification_head.py index 8c6e2188..92f3a4ec 100644 --- a/modelscope/models/nlp/heads/sequence_classification_head.py +++ b/modelscope/models/nlp/heads/sequence_classification_head.py @@ -1,5 +1,4 @@ -import importlib -from typing import Dict, List, Optional, Union +from typing import Dict import torch import torch.nn.functional as F diff --git a/modelscope/models/nlp/heads/torch_pretrain_head.py b/modelscope/models/nlp/heads/torch_pretrain_head.py new file mode 100644 index 00000000..6ff6c96f --- /dev/null +++ b/modelscope/models/nlp/heads/torch_pretrain_head.py @@ -0,0 +1,26 @@ +from typing import Dict + +import torch +from transformers.models.bert.modeling_bert import BertOnlyMLMHead +from transformers.models.roberta.modeling_roberta import RobertaLMHead + +from modelscope.metainfo import Heads +from modelscope.models.base import TorchHead +from modelscope.models.builder import HEADS +from modelscope.utils.constant import Tasks + + +@HEADS.register_module(Tasks.fill_mask, module_name=Heads.bert_mlm) +class BertMLMHead(BertOnlyMLMHead, TorchHead): + + def compute_loss(self, outputs: Dict[str, torch.Tensor], + labels) -> Dict[str, torch.Tensor]: + raise NotImplementedError() + + +@HEADS.register_module(Tasks.fill_mask, module_name=Heads.roberta_mlm) +class RobertaMLMHead(RobertaLMHead, TorchHead): + + def compute_loss(self, outputs: Dict[str, torch.Tensor], + labels) -> Dict[str, torch.Tensor]: + raise NotImplementedError() diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py index ffe9631d..ff16335f 100644 --- a/modelscope/models/nlp/masked_language.py +++ b/modelscope/models/nlp/masked_language.py @@ -1,72 +1,115 @@ -from typing import Dict +from typing import Any, Dict, Optional, Union import numpy as np +from transformers import BertForMaskedLM as BertForMaskedLMTransformer from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.base import Tensor +from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.structbert import SbertForMaskedLM +from modelscope.models.nlp.veco import \ + VecoForMaskedLM as VecoForMaskedLMTransformer +from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks __all__ = ['BertForMaskedLM', 'StructBertForMaskedLM', 'VecoForMaskedLM'] -class MaskedLanguageModelBase(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - super().__init__(model_dir, *args, **kwargs) - self.model = self.build_model() - - def build_model(self): - raise NotImplementedError() - - def train(self): - return 
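The new BertMLMHead / RobertaMLMHead are thin registrations of the stock transformers MLM heads, which map encoder hidden states to vocabulary logits. A quick look at what the wrapped transformers head does on its own (tiny config values chosen just for the demo):

import torch
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertOnlyMLMHead

config = BertConfig(hidden_size=32, vocab_size=100, num_hidden_layers=1,
                    num_attention_heads=2, intermediate_size=64)
head = BertOnlyMLMHead(config)
sequence_output = torch.randn(2, 6, config.hidden_size)
logits = head(sequence_output)  # [batch, seq_len, vocab_size]
print(logits.shape)  # torch.Size([2, 6, 100])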
self.model.train() - - def eval(self): - return self.model.eval() - - @property - def config(self): - if hasattr(self.model, 'config'): - return self.model.config - return None - - def forward(self, input: Dict[str, Tensor]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - """ - rst = self.model( - input_ids=input['input_ids'], - attention_mask=input['attention_mask'], - token_type_ids=input['token_type_ids']) - return {'logits': rst['logits'], 'input_ids': input['input_ids']} - - @MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert) -class StructBertForMaskedLM(MaskedLanguageModelBase): - - def build_model(self): - from sofa import SbertForMaskedLM - return SbertForMaskedLM.from_pretrained(self.model_dir) - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) -class VecoForMaskedLM(MaskedLanguageModelBase): - - def build_model(self): - from sofa import VecoForMaskedLM - return VecoForMaskedLM.from_pretrained(self.model_dir) +class StructBertForMaskedLM(TorchModel, SbertForMaskedLM): + + def __init__(self, config, model_dir): + super(TorchModel, self).__init__(model_dir) + SbertForMaskedLM.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None): + output = SbertForMaskedLM.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + labels=labels) + output[OutputKeys.INPUT_IDS] = input_ids + return output + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + return super(SbertForMaskedLM, StructBertForMaskedLM).from_pretrained( + pretrained_model_name_or_path=model_dir, model_dir=model_dir) @MODELS.register_module(Tasks.fill_mask, module_name=Models.bert) -class BertForMaskedLM(MaskedLanguageModelBase): +class BertForMaskedLM(TorchModel, BertForMaskedLMTransformer): + + def __init__(self, config, model_dir): + super(TorchModel, self).__init__(model_dir) + BertForMaskedLMTransformer.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None): + output = BertForMaskedLMTransformer.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + labels=labels) + output[OutputKeys.INPUT_IDS] = input_ids + return output + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + return super(BertForMaskedLMTransformer, + BertForMaskedLM).from_pretrained( + pretrained_model_name_or_path=model_dir, + model_dir=model_dir) - def build_model(self): - from transformers import BertForMaskedLM - return BertForMaskedLM.from_pretrained(self.model_dir) + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) +class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer): + + def __init__(self, config, model_dir): + super(TorchModel, self).__init__(model_dir) + VecoForMaskedLMTransformer.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None): + output = VecoForMaskedLMTransformer.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + 
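The _instantiate methods below lean on a detail of super(): super(SbertForMaskedLM, StructBertForMaskedLM) starts the attribute lookup after the transformers subclass in the wrapper's MRO while keeping cls bound to the wrapper, so the inherited from_pretrained constructs the registered wrapper class directly. The same mechanics with toy classes (names are ours):

class Base:
    @classmethod
    def build(cls):
        return f'{cls.__name__} via Base.build'

class HFModel(Base):
    @classmethod
    def build(cls):
        return f'{cls.__name__} via HFModel.build'

class Wrapper(HFModel):
    pass

# lookup starts *after* HFModel in Wrapper's MRO, but cls stays Wrapper
print(super(HFModel, Wrapper).build())  # Wrapper via Base.build
print(Wrapper.build())                  # Wrapper via HFModel.build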
token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + labels=labels) + output[OutputKeys.INPUT_IDS] = input_ids + return output + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + return super(VecoForMaskedLMTransformer, + VecoForMaskedLM).from_pretrained( + pretrained_model_name_or_path=model_dir, + model_dir=model_dir) diff --git a/modelscope/models/nlp/palm_v2/__init__.py b/modelscope/models/nlp/palm_v2/__init__.py new file mode 100644 index 00000000..3a9960ec --- /dev/null +++ b/modelscope/models/nlp/palm_v2/__init__.py @@ -0,0 +1,43 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .configuration_palm import PalmConfig + from .modeling_palm import ( + AbsSummarizer, + PalmForConditionalGeneration, + Translator, + ) + from .palm_for_text_generation import PalmForTextGeneration +else: + _import_structure = { + 'configuration_palm': ['PalmConfig'], + 'modeling_palm': + ['AbsSummarizer', 'PalmForConditionalGeneration', 'Translator'], + 'palm_for_text_generation': ['PalmForTextGeneration'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/palm_v2/configuration_palm.py b/modelscope/models/nlp/palm_v2/configuration_palm.py new file mode 100644 index 00000000..3b9e51fb --- /dev/null +++ b/modelscope/models/nlp/palm_v2/configuration_palm.py @@ -0,0 +1,116 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PALM model configuration """ + +from transformers.configuration_utils import PretrainedConfig + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class PalmConfig(PretrainedConfig): + r""" + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 30522): + Vocabulary size of the BERT model. 
Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or + :class:`~transformers.TFBertModel`. + hidden_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, `optional`, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 2): + The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or + :class:`~transformers.TFBertModel`. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layernorm_epsilon (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers. + dec_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer decoder. + attn_separate (:obj:`bool`, `optional`, defaults to false): + Whether or not to separate the q, k, v of attention. 
+ + Examples:: + + >>> from modelscope.models.nlp.palm_v2 import PalmForConditionalGeneration, PalmConfig + >>> configuration = PalmConfig() + + >>> # Initializing a model from the configuration + >>> model = PalmForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = 'palm' + + def __init__(self, + encoder='roberta', + encoder_pth='roberta-base', + max_pos=512, + share_emb=False, + dec_layers=12, + dec_hidden_size=768, + dec_heads=8, + dec_ff_size=3072, + dec_dropout=0.2, + use_bert_emb=True, + label_smoothing=0.1, + alpha=0.95, + beam_size=5, + min_length=40, + max_length=130, + sample_topk=False, + block_trigram=False, + **kwargs): + super().__init__(**kwargs) + self.encoder = encoder + self.encoder_pth = encoder_pth + self.max_pos = max_pos + self.share_emb = share_emb + self.dec_layers = dec_layers + self.dec_hidden_size = dec_hidden_size + self.dec_heads = dec_heads + self.dec_ff_size = dec_ff_size + self.dec_dropout = dec_dropout + self.use_bert_emb = use_bert_emb + self.label_smoothing = label_smoothing + # Translator + self.alpha = alpha + self.beam_size = beam_size + self.min_length = min_length + self.max_length = max_length + self.sample_topk = sample_topk + self.block_trigram = block_trigram diff --git a/modelscope/models/nlp/palm_v2/dureader_eval.py b/modelscope/models/nlp/palm_v2/dureader_eval.py new file mode 100644 index 00000000..db54f21d --- /dev/null +++ b/modelscope/models/nlp/palm_v2/dureader_eval.py @@ -0,0 +1,872 @@ +# ============================================================================== +# Copyright 2017 Baidu.com, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +This module computes evaluation metrics for DuReader dataset. 
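Returning to PalmConfig above: beyond the doctest-style example in its docstring, any of the decoder or beam-search fields can be overridden at construction time, with the rest keeping the defaults from __init__. A small hedged sketch (the encoder checkpoint name is illustrative, not a recommendation):

from modelscope.models.nlp.palm_v2 import PalmConfig

config = PalmConfig(encoder='bert', encoder_pth='bert-base-chinese',
                    dec_layers=6, beam_size=3, max_length=64)
print(config.encoder, config.dec_layers, config.beam_size)  # bert 6 3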
+""" + +import argparse +import copy +import math +import re +import sys +import zipfile +from collections import Counter, defaultdict + +import json +import numpy as np +from rouge import Rouge + +EMPTY = '' +YESNO_LABELS = set(['Yes', 'No', 'Depends']) + + +def my_lcs(string, sub): + """ + Calculates longest common subsequence for a pair of tokenized strings + :param string : list of str : tokens from a string split using whitespace + :param sub : list of str : shorter string, also split using whitespace + :returns: length (list of int): length of the longest common subsequence between the two strings + + Note: my_lcs only gives length of the longest common subsequence, not the actual LCS + """ + if (len(string) < len(sub)): + sub, string = string, sub + + lengths = [[0 for i in range(0, + len(sub) + 1)] + for j in range(0, + len(string) + 1)] + + for j in range(1, len(sub) + 1): + for i in range(1, len(string) + 1): + if (string[i - 1] == sub[j - 1]): + lengths[i][j] = lengths[i - 1][j - 1] + 1 + else: + lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) + + return lengths[len(string)][len(sub)] + + +class Bleu: + + def __init__(self, n=4): + # default compute Blue score up to 4 + self._n = n + self._hypo_for_image = {} + self.ref_for_image = {} + + def compute_score(self, gts, res): + assert (list(gts.keys()) == list(res.keys())) + imgIds = list(gts.keys()) + + bleu_scorer = BleuScorer(n=self._n) + for id in imgIds: + hypo = res[id] + ref = gts[id] + + # Sanity check. + assert (type(hypo) is list) + assert (len(hypo) == 1) + assert (type(ref) is list) + assert (len(ref) >= 1) + + bleu_scorer += (hypo[0], ref) + + score, scores = bleu_scorer.compute_score(option='closest', verbose=1) + return score, scores + + def method(self): + return 'Bleu' + + +def precook(s, n=4, out=False): + """Takes a string as input and returns an object that can be given to + either cook_refs or cook_test. This is optional: cook_refs and cook_test + can take string arguments as well.""" + words = s.split() + counts = defaultdict(int) + for k in range(1, n + 1): + for i in range(len(words) - k + 1): + ngram = tuple(words[i:i + k]) + counts[ngram] += 1 + return (len(words), counts) + + +def cook_refs(refs, eff=None, n=4): # lhuang: oracle will call with "average" + '''Takes a list of reference sentences for a single segment + and returns an object that encapsulates everything that BLEU + needs to know about them.''' + + reflen = [] + maxcounts = {} + for ref in refs: + rl, counts = precook(ref, n) + reflen.append(rl) + for (ngram, count) in counts.items(): + maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) + + # Calculate effective reference sentence length. + if eff == 'shortest': + reflen = min(reflen) + elif eff == 'average': + reflen = float(sum(reflen)) / len(reflen) + + # lhuang: N.B.: leave reflen computaiton to the very end!! + + # lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) + + return reflen, maxcounts + + +def cook_test(test, xxx_todo_changeme, eff=None, n=4): + '''Takes a test sentence and returns an object that + encapsulates everything that BLEU needs to know about it.''' + (reflen, refmaxcounts) = xxx_todo_changeme + testlen, counts = precook(test, n, True) + + result = {} + + # Calculate effective reference sentence length. 
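As its docstring says, my_lcs returns only the length of the longest common subsequence between two tokenized strings, not the subsequence itself. A worked call, assuming the module is importable from its new path and with made-up sentences:

from modelscope.models.nlp.palm_v2.dureader_eval import my_lcs

reference = 'the cat sat on the mat'.split()
candidate = 'the cat lay on the mat'.split()
print(my_lcs(reference, candidate))  # 5 ('the', 'cat', 'on', 'the', 'mat')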
+ + if eff == 'closest': + result['reflen'] = min((abs(ref - testlen), ref) for ref in reflen)[1] + else: # i.e., "average" or "shortest" or None + result['reflen'] = reflen + + result['testlen'] = testlen + + result['guess'] = [max(0, testlen - k + 1) for k in range(1, n + 1)] + + result['correct'] = [0] * n + for (ngram, count) in counts.items(): + result['correct'][len(ngram) - 1] += min( + refmaxcounts.get(ngram, 0), count) + + return result + + +class BleuScorer(object): + """Bleu scorer. + """ + + __slots__ = 'n', 'crefs', 'ctest', '_score', '_ratio', '_testlen', '_reflen', 'special_reflen' + + # special_reflen is used in oracle (proportional effective ref len for a node). + + def copy(self): + ''' copy the refs.''' + new = BleuScorer(n=self.n) + new.ctest = copy.copy(self.ctest) + new.crefs = copy.copy(self.crefs) + new._score = None + return new + + def __init__(self, test=None, refs=None, n=4, special_reflen=None): + ''' singular instance ''' + + self.n = n + self.crefs = [] + self.ctest = [] + self.cook_append(test, refs) + self.special_reflen = special_reflen + + def cook_append(self, test, refs): + '''called by constructor and __iadd__ to avoid creating new instances.''' + + if refs is not None: + self.crefs.append(cook_refs(refs)) + if test is not None: + cooked_test = cook_test(test, self.crefs[-1]) + self.ctest.append(cooked_test) # N.B.: -1 + else: + self.ctest.append( + None) # lens of crefs and ctest have to match + + self._score = None # need to recompute + + def ratio(self, option=None): + self.compute_score(option=option) + return self._ratio + + def score_ratio(self, option=None): + '''return (bleu, len_ratio) pair''' + return (self.fscore(option=option), self.ratio(option=option)) + + def score_ratio_str(self, option=None): + return '%.4f (%.2f)' % self.score_ratio(option) + + def reflen(self, option=None): + self.compute_score(option=option) + return self._reflen + + def testlen(self, option=None): + self.compute_score(option=option) + return self._testlen + + def retest(self, new_test): + if type(new_test) is str: + new_test = [new_test] + assert len(new_test) == len(self.crefs), new_test + self.ctest = [] + for t, rs in zip(new_test, self.crefs): + self.ctest.append(cook_test(t, rs)) + self._score = None + + return self + + def rescore(self, new_test): + ''' replace test(s) with new test(s), and returns the new score.''' + + return self.retest(new_test).compute_score() + + def size(self): + assert len(self.crefs) == len( + self.ctest), 'refs/test mismatch! %d<>%d' % (len( + self.crefs), len(self.ctest)) + return len(self.crefs) + + def __iadd__(self, other): + '''add an instance (e.g., from another sentence).''' + + if type(other) is tuple: + # avoid creating new BleuScorer instances + self.cook_append(other[0], other[1]) + else: + assert self.compatible(other), 'incompatible BLEUs.' 
+ self.ctest.extend(other.ctest) + self.crefs.extend(other.crefs) + self._score = None # need to recompute + + return self + + def compatible(self, other): + return isinstance(other, BleuScorer) and self.n == other.n + + def single_reflen(self, option='average'): + return self._single_reflen(self.crefs[0][0], option) + + def _single_reflen(self, reflens, option=None, testlen=None): + + if option == 'shortest': + reflen = min(reflens) + elif option == 'average': + reflen = float(sum(reflens)) / len(reflens) + elif option == 'closest': + reflen = min((abs(ref - testlen), ref) for ref in reflens)[1] + else: + assert False, 'unsupported reflen option %s' % option + + return reflen + + def recompute_score(self, option=None, verbose=0): + self._score = None + return self.compute_score(option, verbose) + + def compute_score(self, option=None, verbose=0): + n = self.n + small = 1e-9 + tiny = 1e-15 # so that if guess is 0 still return 0 + bleu_list = [[] for _ in range(n)] + + if self._score is not None: + return self._score + + if option is None: + option = 'average' if len(self.crefs) == 1 else 'closest' + + self._testlen = 0 + self._reflen = 0 + totalcomps = { + 'testlen': 0, + 'reflen': 0, + 'guess': [0] * n, + 'correct': [0] * n + } + + # for each sentence + for comps in self.ctest: + testlen = comps['testlen'] + self._testlen += testlen + + if self.special_reflen is None: # need computation + reflen = self._single_reflen(comps['reflen'], option, testlen) + else: + reflen = self.special_reflen + + self._reflen += reflen + + for key in ['guess', 'correct']: + for k in range(n): + totalcomps[key][k] += comps[key][k] + + # append per image bleu score + bleu = 1. + for k in range(n): + bleu *= (float(comps['correct'][k]) + tiny) / ( + float(comps['guess'][k]) + small) + bleu_list[k].append(bleu**(1. / (k + 1))) + ratio = (testlen + tiny) / (reflen + small + ) # N.B.: avoid zero division + if ratio < 1: + for k in range(n): + bleu_list[k][-1] *= math.exp(1 - 1 / ratio) + + if verbose > 1: + print(comps, reflen) + + totalcomps['reflen'] = self._reflen + totalcomps['testlen'] = self._testlen + + bleus = [] + bleu = 1. + for k in range(n): + bleu *= float(totalcomps['correct'][k] + tiny) / ( + totalcomps['guess'][k] + small) + bleus.append(bleu**(1. / (k + 1))) + ratio = (self._testlen + tiny) / (self._reflen + small + ) # N.B.: avoid zero division + if ratio < 1: + for k in range(n): + bleus[k] *= math.exp(1 - 1 / ratio) + + if verbose > 0: + print(totalcomps) + print('ratio:', ratio) + + self._score = bleus + return self._score, bleu_list + + +def normalize(s): + """ + Normalize strings to space joined chars. + + Args: + s: a list of strings. + + Returns: + A list of normalized strings. + """ + if not s: + return s + normalized = [] + for ss in s: + tokens = [c for c in list(ss) if len(c.strip()) != 0] + normalized.append(' '.join(tokens)) + return normalized + + +def data_check(obj, task): + """ + Check data. + + Raises: + Raises AssertionError when data is not legal. + """ + assert 'question_id' in obj, "Missing 'question_id' field." + assert 'question_type' in obj, \ + "Missing 'question_type' field. question_id: {}".format(obj['question_type']) + + assert 'yesno_answers' in obj, \ + "Missing 'yesno_answers' field. question_id: {}".format(obj['question_id']) + assert isinstance(obj['yesno_answers'], list), \ + r"""'yesno_answers' field must be a list, if the 'question_type' is not + 'YES_NO', then this field should be an empty list. 
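The Bleu / BleuScorer pair above can be exercised directly: given one hypothesis per question id and one or more references, compute_score returns corpus-level Bleu-1..4 plus per-sentence lists. A hedged usage sketch (ids and sentences are made up; compute_score also prints its verbose diagnostics):

from modelscope.models.nlp.palm_v2.dureader_eval import Bleu

gts = {'q1': ['the cat sat on the mat'], 'q2': ['a quick brown fox']}
res = {'q1': ['the cat sat on a mat'], 'q2': ['a quick brown fox']}
score, per_sentence = Bleu(4).compute_score(gts, res)
print(['%.3f' % s for s in score])  # one value per n-gram order, Bleu-1 .. Bleu-4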
+ question_id: {}""".format(obj['question_id']) + + assert 'entity_answers' in obj, \ + "Missing 'entity_answers' field. question_id: {}".format(obj['question_id']) + assert isinstance( + obj['entity_answers'], + list) and len(obj['entity_answers']) > 0, r"""'entity_answers' field + must be a list, and has at least one element, which can be a empty list. + question_id: {}""".format(obj['question_id']) + + +def read_file(file_name, task, is_ref=False): + """ + Read predict answers or reference answers from file. + + Args: + file_name: the name of the file containing predict result or reference + result. + + Returns: + A dictionary mapping question_id to the result information. The result + information itself is also a dictionary with has four keys: + - question_type: type of the query. + - yesno_answers: A list of yesno answers corresponding to 'answers'. + - answers: A list of predicted answers. + - entity_answers: A list, each element is also a list containing the entities + tagged out from the corresponding answer string. + """ + + def _open(file_name, mode, zip_obj=None): + if zip_obj is not None: + return zip_obj.open(file_name, mode) + return open(file_name, mode) + + results = {} + keys = ['answers', 'yesno_answers', 'entity_answers', 'question_type'] + if is_ref: + keys += ['source'] + + zf = zipfile.ZipFile(file_name, + 'r') if file_name.endswith('.zip') else None + file_list = [file_name] if zf is None else zf.namelist() + + for fn in file_list: + for line in _open(fn, 'r', zip_obj=zf): + try: + obj = json.loads(line.strip()) + except ValueError: + raise ValueError('Every line of data should be legal json') + data_check(obj, task) + qid = obj['question_id'] + assert qid not in results, 'Duplicate question_id: {}'.format(qid) + results[qid] = {} + for k in keys: + results[qid][k] = obj[k] + return results + + +def compute_bleu_rouge(pred_dict, ref_dict, bleu_order=4): + """ + Compute bleu and rouge scores. + """ + assert set(pred_dict.keys()) == set(ref_dict.keys()), \ + 'missing keys: {}'.format(set(ref_dict.keys()) - set(pred_dict.keys())) + scores = {} + bleu_scores, _ = Bleu(bleu_order).compute_score(ref_dict, pred_dict) + for i, bleu_score in enumerate(bleu_scores): + scores['Bleu-%d' % (i + 1)] = bleu_score + # rouge_score, _ = Rouge().compute_score(ref_dict, pred_dict) + rouge_score = Rouge().get_scores( + list(map(lambda x: x[0], pred_dict.values())), + list(map(lambda x: x[0], ref_dict.values()))) + rouge_score = sum([d['rouge-l']['f'] + for d in rouge_score]) / len(rouge_score) + scores['Rouge-L'] = rouge_score + return scores + + +def local_prf(pred_list, ref_list): + """ + Compute local precision recall and f1-score, + given only one prediction list and one reference list + """ + common = Counter(pred_list) & Counter(ref_list) + num_same = sum(common.values()) + if num_same == 0: + return 0, 0, 0 + p = 1.0 * num_same / len(pred_list) + r = 1.0 * num_same / len(ref_list) + f1 = (2 * p * r) / (p + r) + return p, r, f1 + + +def compute_prf(pred_dict, ref_dict): + """ + Compute precision recall and f1-score. 
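read_file expects every line of the prediction or reference file to be a standalone JSON object that passes data_check: question_id, question_type, a list-valued yesno_answers, and a non-empty entity_answers list. A minimal hypothetical prediction line (all values are made up):

import json

line = json.dumps({
    'question_id': 1001,
    'question_type': 'DESCRIPTION',
    'answers': ['示例答案'],
    'yesno_answers': [],
    'entity_answers': [[]],
}, ensure_ascii=False)
print(line)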
+ """ + # pred_question_ids = set(pred_dict.keys()) + ref_question_ids = set(ref_dict.keys()) + correct_preds, total_correct, total_preds = 0, 0, 0 + for question_id in ref_question_ids: + pred_entity_list = pred_dict.get(question_id, [[]]) + assert len(pred_entity_list) == 1, \ + 'the number of entity list for question_id {} is not 1.'.format(question_id) + pred_entity_list = pred_entity_list[0] + all_ref_entity_lists = ref_dict[question_id] + best_local_f1 = 0 + best_ref_entity_list = None + for ref_entity_list in all_ref_entity_lists: + local_f1 = local_prf(pred_entity_list, ref_entity_list)[2] + if local_f1 > best_local_f1: + best_ref_entity_list = ref_entity_list + best_local_f1 = local_f1 + if best_ref_entity_list is None: + if len(all_ref_entity_lists) > 0: + best_ref_entity_list = sorted( + all_ref_entity_lists, key=lambda x: len(x))[0] + else: + best_ref_entity_list = [] + gold_entities = set(best_ref_entity_list) + pred_entities = set(pred_entity_list) + correct_preds += len(gold_entities & pred_entities) + total_preds += len(pred_entities) + total_correct += len(gold_entities) + p = float(correct_preds) / total_preds if correct_preds > 0 else 0 + r = float(correct_preds) / total_correct if correct_preds > 0 else 0 + f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 + return {'Precision': p, 'Recall': r, 'F1': f1} + + +def prepare_prf(pred_dict, ref_dict): + """ + Prepares data for calculation of prf scores. + """ + preds = {k: v['entity_answers'] for k, v in pred_dict.items()} + refs = {k: v['entity_answers'] for k, v in ref_dict.items()} + return preds, refs + + +def filter_dict(result_dict, key_tag): + """ + Filter a subset of the result_dict, where keys ends with 'key_tag'. + """ + filtered = {} + for k, v in result_dict.items(): + if k.endswith(key_tag): + filtered[k] = v + return filtered + + +def get_metrics(pred_result, ref_result, task, source): + """ + Computes metrics. + """ + metrics = {} + + ref_result_filtered = {} + pred_result_filtered = {} + if source == 'both': + ref_result_filtered = ref_result + pred_result_filtered = pred_result + else: + for question_id, info in ref_result.items(): + if info['source'] == source: + ref_result_filtered[question_id] = info + if question_id in pred_result: + pred_result_filtered[question_id] = pred_result[ + question_id] + + if task == 'main' or task == 'all' \ + or task == 'description': + pred_dict, ref_dict = prepare_bleu(pred_result_filtered, + ref_result_filtered, task) + metrics = compute_bleu_rouge(pred_dict, ref_dict) + elif task == 'yesno': + pred_dict, ref_dict = prepare_bleu(pred_result_filtered, + ref_result_filtered, task) + keys = ['Yes', 'No', 'Depends'] + preds = [filter_dict(pred_dict, k) for k in keys] + refs = [filter_dict(ref_dict, k) for k in keys] + + metrics = compute_bleu_rouge(pred_dict, ref_dict) + + for k, pred, ref in zip(keys, preds, refs): + m = compute_bleu_rouge(pred, ref) + k_metric = [(k + '|' + key, v) for key, v in m.items()] + metrics.update(k_metric) + + elif task == 'entity': + pred_dict, ref_dict = prepare_prf(pred_result_filtered, + ref_result_filtered) + pred_dict_bleu, ref_dict_bleu = prepare_bleu(pred_result_filtered, + ref_result_filtered, task) + metrics = compute_prf(pred_dict, ref_dict) + metrics.update(compute_bleu_rouge(pred_dict_bleu, ref_dict_bleu)) + else: + raise ValueError('Illegal task name: {}'.format(task)) + + return metrics + + +def prepare_bleu(pred_result, ref_result, task): + """ + Prepares data for calculation of bleu and rouge scores. 
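Worked numbers help sanity-check the entity metric above: with one shared token out of two predicted and three reference tokens, precision is 1/2, recall 1/3, and F1 is 0.4. Assuming the module is importable from its new path:

from modelscope.models.nlp.palm_v2.dureader_eval import local_prf

p, r, f1 = local_prf(['北京', '上海'], ['北京', '广州', '深圳'])
print(round(p, 3), round(r, 3), round(f1, 3))  # 0.5 0.333 0.4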
+ """ + pred_list, ref_list = [], [] + qids = ref_result.keys() + for qid in qids: + if task == 'main': + pred, ref = get_main_result(qid, pred_result, ref_result) + elif task == 'yesno': + pred, ref = get_yesno_result(qid, pred_result, ref_result) + elif task == 'all': + pred, ref = get_all_result(qid, pred_result, ref_result) + elif task == 'entity': + pred, ref = get_entity_result(qid, pred_result, ref_result) + elif task == 'description': + pred, ref = get_desc_result(qid, pred_result, ref_result) + else: + raise ValueError('Illegal task name: {}'.format(task)) + if pred and ref: + pred_list += pred + ref_list += ref + pred_dict = dict(pred_list) + ref_dict = dict(ref_list) + for qid, ans in ref_dict.items(): + ref_dict[qid] = normalize(ref_dict[qid]) + pred_dict[qid] = normalize(pred_dict.get(qid, [EMPTY])) + if not ans or ans == [EMPTY]: + del ref_dict[qid] + del pred_dict[qid] + + for k, v in pred_dict.items(): + assert len(v) == 1, \ + 'There should be only one predict answer. question_id: {}'.format(k) + return pred_dict, ref_dict + + +def get_main_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'main'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. + Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + ref_ans = ref_result[qid]['answers'] + if not ref_ans: + ref_ans = [EMPTY] + pred_ans = pred_result.get(qid, {}).get('answers', [])[:1] + if not pred_ans: + pred_ans = [EMPTY] + + return [(qid, pred_ans)], [(qid, ref_ans)] + + +def get_entity_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'entity'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. + Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + if ref_result[qid]['question_type'] != 'ENTITY': + return None, None + return get_main_result(qid, pred_result, ref_result) + + +def get_desc_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'description'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. + Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + if ref_result[qid]['question_type'] != 'DESCRIPTION': + return None, None + return get_main_result(qid, pred_result, ref_result) + + +def get_yesno_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'yesno'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. 
+ Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + + def _uniq(li, is_ref): + uniq_li = [] + left = [] + keys = set() + for k, v in li: + if k not in keys: + uniq_li.append((k, v)) + keys.add(k) + else: + left.append((k, v)) + + if is_ref: + dict_li = dict(uniq_li) + for k, v in left: + dict_li[k] += v + uniq_li = [(k, v) for k, v in dict_li.items()] + return uniq_li + + def _expand_result(uniq_li): + expanded = uniq_li[:] + keys = set([x[0] for x in uniq_li]) + for k in YESNO_LABELS - keys: + expanded.append((k, [EMPTY])) + return expanded + + def _get_yesno_ans(qid, result_dict, is_ref=False): + if qid not in result_dict: + return [(str(qid) + '_' + k, v) for k, v in _expand_result([])] + yesno_answers = result_dict[qid]['yesno_answers'] + answers = result_dict[qid]['answers'] + lbl_ans = _uniq([(k, [v]) for k, v in zip(yesno_answers, answers)], + is_ref) + ret = [(str(qid) + '_' + k, v) for k, v in _expand_result(lbl_ans)] + return ret + + if ref_result[qid]['question_type'] != 'YES_NO': + return None, None + + ref_ans = _get_yesno_ans(qid, ref_result, is_ref=True) + pred_ans = _get_yesno_ans(qid, pred_result) + return pred_ans, ref_ans + + +def get_all_result(qid, pred_result, ref_result): + """ + Prepare answers for task 'all'. + + Args: + qid: question_id. + pred_result: A dict include all question_id's result information read + from args.pred_file. + ref_result: A dict incluce all question_id's result information read + from args.ref_file. + Returns: + Two lists, the first one contains predict result, the second + one contains reference result of the same question_id. Each list has + elements of tuple (question_id, answers), 'answers' is a list of strings. + """ + if ref_result[qid]['question_type'] == 'YES_NO': + return get_yesno_result(qid, pred_result, ref_result) + return get_main_result(qid, pred_result, ref_result) + + +def format_metrics(metrics, task, err_msg): + """ + Format metrics. 'err' field returns any error occured during evaluation. + + Args: + metrics: A dict object contains metrics for different tasks. + task: Task name. + err_msg: Exception raised during evaluation. + Returns: + Formatted result. 
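A rough sketch of what the yesno expansion above produces on the reference side: answers are grouped by their yesno label, duplicates for the same label are merged, and any missing label is padded with an empty answer so every label bucket is always scored under a '<qid>_<label>' key. The helper name and sample strings below are ours, and the padded-label order may vary:

YESNO_LABELS = {'Yes', 'No', 'Depends'}
EMPTY = ''

def expand_yesno(qid, yesno_answers, answers):
    merged = {}
    for label, ans in zip(yesno_answers, answers):
        merged.setdefault(label, []).append(ans)
    for label in YESNO_LABELS - merged.keys():
        merged[label] = [EMPTY]
    return {'%s_%s' % (qid, label): v for label, v in merged.items()}

print(expand_yesno(1001, ['Yes'], ['是的，可以。']))
# keys: 1001_Yes, 1001_No, 1001_Depends; the padded labels map to ['']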
+ """ + result = {} + sources = ['both', 'search', 'zhidao'] + if err_msg is not None: + return {'errorMsg': str(err_msg), 'errorCode': 1, 'data': []} + data = [] + if task != 'all' and task != 'main': + sources = ['both'] + + if task == 'entity': + metric_names = ['Bleu-4', 'Rouge-L'] + metric_names_prf = ['F1', 'Precision', 'Recall'] + for name in metric_names + metric_names_prf: + for src in sources: + obj = { + 'name': name, + 'value': round(metrics[src].get(name, 0) * 100, 2), + 'type': src, + } + data.append(obj) + elif task == 'yesno': + metric_names = ['Bleu-4', 'Rouge-L'] + details = ['Yes', 'No', 'Depends'] + src = sources[0] + for name in metric_names: + obj = { + 'name': name, + 'value': round(metrics[src].get(name, 0) * 100, 2), + 'type': 'All', + } + data.append(obj) + for d in details: + obj = { + 'name': name, + 'value': round(metrics[src].get(d + '|' + name, 0) * 100, + 2), + 'type': d + } + data.append(obj) + else: + metric_names = ['Bleu-4', 'Rouge-L'] + for name in metric_names: + for src in sources: + obj = { + 'name': name, + 'value': round(metrics[src].get(name, 0) * 100, 2), + 'type': src + } + data.append(obj) + + result['data'] = data + result['errorCode'] = 0 + result['errorMsg'] = 'success' + + return result + + +def main(args): + """ + Do evaluation. + """ + err = None + metrics = {} + try: + pred_result = read_file(args.pred_file, args.task) + ref_result = read_file(args.ref_file, args.task, is_ref=True) + sources = ['both', 'search', 'zhidao'] + if args.task not in set(['main', 'all']): + sources = sources[:1] + for source in sources: + metrics[source] = get_metrics(pred_result, ref_result, args.task, + source) + except ValueError as ve: + err = ve + except AssertionError as ae: + err = ae + + print( + json.dumps( + format_metrics(metrics, args.task, err), + ensure_ascii=False).encode('utf8')) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('pred_file', help='predict file') + parser.add_argument('ref_file', help='reference file') + parser.add_argument( + 'task', help='task name: Main|Yes_No|All|Entity|Description') + + args = parser.parse_args() + args.task = args.task.lower().replace('_', '') + main(args) diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/modeling_palm.py new file mode 100644 index 00000000..c2121cfd --- /dev/null +++ b/modelscope/models/nlp/palm_v2/modeling_palm.py @@ -0,0 +1,1332 @@ +import codecs +import copy +import math +import os +import subprocess +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import json +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn.init import xavier_uniform_ +from transformers import (BertConfig, BertModel, BertTokenizer, RobertaConfig, + RobertaModel, RobertaTokenizer) +from transformers.activations import ACT2FN +from transformers.modeling_utils import PreTrainedModel + +from modelscope.outputs import OutputKeys +from modelscope.utils import logger as logging +from .configuration_palm import PalmConfig +from .dureader_eval import compute_bleu_rouge, normalize + +CONFIG_NAME = 'config.json' +WEIGHTS_NAME = 'pytorch_model.bin' + + +class MultiHeadedAttention(nn.Module): # SelfAttention + """ + Multi-Head Attention module from + "Attention is All You Need" + :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. + + Similar to standard `dot` attention but uses + multiple attention distributions simulataneously + to select relevant items. + + .. 
mermaid:: + + graph BT + A[key] + B[value] + C[query] + O[output] + subgraph Attn + D[Attn 1] + E[Attn 2] + F[Attn N] + end + A --> D + C --> D + A --> E + C --> E + A --> F + C --> F + D --> O + E --> O + F --> O + B --> O + + Also includes several additional tricks. + + Args: + head_count (int): number of parallel heads + model_dim (int): the dimension of keys/values/queries, + must be divisible by head_count + dropout (float): dropout parameter + """ + + def __init__(self, + head_count, + model_dim, + dropout=0.1, + use_final_linear=True): + assert model_dim % head_count == 0 + self.dim_per_head = model_dim // head_count + self.model_dim = model_dim + + super().__init__() + self.head_count = head_count + + self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head) + self.linear_values = nn.Linear(model_dim, + head_count * self.dim_per_head) + self.linear_query = nn.Linear(model_dim, + head_count * self.dim_per_head) + self.softmax = nn.Softmax(dim=-1) + self.dropout = nn.Dropout(dropout) + self.use_final_linear = use_final_linear + if (self.use_final_linear): + self.final_linear = nn.Linear(model_dim, model_dim) + + def forward(self, + key, + value, + query, + mask=None, + layer_cache=None, + type=None, + predefined_graph_1=None, + return_attn=False): + """ + Compute the context vector and the attention vectors. + + Args: + key (`FloatTensor`): set of `key_len` + key vectors `[batch, key_len, dim]` + value (`FloatTensor`): set of `key_len` + value vectors `[batch, key_len, dim]` + query (`FloatTensor`): set of `query_len` + query vectors `[batch, query_len, dim]` + mask: binary mask indicating which keys have + non-zero attention `[batch, query_len, key_len]` + Returns: + (`FloatTensor`, `FloatTensor`) : + + * output context vectors `[batch, query_len, dim]` + * one of the attention vectors `[batch, query_len, key_len]` + """ + + batch_size = key.size(0) + dim_per_head = self.dim_per_head + head_count = self.head_count + + def shape(x): + """ projection """ + return x.view(batch_size, -1, head_count, dim_per_head) \ + .transpose(1, 2) + + def unshape(x): + """ compute context """ + return x.transpose(1, 2).contiguous() \ + .view(batch_size, -1, head_count * dim_per_head) + + # 1) Project key, value, and query. + if layer_cache is not None: + if type == 'self': + query, key, value = self.linear_query(query), self.linear_keys( + query), self.linear_values(query) + + key = shape(key) + value = shape(value) + + if layer_cache is not None: + device = key.device + if layer_cache['self_keys'] is not None: + key = torch.cat( + (layer_cache['self_keys'].to(device), key), dim=2) + if layer_cache['self_values'] is not None: + value = torch.cat( + (layer_cache['self_values'].to(device), value), + dim=2) + layer_cache['self_keys'] = key + layer_cache['self_values'] = value + elif type == 'context': + query = self.linear_query(query) + if layer_cache is not None: + if layer_cache['memory_keys'] is None: + key, value = self.linear_keys(key), self.linear_values( + value) + key = shape(key) + value = shape(value) + else: + key, value = layer_cache['memory_keys'], layer_cache[ + 'memory_values'] + layer_cache['memory_keys'] = key + layer_cache['memory_values'] = value + else: + key, value = self.linear_keys(key), self.linear_values( + value) + key = shape(key) + value = shape(value) + else: + key = self.linear_keys(key) + value = self.linear_values(value) + query = self.linear_query(query) + key = shape(key) + value = shape(value) + + query = shape(query) + + # 2) Calculate and scale scores. 
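+        # Shape sketch (illustrative): after shape(), query/key/value are
+        # [batch, head_count, seq_len, dim_per_head]; the scaled dot product
+        # scores = (Q / sqrt(dim_per_head)) @ K^T then has shape
+        # [batch, head_count, query_len, key_len], and masked positions are
+        # pushed to a large negative value before the softmax below.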
+ query = query / math.sqrt(dim_per_head) + scores = torch.matmul(query, key.transpose(2, 3)) + + if mask is not None: + mask = mask.unsqueeze(1).expand_as(scores) + scores = scores.masked_fill(mask, -1e18) + + # 3) Apply attention dropout and compute context vectors. + + attn = self.softmax(scores) + + if predefined_graph_1 is not None: + attn_masked = attn[:, -1] * predefined_graph_1 + attn_masked = attn_masked / ( + torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9) + + attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1) + + drop_attn = self.dropout(attn) + if self.use_final_linear: + context = unshape(torch.matmul(drop_attn, value)) + output = self.final_linear(context) + if return_attn: + return output, attn + else: + return output + else: + context = torch.matmul(drop_attn, value) + if return_attn: + return context, attn + else: + return context + + +class PositionwiseFeedForward(nn.Module): # Output + """ A two-layer Feed-Forward-Network with residual layer norm. + + Args: + d_model (int): the size of input for the first-layer of the FFN. + d_ff (int): the hidden layer size of the second-layer + of the FNN. + dropout (float): dropout probability in :math:`[0, 1)`. + """ + + def __init__(self, d_model, d_ff, dropout=0.1): + super().__init__() + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.w_1 = nn.Linear(d_model, d_ff) + self.actv = ACT2FN['gelu_new'] + self.dropout_1 = nn.Dropout(dropout) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x): + inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x)))) + output = self.dropout_2(self.w_2(inter)) + return output + x + + +class TransformerDecoderLayer(nn.Module): # Layer + """ + Args: + d_model (int): the dimension of keys/values/queries in + MultiHeadedAttention, also the input size of + the first-layer of the PositionwiseFeedForward. + heads (int): the number of heads for MultiHeadedAttention. + d_ff (int): the second-layer of the PositionwiseFeedForward. + dropout (float): dropout probability(0-1.0). + self_attn_type (string): type of self-attention scaled-dot, average + """ + MAX_SIZE = 5000 + + def __init__(self, d_model, heads, d_ff, dropout): + super().__init__() + + self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout) + + self.context_attn = MultiHeadedAttention( + heads, d_model, dropout=dropout) + self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) + self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6) + self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6) + self.drop = nn.Dropout(dropout) + mask = self._get_attn_subsequent_mask(self.MAX_SIZE) + # Register self.mask as a buffer in TransformerDecoderLayer, so + # it gets TransformerDecoderLayer's cuda behavior automatically. 
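+        # The buffer holds the upper-triangular "future" mask built by
+        # _get_attn_subsequent_mask; e.g. for size 3 it is
+        # [[0, 1, 1],
+        #  [0, 0, 1],
+        #  [0, 0, 0]]
+        # so position i can only attend to positions <= i during decoding.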
+ self.register_buffer('mask', mask) + + def forward(self, + inputs, + memory_bank, + src_pad_mask, + tgt_pad_mask, + previous_input=None, + layer_cache=None, + step=None): + """ + Args: + inputs (`FloatTensor`): `[batch_size x 1 x model_dim]` + memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]` + src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]` + tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]` + + Returns: + (`FloatTensor`, `FloatTensor`, `FloatTensor`): + + * output `[batch_size x 1 x model_dim]` + * attn `[batch_size x 1 x src_len]` + * all_input `[batch_size x current_step x model_dim]` + + """ + dec_mask = torch.gt( + tgt_pad_mask.type(torch.uint8) + + self.mask[:, :tgt_pad_mask.size(1), :tgt_pad_mask.size(1)].type( + torch.uint8), 0) + input_norm = self.layer_norm_1(inputs) + all_input = input_norm + if previous_input is not None: + all_input = torch.cat((previous_input, input_norm), dim=1) + dec_mask = None + + query = self.self_attn( + all_input, + all_input, + input_norm, + mask=dec_mask, + layer_cache=layer_cache, + type='self') + + query = self.drop(query) + inputs + + query_norm = self.layer_norm_2(query) + mid, attn = self.context_attn( + memory_bank, + memory_bank, + query_norm, + mask=src_pad_mask, + layer_cache=layer_cache, + type='context', + return_attn=True) + output = self.feed_forward(self.drop(mid) + query) + + return output, attn, all_input + + def _get_attn_subsequent_mask(self, size): + """ + Get an attention mask to avoid using the subsequent info. + + Args: + size: int + + Returns: + (`LongTensor`): + + * subsequent_mask `[1 x size x size]` + """ + attn_shape = (1, size, size) + subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8') + subsequent_mask = torch.from_numpy(subsequent_mask) + return subsequent_mask + + +class PositionalEncoding(nn.Module): + + def __init__(self, dropout, dim, max_len=5000): + super().__init__() + pe = torch.zeros(max_len, dim) + position = torch.arange(0, max_len).unsqueeze(1) + div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) + * -(math.log(10000.0) / dim))) + pe[:, 0::2] = torch.sin(position.float() * div_term) + pe[:, 1::2] = torch.cos(position.float() * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + self.dropout = nn.Dropout(dropout) + self.dim = dim + + def forward(self, emb, step=None): + emb = emb * math.sqrt(self.dim) + if (step): + emb = emb + self.pe[:, step][:, None, :] + + else: + emb = emb + self.pe[:, :emb.size(1)] + emb = self.dropout(emb) + return emb + + def get_emb(self, emb): + return self.pe[:, :emb.size(1)] + + +class TransformerDecoder(nn.Module): # Decoder + """ + The Transformer decoder from "Attention is All You Need". + + + .. mermaid:: + + graph BT + A[input] + B[multi-head self-attn] + BB[multi-head src-attn] + C[feed forward] + O[output] + A --> B + B --> BB + BB --> C + C --> O + + + Args: + num_layers (int): number of encoder layers. 
+ d_model (int): size of the model + heads (int): number of heads + d_ff (int): size of the inner FF layer + dropout (float): dropout parameters + embeddings (:obj:`onmt.modules.Embeddings`): + embeddings to use, should have positional encodings + attn_type (str): if using a seperate copy attention + """ + decoder_type = 'transformer' + + class TransformerDecoderState: + + def __init__(self, src): + self.src = src + self.previous_input = None + self.previous_layer_inputs = None + self.cache = None + + def update_state(self, new_input, previous_layer_inputs): + self.previous_input = new_input + self.previous_layer_inputs = previous_layer_inputs + self.cache = None + + def _init_cache(self, num_layers): + self.cache = {} + for num in range(num_layers): + layer_cache = { + 'memory_keys': None, + 'memory_values': None, + 'self_keys': None, + 'self_values': None + } + self.cache['layer_{}'.format(num)] = layer_cache + + def map_batch_fn(self, fn): + + def _recursive_map(struct, batch_dim=0): + for k, v in struct.items(): + if v is not None: + if isinstance(v, dict): + _recursive_map(v) + else: + struct[k] = fn(v, batch_dim) + + self.src = fn(self.src, 0) + if self.cache is not None: + _recursive_map(self.cache) + + def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings): + super().__init__() + + # Basic attributes. + self.num_layers = num_layers + self.embeddings = embeddings + self.pos_emb = PositionalEncoding(dropout, + self.embeddings.embedding_dim) + + # Build TransformerDecoder. + self.transformer_layers = nn.ModuleList([ + TransformerDecoderLayer(d_model, heads, d_ff, dropout) + for _ in range(num_layers) + ]) + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.state = None + + def init_state(self, src, with_cache=False): + self.state = self.TransformerDecoderState(src) + if with_cache: + self.state._init_cache(self.num_layers) + + def forward(self, tgt, memory_bank, step=None, memory_masks=None): + src_words = self.state.src + tgt_words = tgt + src_batch, src_len = src_words.size() + tgt_batch, tgt_len = tgt_words.size() + + # Run the forward pass of the TransformerDecoder. + # emb = self.embeddings(tgt, step=step) + emb = self.embeddings(tgt) + assert emb.dim() == 3 # len x batch x embedding_dim + output = self.pos_emb(emb, step) + + src_memory_bank = memory_bank + padding_idx = self.embeddings.padding_idx + tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1) \ + .expand(tgt_batch, tgt_len, tgt_len) + + if memory_masks is not None: + src_len = memory_masks.size(-1) + src_pad_mask = memory_masks.expand(src_batch, tgt_len, src_len) + else: + src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1) \ + .expand(src_batch, tgt_len, src_len) + + if self.state.cache is None: + saved_inputs = [] + attns = [] + for i in range(self.num_layers): + prev_layer_input = None + if self.state.cache is None: + if self.state.previous_input is not None: + prev_layer_input = self.state.previous_layer_inputs[i] + output, attn, all_input \ + = self.transformer_layers[i](output, src_memory_bank, src_pad_mask, tgt_pad_mask, + previous_input=prev_layer_input, + layer_cache=self.state.cache['layer_{}'.format(i)] + if self.state.cache is not None else None, step=step) + if self.state.cache is None: + saved_inputs.append(all_input) + attns.append(attn) + + if self.state.cache is None: + saved_inputs = torch.stack(saved_inputs) + + output = self.layer_norm(output) + + # Process the result and update the attentions. 
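+        # (`output` is the final decoder hidden state, shape
+        # [batch, tgt_len, d_model]; `attns` collects the cross-attention
+        # weights returned by each decoder layer.)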
+ if self.state.cache is None: + self.state.update_state(tgt, saved_inputs) + + return output, attns + + +class PalmPointerGenerator(nn.Module): + + def __init__(self, hidden_size, vocab_size): + super().__init__() + self.dense = nn.Linear(hidden_size, vocab_size) + self.gen_func = nn.LogSoftmax(-1) + + def forward(self, x): + x = self.dense(x) + x = self.gen_func(x) + return x + + +class PalmPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = PalmConfig + base_model_prefix = 'palm' + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Optional[Union[str, + os.PathLike]], + **kwargs): + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + config = PalmConfig.from_json_file(config_file) if os.path.isfile( + config_file) else PalmConfig() + config.encoder_pth = os.path.join(pretrained_model_name_or_path, + config.encoder_pth) + checkpoint_file = os.path.join(pretrained_model_name_or_path, + WEIGHTS_NAME) + checkpoint = torch.load(checkpoint_file) if os.path.isfile( + checkpoint_file) else None + return cls(config, checkpoint, **kwargs) + + +class AbsSummarizer(PalmPreTrainedModel): # Model + + def __init__(self, config, checkpoint=None): + super().__init__(config) + self.config = config + if config.encoder == 'bert' or config.encoder == 'zh_bert': + self.bert = BertModel( + BertConfig.from_pretrained(config.encoder_pth)) + elif config.encoder == 'roberta': + self.bert = RobertaModel( + RobertaConfig.from_pretrained(config.encoder_pth)) + + if (config.max_pos > 512): + my_pos_embeddings = nn.Embedding( + config.max_pos, self.bert.model.config.hidden_size) + my_pos_embeddings.weight.data[:512] = \ + self.bert.embeddings.position_embeddings.weight.data + my_pos_embeddings.weight.data[512:] = \ + self.bert.embeddings.position_embeddings.weight.data[-1][None, :].repeat(config.max_pos - 512, 1) + self.bert.model.embeddings.position_embeddings = my_pos_embeddings + self.vocab_size = self.bert.config.vocab_size + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + padding_idx=1 if config.encoder == 'roberta' else 0) + + if config.share_emb: + tgt_embeddings.weight = copy.deepcopy( + self.bert.model.embeddings.word_embeddings.weight) + self.decoder = TransformerDecoder( + config.dec_layers, + config.dec_hidden_size, + heads=config.dec_heads, + d_ff=config.dec_ff_size, + dropout=config.dec_dropout, + embeddings=tgt_embeddings) + self.generator = PalmPointerGenerator(config.dec_hidden_size, + self.vocab_size) + self.generator.dense.weight = self.decoder.embeddings.weight + + if checkpoint is not None: + for key in list(checkpoint['model'].keys()): + checkpoint['model'][key.replace('module.', + '')] = checkpoint['model'][key] + msg = self.load_state_dict(checkpoint['model'], strict=False) + print(msg) + else: + for module in self.decoder.modules(): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + for p in self.generator.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + else: + p.data.zero_() + if config.use_bert_emb: + if config.encoder == 'roberta': + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + 
padding_idx=1) + else: + tgt_embeddings = nn.Embedding( + self.vocab_size, + self.bert.config.hidden_size, + padding_idx=0) + tgt_embeddings.weight = copy.deepcopy( + self.bert.embeddings.word_embeddings.weight) + self.decoder.embeddings = tgt_embeddings + self.generator.dense.weight = self.decoder.embeddings.weight + + def forward(self, src, tgt, mask_src): + top_vec, _ = self.bert(src, mask_src, return_dict=False) + self.decoder.init_state(src) + decoder_outputs, attns = self.decoder(tgt[:, :-1], top_vec) + return decoder_outputs, attns[-1], top_vec + + +class LabelSmoothingLoss(nn.Module): + """ + With label smoothing, + KL-divergence between q_{smoothed ground truth prob.}(w) + and p_{prob. computed by model}(w) is minimized. + """ + + def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100): + assert 0.0 < label_smoothing <= 1.0 + self.padding_idx = ignore_index + super(LabelSmoothingLoss, self).__init__() + + smoothing_value = label_smoothing / (tgt_vocab_size - 2) + one_hot = torch.full((tgt_vocab_size, ), smoothing_value) + one_hot[self.padding_idx] = 0 + self.register_buffer('one_hot', one_hot.unsqueeze(0)) + self.confidence = 1.0 - label_smoothing + + def forward(self, output, target): + """ + output (FloatTensor): batch_size x n_classes + target (LongTensor): batch_size + """ + model_prob = self.one_hot.repeat(target.size(0), 1) + model_prob.scatter_(1, target.unsqueeze(1), self.confidence) + model_prob.masked_fill_((target == self.padding_idx).unsqueeze(1), 0) + + return F.kl_div(output, model_prob, reduction='sum') + + +class NMTLossCompute(nn.Module): + """ + Standard NMT Loss Computation. + """ + + def __init__(self, generator, symbols, vocab_size, label_smoothing=0.0): + super().__init__() + self.generator = generator + self.padding_idx = symbols['PAD'] + if label_smoothing > 0: + self.criterion = LabelSmoothingLoss( + label_smoothing, vocab_size, ignore_index=self.padding_idx) + else: + self.criterion = nn.NLLLoss( + ignore_index=self.padding_idx, reduction='sum') + + def _bottle(self, _v): + return _v.view(-1, _v.size(2)) + + def _unbottle(self, _v, batch_size): + return _v.view(-1, batch_size, _v.size(1)) + + def forward(self, tgt, output): + target = tgt[:, 1:] + normalization = target.ne(self.padding_idx).sum() + bottled_output = self._bottle(output) + scores = self.generator(bottled_output) + gtruth = target.contiguous().view(-1) + loss = self.criterion(scores, gtruth) + loss.div(float(normalization)) + return loss + + +class PalmForConditionalGeneration(PalmPreTrainedModel): + + def __init__(self, config, checkpoint=None): + super().__init__(config) + self.config = config + if config.encoder == 'roberta': + tokenizer = RobertaTokenizer.from_pretrained( + config.encoder_pth, do_lower_case=False) + symbols = { + 'BOS': tokenizer.cls_token_id, + 'EOS': tokenizer.sep_token_id, + 'PAD': tokenizer.pad_token_id, + 'EOQ': tokenizer.unk_token_id + } + elif config.encoder == 'bert' or config.encoder == 'zh_bert': + tokenizer = BertTokenizer.from_pretrained( + config.encoder_pth, do_lower_case=True) + symbols = { + 'BOS': tokenizer.vocab['[CLS]'], + 'EOS': tokenizer.vocab['[SEP]'], + 'PAD': tokenizer.vocab['[PAD]'], + 'EOQ': tokenizer.vocab['[unused2]'] + } + self.tokenizer = tokenizer + self.symbols = symbols + self.palm = AbsSummarizer(config, checkpoint) + self.loss = NMTLossCompute(self.palm.generator, symbols, + self.palm.vocab_size, + config.label_smoothing) + + def forward(self, src, tgt, mask_src): + output = self.palm(src, tgt, mask_src)[0] + loss = 
self.loss(tgt, output) + return loss + + +class Translator(nn.Module): + """ + Uses a model to translate a batch of sentences. + """ + + @dataclass + class Batch: + batch_size: int + src: torch.Tensor + tgt: torch.Tensor + mask_src: torch.Tensor + query_id: List[None] = None + src_str: List[List[str]] = None + tgt_str: List[str] = None + + def __init__(self, + model: PalmForConditionalGeneration, + dataset: str = 'cnn'): + super().__init__() + self.logger = logging.get_logger(__name__) + self.args = model.config + self.args.dataset = dataset + self.model = model.palm + self.generator = self.model.generator + self.vocab = model.tokenizer + self.symbols = model.symbols + self.start_token = self.symbols['BOS'] + self.end_token = self.symbols['EOS'] + self.alpha = self.args.alpha + self.beam_size = self.args.beam_size + self.min_length = self.args.min_length + self.max_length = self.args.max_length + + def from_batch(self, translation_batch): + batch = translation_batch['batch'] + assert (len(translation_batch['gold_score']) == len( + translation_batch['predictions'])) + batch_size = batch.batch_size + + preds, pred_score, _, tgt_str, src, src_str = \ + translation_batch['predictions'], translation_batch['scores'], translation_batch['gold_score'], \ + batch.tgt_str, batch.src, batch.src_str + query_id = batch.query_id + ''' + try: + query_id = batch.query_id + except: + query_id = None + ''' + translations = [] + for b in range(batch_size): + if self.args.dataset == 'qg_ranking_test': + if self.args.encoder == 'bert' or self.args.encoder == 'zh_bert': + pred_sents = [ + ' '.join( + self.vocab.convert_ids_to_tokens( + [int(n) for n in each])).replace(' ##', '') + for each in preds[b] + ] + elif self.args.encoder == 'roberta': + pred_sents = [ + self.vocab.decode([int(n) for n in each + ]).replace('', + '').replace('', '') + for each in preds[b] + ] + elif self.args.encoder == 'roberta': + pred_sents = self.vocab.decode([int(n) + for n in preds[b][0]]).replace( + '', + '').replace('', '') + elif self.args.encoder == 'bert': + pred_sents = self.vocab.convert_ids_to_tokens( + [int(n) for n in preds[b][0]]) + pred_sents = ' '.join(pred_sents).replace(' ##', '') + elif self.args.encoder == 'zh_bert' and self.args.dataset == 'paraphrase': + pred_sents = [ + self.vocab.convert_ids_to_tokens([int(n) for n in pred]) + for pred in preds[b] + ] + pred_sents = [ + ''.join(pred).replace(' ##', '') for pred in pred_sents + ] + elif self.args.encoder == 'zh_bert': + pred_sents = self.vocab.convert_ids_to_tokens( + [int(n) for n in preds[b][0]]) + pred_sents = ''.join(pred_sents).replace('##', '') + gold_sent = tgt_str[b] + + if self.args.encoder == 'roberta': + raw_src = self.vocab.decode([int(t) for t in src[b]]) + raw_src = ' '.join(src_str[b]) + else: + raw_src = [self.vocab.ids_to_tokens[int(t)] + for t in src[b]][:500] + raw_src = ' '.join(raw_src) + if self.args.dataset == 'faq': + translation = (pred_sents, gold_sent, src_str[b], query_id[b], + pred_score[b]) + else: + translation = (pred_sents, gold_sent, raw_src, query_id[b], + pred_score[b]) + # translation = (pred_sents[0], gold_sent) + translations.append(translation) + + return translations + + def translate(self, data_iter, step): + gold_path = self.args.result_path + '.%d.gold' % step + can_path = self.args.result_path + '.%d.candidate' % step + self.gold_out_file = codecs.open(gold_path, 'w', 'utf-8') + self.can_out_file = codecs.open(can_path, 'w', 'utf-8') + self.pred_json_score_out_file = codecs.open(can_path + '.sample', 'w', + 'utf-8') + 
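+        # The three files opened above hold, per evaluation step, the references
+        # ('.gold'), the predictions ('.candidate') and the predictions together
+        # with their beam scores ('.candidate.sample'); a raw-source dump is
+        # opened just below.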
if self.args.dataset == 'paraphrase' and self.args.encoder == 'roberta': + out = '\t'.join([ + 'query_id', 'source_query', 'target_query', 'predict_query' + ]) + '\n' + self.pred_json_score_out_file.write(out) + + raw_src_path = self.args.result_path + '.%d.raw_src' % step + self.src_out_file = codecs.open(raw_src_path, 'w', 'utf-8') + + pred_results, gold_results = [], [] + cnt = 0 + pred_dict, ref_dict = {}, {} + for i, batch in enumerate(data_iter): + self.logger.info(f'data: {i + 1} / {len(data_iter)}') + batch_data = self.translate_batch(batch) + translations = self.from_batch(batch_data) + + for trans in translations: + pred, gold, src, query_id, pred_score = trans + src = src.replace('', '').replace('##', '').strip() + if self.args.dataset == 'qg_ranking_test': + pred_str = '\t'.join([ + each.replace('[unused0]', '').replace( + '[PAD]', '').replace('[unused1]', '').replace( + r' +', ' ').replace('[SEP]', '').replace( + '[unused2]', + '').replace(r' +', ' ').replace( + '', + '').replace('', '').replace( + '', + '').replace('', '').replace( + '', ' ').strip() + for each in pred + ]) + else: + pred_str = pred.replace('[unused0]', '').replace( + '[PAD]', '').replace('[unused1]', '').replace( + r' +', ' ').replace('[SEP]', '').replace( + '[unused2]', '').replace('[CLS]', '').replace( + '[SEP]', '').replace('[UNK]', '').strip() + pred_str = pred_str.replace(r' +', ' ').replace( + '', + '').replace('', '').replace('', '').replace( + '', '').replace('', ' ').strip() + gold_str = gold.replace('', '').strip().replace( + '[UNK]', '').replace('[unused1]', '').replace( + '[unused2]', + '').replace('##', '').replace('[CLS]', '').replace( + '[SEP]', '').strip().replace('', '').replace( + '', '').replace('', ' ').strip() + if (self.args.recall_eval): + _pred_str = '' + # gap = 1e3 + for sent in pred_str.split(''): + can_pred_str = _pred_str + '' + sent.strip() + # can_gap = math.fabs(len(_pred_str.split()) - len(gold_str.split())) + # if(can_gap>=gap): + if len(can_pred_str.split()) >= len( + gold_str.split()) + 10: + pred_str = _pred_str + break + else: + # gap = can_gap + _pred_str = can_pred_str + + if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': + pred_str = pred_str.replace('', ' ') + if query_id is not None: + pred_json = { + 'query_id': query_id, + 'answers': [pred_str] + } + gold_json = { + 'query_id': query_id, + 'answers': [gold_str] + } + pred_json_score = { + 'query_id': query_id, + 'answers': [pred_str], + 'scores': pred_score[0].cpu().numpy().tolist() + } + else: + pred_json = {'query_id': cnt, 'answers': [pred_str]} + gold_json = {'query_id': cnt, 'answers': [gold_str]} + pred_json_score = { + 'query_id': cnt, + 'answers': [pred_str], + 'scores': pred_score[0].cpu().numpy().tolist() + } + json.dump(pred_json, self.can_out_file) + self.can_out_file.write('\n') + json.dump(gold_json, self.gold_out_file) + self.gold_out_file.write('\n') + json.dump(pred_json_score, self.pred_json_score_out_file) + self.pred_json_score_out_file.write('\n') + self.src_out_file.write(src.strip() + '\n') + elif self.args.dataset == 'cnn': + self.can_out_file.write(pred_str + '\n') + self.gold_out_file.write(gold_str + '\n') + self.src_out_file.write(src.strip() + '\n') + elif self.args.dataset == 'dureader': + if query_id is None: + query_id = str(cnt) + pred_results.extend(normalize([pred_str])) + gold_results.extend(normalize([gold_str])) + self.can_out_file.write(pred_str + '\n') + self.gold_out_file.write('\t'.join([src[0], gold_str]) + + '\n') + + 
elif self.args.dataset == 'paraphrase': + if query_id is None: + query_id = str(cnt) + if self.args.encoder == 'roberta': + pred_str = [pred_str] + pred_dict[query_id] = normalize([pred_str[0]]) + ref_dict[query_id] = normalize([gold_str]) + # pred_str_list = [src] + pred_str + # self.can_out_file.write("\t".join(pred_str_list)+"\n") + # self.can_out_file.write("\t".join(pred_str_list)+"\n") + # self.gold_out_file.write("\t".join([src, pred_str[0], gold_str])+"\n") + self.pred_json_score_out_file.write( + '\t'.join([str(query_id), src, gold_str, pred_str[0]]) + + '\n') + elif self.args.dataset == 'faq': + if pred_score[0].cpu().numpy().tolist() < -3.5: + continue + self.can_out_file.write( + '\t'.join([str(query_id), src, pred_str]) + '\n') + self.gold_out_file.write( + '\t'.join([str(query_id), src, gold_str]) + '\n') + # passage, answer, question, score + self.pred_json_score_out_file.write('\t'.join([ + str(query_id), gold_str, src, pred_str, + str(pred_score[0].cpu().numpy().tolist()) + ]) + '\n') + elif self.args.dataset == 'qg_ranking_test': + self.can_out_file.write( + str(query_id) + '\t' + pred_str + '\n') + + cnt += 1 + self.can_out_file.flush() + self.gold_out_file.flush() + self.src_out_file.flush() + self.logger.info('cnt: %s' % cnt) + self.can_out_file.close() + self.gold_out_file.close() + self.src_out_file.close() + + if (step != -1): + if self.args.dataset == 'marco' or self.args.dataset == 'squad' or self.args.dataset == 'qg_ranking': + cnn_results = subprocess.getoutput( + './run.sh %s %s' % (gold_path, can_path)) # run.sh ... + self.logger.info(cnn_results) + elif self.args.dataset == 'cnn': + self.logger.info('Calculating Rouge') + from rouge import Rouge + candidates = [ + line.strip() for line in open(can_path, encoding='utf-8') + ] + references = [ + line.strip() for line in open(gold_path, encoding='utf-8') + ] + rouge_score = Rouge().get_scores( + candidates, references, avg=True) + # self.logger.info('Rouges at step %d \n%s' % (step, rouge_results_to_str(rouges))) + print(rouge_score) + elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': + + def postprocess_text(preds, labels): + preds = [pred.strip().replace('.', '') for pred in preds] + labels = [label.strip() for label in labels] + while '' in preds: + idx = preds.index('') + preds[idx] = '。' + return preds, labels + + # bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) + # self.logger.info('Dev eval result: {}'.format(bleu_rouge)) + pred_results, gold_results = postprocess_text( + pred_results, gold_results) + pred_dict = {str(i): tmp for i, tmp in enumerate(pred_results)} + gold_dict = {str(i): tmp for i, tmp in enumerate(gold_results)} + bleu_rouge = compute_bleu_rouge(pred_dict, gold_dict) + print(bleu_rouge) + # unreachable + elif self.args.dataset == 'dureader' or self.args.dataset == 'paraphrase': + # bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) + # self.logger.info('Dev eval result: {}'.format(bleu_rouge)) + pred_results, gold_results = postprocess_text( + pred_results, gold_results) + bleu_score = cal_bleu(pred_results, gold_results) + from rouge import Rouge + rouge = Rouge() + rouge_score = rouge.get_scores( + pred_results, gold_results, avg=True) + print("'Dev eval result: Bleu-4={}, {}".format( + bleu_score, rouge_score)) + + def translate_batch(self, batch: 'Batch', fast: bool = False): + """ + Translate a batch of sentences. + + Mostly a wrapper around :obj:`Beam`. 
+ + Args: + batch (:obj:`Batch`): a batch from a dataset object + data (:obj:`Dataset`): the dataset object + fast (bool): enables fast beam search (may not support all features) + + Todo: + Shouldn't need the original dataset. + """ + self.model.eval() + with torch.no_grad(): + return self._fast_translate_batch( + batch, self.max_length, min_length=self.min_length) + + def _tile(self, x, count, dim=0): + perm = list(range(len(x.size()))) + if dim != 0: + perm[0], perm[dim] = perm[dim], perm[0] + x = x.permute(perm).contiguous() + out_size = list(x.size()) + out_size[0] *= count + batch = x.size(0) + x = x.view(batch, -1) \ + .transpose(0, 1) \ + .repeat(count, 1) \ + .transpose(0, 1) \ + .contiguous() \ + .view(*out_size) + if dim != 0: + x = x.permute(perm).contiguous() + return x + + def _top_k_top_p_filtering(self, + logits, + top_k=10, + top_p=1.0, + filter_value=-float('Inf'), + min_tokens_to_keep=1): + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), + logits.size(-1)) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, + None] + logits[indices_to_remove] = filter_value + + if top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum( + F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ + ..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove) + logits[indices_to_remove] = filter_value + return logits + + def _fast_translate_batch(self, + batch: 'Batch', + max_length: int, + min_length: int = 0): + # TODO: faster code path for beam_size == 1. + # TODO: support these blacklisted features. + + beam_size = self.beam_size + batch_size = batch.batch_size + src = batch.src + mask_src = batch.mask_src + + src_features, _ = self.model.bert(src, mask_src, return_dict=False) + self.model.decoder.init_state(src, with_cache=True) + device = src_features.device + + # Tile states and memory beam_size times. + self.model.decoder.state.map_batch_fn( + lambda state, dim: self._tile(state, beam_size, dim=dim)) + src_features = self._tile(src_features, beam_size, dim=0) + batch_offset = torch.arange( + batch_size, dtype=torch.long, device=device) + beam_offset = torch.arange( + 0, + batch_size * beam_size, + step=beam_size, + dtype=torch.long, + device=device) + alive_seq = torch.full([batch_size * beam_size, 1], + self.start_token, + dtype=torch.long, + device=device) + + # Give full probability to the first beam on the first step. + topk_log_probs = ( + torch.tensor( + [0.0] + [float('-inf')] * (beam_size - 1), + device=device).repeat(batch_size)) + + # Structure that holds finished hypotheses. 
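+        # (Bookkeeping sketch: `alive_seq` holds the growing token prefixes, one
+        # row per (source, beam) pair, i.e. shape [batch_size * beam_size, cur_len];
+        # `topk_log_probs` tracks their running log-probabilities; once a beam
+        # emits the end token it is moved into `hypotheses` and the best-scoring
+        # candidates are copied into `results`.)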
+ hypotheses = [[] for _ in range(batch_size)] # noqa: F812 + + results = {} + results['predictions'] = [[] for _ in range(batch_size)] # noqa: F812 + results['scores'] = [[] for _ in range(batch_size)] # noqa: F812 + results['gold_score'] = [0] * batch_size + results['batch'] = batch + + for step in range(max_length): + self.logger.info(f'step: {step + 1} / {max_length}') + decoder_input = alive_seq[:, -1].view(1, -1) + + # Decoder forward. + decoder_input = decoder_input.transpose(0, 1) + dec_out, attns = self.model.decoder( + decoder_input, src_features, step=step) + + # Generator forward. + log_probs = self.generator.forward( + dec_out.transpose(0, 1).squeeze(0)) + vocab_size = log_probs.size(-1) + + if step < min_length: + log_probs[:, self.end_token] = -1e20 + + # Multiply probs by the beam probability. + + length_penalty = ((5.0 + (step + 1)) / 6.0)**self.alpha + # ''' + if self.args.sample_topk: + temperature = self.args.temperature + _scores = log_probs / temperature + _scores = self._top_k_top_p_filtering( + _scores, + top_k=self.args.top_k, + top_p=self.args.top_p, + min_tokens_to_keep=1 + ) # (batch_size * num_beams, vocab_size) + # Sample 2 next words for each beam (so we have some spare tokens + # and match output of greedy beam search) + topk_ids = torch.multinomial( + F.softmax(_scores, dim=-1), + num_samples=1) # (batch_size * num_beams, 2) + # Compute next scores + _scores = F.log_softmax( + _scores, dim=1) # (batch_size * num_beams, vocab_size) + + _scores += topk_log_probs.view(-1).unsqueeze(1) + _scores = _scores / length_penalty + topk_scores = torch.gather( + _scores, -1, topk_ids) # (batch_size * num_beams, 2) + # log_probs += # (batch_size * num_beams, 2) + # Match shape of greedy beam search + topk_ids = topk_ids.view( + -1, beam_size) # (batch_size, 2 * num_beams) + topk_scores = topk_scores.view( + -1, beam_size) # (batch_size, 2 * num_beams) + # ''' + else: + log_probs += topk_log_probs.view(-1).unsqueeze(1) + curr_scores = log_probs / length_penalty + + curr_scores = curr_scores.reshape(-1, beam_size * vocab_size) + topk_scores, topk_ids = curr_scores.topk(beam_size, dim=-1) + if self.args.block_trigram: + cur_len = alive_seq.size(1) + if cur_len > 3: + for i in range(alive_seq.size(0)): + fail = False + words = [int(w) for w in alive_seq[i]] + if self.args.encoder == 'roberta': + # words = [self.vocab.convert_ids_to_tokens[w] for w in words] + words = self.vocab.decode(words).strip().split() + else: + words = [ + self.vocab.ids_to_tokens[w] for w in words + ] + words = ' '.join(words).replace(' ##', '').split() + if len(words) <= 3: + continue + trigrams = [(words[i - 1], words[i], words[i + 1]) + for i in range(1, + len(words) - 1)] + trigram = tuple(trigrams[-1]) + if trigram in trigrams[:-1]: + fail = True + if fail: + curr_scores[i] = -10e20 + # Recover log probs. + topk_log_probs = topk_scores * length_penalty + + # Resolve beam origin and true word ids. + # topk_beam_index = topk_ids.div(vocab_size) + topk_beam_index = topk_ids // vocab_size + topk_ids = topk_ids.fmod(vocab_size) + + # Map beam_index to batch_index in the flat representation. + batch_index = ( + topk_beam_index + + beam_offset[:topk_beam_index.size(0)].unsqueeze(1)) + select_indices = batch_index.view(-1) + + # Append last prediction. + alive_seq = torch.cat([ + alive_seq.index_select(0, select_indices), + topk_ids.view(-1, 1) + ], -1) + + is_finished = topk_ids.eq(self.end_token) + if step + 1 == max_length: + is_finished.fill_(self.end_token) + # End condition is top beam is finished. 
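+            # (topk() returns beams sorted by score, so column 0 is the current
+            # best hypothesis of each source sentence.)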
+ end_condition = is_finished[:, 0].eq(1) + # Save finished hypotheses. + if is_finished.any(): + predictions = alive_seq.view(-1, beam_size, alive_seq.size(-1)) + for i in range(is_finished.size(0)): + b = batch_offset[i] + if end_condition[i]: + is_finished[i].fill_(self.end_token) + finished_hyp = is_finished[i].nonzero().view(-1) + # Store finished hypotheses for this batch. + for j in finished_hyp: + hypotheses[b].append( + (topk_scores[i, j], predictions[i, j, 1:])) + # If the batch reached the end, save the n_best hypotheses. + if end_condition[i]: + best_hyp = sorted( + hypotheses[b], key=lambda x: x[0], reverse=True) + if self.args.dataset == 'qg_ranking_test' or ( + self.args.dataset == 'paraphrase' + and not self.args.sample_topk): + for each in best_hyp[:beam_size]: + score, pred = each + results['scores'][b].append(score) + results['predictions'][b].append(pred) + else: + score, pred = best_hyp[0] + results['scores'][b].append(score) + results['predictions'][b].append(pred) + non_finished = end_condition.eq(0).nonzero().view(-1) + # If all sentences are translated, no need to go further. + if len(non_finished) == 0: + break + # Remove finished batches for the next step. + topk_log_probs = topk_log_probs.index_select(0, non_finished) + batch_index = batch_index.index_select(0, non_finished) + batch_offset = batch_offset.index_select(0, non_finished) + alive_seq = predictions.index_select(0, non_finished) \ + .view(-1, alive_seq.size(-1)) + # Reorder states. + select_indices = batch_index.view(-1) + src_features = src_features.index_select(0, select_indices) + self.model.decoder.state.map_batch_fn( + lambda state, dim: state.index_select(dim, select_indices)) + + return results + + def forward(self, input_ids: torch.Tensor, + attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]: + batch = self.Batch( + batch_size=input_ids.size()[0], + src=input_ids, + tgt=None, + mask_src=attention_mask) + translation_batch = self.translate_batch(batch) + + preds = translation_batch['predictions'] + return {'predictions': preds} diff --git a/modelscope/models/nlp/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py similarity index 96% rename from modelscope/models/nlp/palm_for_text_generation.py rename to modelscope/models/nlp/palm_v2/palm_for_text_generation.py index 23d60663..7f8e918b 100644 --- a/modelscope/models/nlp/palm_for_text_generation.py +++ b/modelscope/models/nlp/palm_v2/palm_for_text_generation.py @@ -22,8 +22,8 @@ class PalmForTextGeneration(TorchModel): """ super().__init__(model_dir, *args, **kwargs) - from sofa.models.palm_v2 import (PalmForConditionalGeneration, - Translator) + from modelscope.models.nlp.palm_v2 import ( + PalmForConditionalGeneration, Translator) self.model = PalmForConditionalGeneration.from_pretrained(model_dir) self.tokenizer = self.model.tokenizer self.generator = Translator(self.model) diff --git a/modelscope/models/nlp/sbert_for_nli.py b/modelscope/models/nlp/sbert_for_nli.py deleted file mode 100644 index ea62a8bd..00000000 --- a/modelscope/models/nlp/sbert_for_nli.py +++ /dev/null @@ -1,23 +0,0 @@ -from modelscope.metainfo import Models -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks -from .sbert_for_sequence_classification import \ - SbertForSequenceClassificationBase - -__all__ = ['SbertForNLI'] - - -@MODELS.register_module(Tasks.nli, module_name=Models.structbert) -class SbertForNLI(SbertForSequenceClassificationBase): - - def __init__(self, model_dir: str, *args, 
**kwargs): - """initialize the text generation model from the `model_dir` path. - - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - super().__init__( - model_dir, *args, model_args={'num_labels': 3}, **kwargs) - assert self.model.config.num_labels == 3 diff --git a/modelscope/models/nlp/sbert_for_sentence_similarity.py b/modelscope/models/nlp/sbert_for_sentence_similarity.py deleted file mode 100644 index 00b612ea..00000000 --- a/modelscope/models/nlp/sbert_for_sentence_similarity.py +++ /dev/null @@ -1,25 +0,0 @@ -from modelscope.metainfo import Models -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks -from .sbert_for_sequence_classification import \ - SbertForSequenceClassificationBase - -__all__ = ['SbertForSentenceSimilarity'] - - -@MODELS.register_module( - Tasks.sentence_similarity, module_name=Models.structbert) -class SbertForSentenceSimilarity(SbertForSequenceClassificationBase): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the sentence similarity model from the `model_dir` path. - - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - super().__init__( - model_dir, *args, model_args={'num_labels': 2}, **kwargs) - self.model_dir = model_dir - assert self.model.config.num_labels == 2 diff --git a/modelscope/models/nlp/sbert_for_sentiment_classification.py b/modelscope/models/nlp/sbert_for_sentiment_classification.py deleted file mode 100644 index 83ac93c5..00000000 --- a/modelscope/models/nlp/sbert_for_sentiment_classification.py +++ /dev/null @@ -1,22 +0,0 @@ -from modelscope.metainfo import Models -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks -from .sbert_for_sequence_classification import \ - SbertForSequenceClassificationBase - -__all__ = ['SbertForSentimentClassification'] - - -@MODELS.register_module( - Tasks.sentiment_classification, module_name=Models.structbert) -class SbertForSentimentClassification(SbertForSequenceClassificationBase): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the text generation model from the `model_dir` path. - - Args: - model_dir (str): the model path. 
- """ - super().__init__( - model_dir, *args, model_args={'num_labels': 2}, **kwargs) - assert self.model.config.num_labels == 2 diff --git a/modelscope/models/nlp/sbert_for_sequence_classification.py b/modelscope/models/nlp/sbert_for_sequence_classification.py deleted file mode 100644 index 59fcf6fa..00000000 --- a/modelscope/models/nlp/sbert_for_sequence_classification.py +++ /dev/null @@ -1,82 +0,0 @@ -import os -from typing import Any, Dict - -import json -import numpy as np -import torch -from sofa.models.sbert.modeling_sbert import SbertModel, SbertPreTrainedModel -from torch import nn - -from modelscope.models import TorchModel - - -class SbertTextClassfier(SbertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - self.encoder = SbertModel(config, add_pooling_layer=True) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - def forward(self, - input_ids=None, - token_type_ids=None, - labels=None, - **kwargs): - outputs = self.encoder( - input_ids, - token_type_ids=token_type_ids, - return_dict=None, - ) - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - if labels is not None: - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return {'logits': logits, 'loss': loss} - return {'logits': logits} - - def build(**kwags): - return SbertTextClassfier.from_pretrained(model_dir, **model_args) - - -class SbertForSequenceClassificationBase(TorchModel): - - def __init__(self, model_dir: str, model_args=None, *args, **kwargs): - super().__init__(model_dir, *args, **kwargs) - if model_args is None: - model_args = {} - self.model = SbertTextClassfier.from_pretrained( - model_dir, **model_args) - self.id2label = {} - self.label_path = os.path.join(self.model_dir, 'label_mapping.json') - if os.path.exists(self.label_path): - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = { - idx: name - for name, idx in self.label_mapping.items() - } - - def train(self): - return self.model.train() - - def eval(self): - return self.model.eval() - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - input_ids = torch.tensor(input['input_ids'], dtype=torch.long) - token_type_ids = torch.tensor( - input['token_type_ids'], dtype=torch.long) - return self.model.forward(input_ids, token_type_ids) - - def postprocess(self, input, **kwargs): - logits = input['logits'] - probs = logits.softmax(-1).cpu().numpy() - pred = logits.argmax(-1).cpu().numpy() - logits = logits.cpu().numpy() - res = {'predictions': pred, 'probabilities': probs, 'logits': logits} - return res diff --git a/modelscope/models/nlp/sbert_for_token_classification.py b/modelscope/models/nlp/sbert_for_token_classification.py deleted file mode 100644 index 748c4107..00000000 --- a/modelscope/models/nlp/sbert_for_token_classification.py +++ /dev/null @@ -1,64 +0,0 @@ -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.base import Tensor -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks - -__all__ = ['SbertForTokenClassification'] - - -@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) -class 
SbertForTokenClassification(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the word segmentation model from the `model_dir` path. - - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - super().__init__(model_dir, *args, **kwargs) - self.model_dir = model_dir - import sofa - self.model = sofa.SbertForTokenClassification.from_pretrained( - self.model_dir) - self.config = sofa.SbertConfig.from_pretrained(self.model_dir) - - def train(self): - return self.model.train() - - def eval(self): - return self.model.eval() - - def forward(self, input: Dict[str, - Any]) -> Dict[str, Union[str, np.ndarray]]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, Union[str,np.ndarray]]: results - Example: - { - 'predictions': array([1,4]), # lable 0-negative 1-positive - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - 'text': str(今天), - } - """ - input_ids = torch.tensor(input['input_ids']).unsqueeze(0) - return {**self.model(input_ids), 'text': input['text']} - - def postprocess(self, input: Dict[str, Tensor], - **kwargs) -> Dict[str, Tensor]: - logits = input['logits'] - pred = torch.argmax(logits[0], dim=-1) - pred = pred.cpu().numpy() - rst = {'predictions': pred, 'logits': logits, 'text': input['text']} - return rst diff --git a/modelscope/models/nlp/sbert_for_zero_shot_classification.py b/modelscope/models/nlp/sbert_for_zero_shot_classification.py deleted file mode 100644 index b772cf45..00000000 --- a/modelscope/models/nlp/sbert_for_zero_shot_classification.py +++ /dev/null @@ -1,50 +0,0 @@ -from typing import Any, Dict - -import numpy as np - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks - -__all__ = ['SbertForZeroShotClassification'] - - -@MODELS.register_module( - Tasks.zero_shot_classification, module_name=Models.structbert) -class SbertForZeroShotClassification(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the zero shot classification model from the `model_dir` path. - - Args: - model_dir (str): the model path. 
- """ - - super().__init__(model_dir, *args, **kwargs) - from sofa import SbertForSequenceClassification - self.model = SbertForSequenceClassification.from_pretrained(model_dir) - - def train(self): - return self.model.train() - - def eval(self): - return self.model.eval() - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - Example: - { - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - } - """ - outputs = self.model(**input) - logits = outputs['logits'].cpu().numpy() - res = {'logits': logits} - return res diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py index 4920c6ff..5550d749 100644 --- a/modelscope/models/nlp/sequence_classification.py +++ b/modelscope/models/nlp/sequence_classification.py @@ -1,85 +1,174 @@ -import os -from typing import Any, Dict +from abc import abstractmethod -import json -import numpy as np +from torch import nn -from modelscope.metainfo import TaskModels +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.structbert import SbertPreTrainedModel +from modelscope.models.nlp.veco import \ + VecoForSequenceClassification as VecoForSequenceClassificationTransform from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks -from .task_model import SingleBackboneTaskModelBase +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) -__all__ = ['SequenceClassificationModel'] +__all__ = ['SbertForSequenceClassification', 'VecoForSequenceClassification'] -@MODELS.register_module( - Tasks.sentiment_classification, module_name=TaskModels.text_classification) -@MODELS.register_module( - Tasks.text_classification, module_name=TaskModels.text_classification) -class SequenceClassificationModel(SingleBackboneTaskModelBase): +class SequenceClassificationBase(TorchModel): + base_model_prefix: str = 'bert' + + def __init__(self, config, model_dir): + super().__init__(model_dir) + self.num_labels = config.num_labels + self.config = config + setattr(self, self.base_model_prefix, self.build_base_model()) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the sequence classification model from the `model_dir` path. + @abstractmethod + def build_base_model(self): + """Build the backbone model. - Args: - model_dir (str): the model path. + Returns: the backbone instance. 
""" - super().__init__(model_dir, *args, **kwargs) - if 'base_model_prefix' in kwargs: - self._base_model_prefix = kwargs['base_model_prefix'] - - backbone_cfg = self.cfg.backbone - head_cfg = self.cfg.head - - # get the num_labels from label_mapping.json - self.id2label = {} - self.label_path = os.path.join(model_dir, 'label_mapping.json') - if os.path.exists(self.label_path): - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = { - idx: name - for name, idx in self.label_mapping.items() - } - head_cfg['num_labels'] = len(self.label_mapping) - - self.build_backbone(backbone_cfg) - self.build_head(head_cfg) - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) - outputs = self.head.forward(pooled_output) - if 'labels' in input: - loss = self.compute_loss(outputs, input['labels']) - outputs.update(loss) - return outputs - - def extract_logits(self, outputs): - return outputs[OutputKeys.LOGITS].cpu().detach() - - def extract_backbone_outputs(self, outputs): - sequence_output = None - pooled_output = None - if hasattr(self.backbone, 'extract_sequence_outputs'): - sequence_output = self.backbone.extract_sequence_outputs(outputs) - if hasattr(self.backbone, 'extract_pooled_outputs'): - pooled_output = self.backbone.extract_pooled_outputs(outputs) - return sequence_output, pooled_output - - def compute_loss(self, outputs, labels): - loss = self.head.compute_loss(outputs, labels) - return loss + pass + + @property + def base_model(self): + return getattr(self, self.base_model_prefix) + + def forward(self, **kwargs): + labels = None + if OutputKeys.LABEL in kwargs: + labels = kwargs.pop(OutputKeys.LABEL) + elif OutputKeys.LABELS in kwargs: + labels = kwargs.pop(OutputKeys.LABELS) + + outputs = self.base_model.forward(**kwargs) + + # backbone model should return pooled_output as its second output + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} + return {OutputKeys.LOGITS: logits} def postprocess(self, input, **kwargs): - logits = self.extract_logits(input) - probs = logits.softmax(-1).numpy() - pred = logits.argmax(-1).numpy() - logits = logits.numpy() + logits = input[OutputKeys.LOGITS] + probs = torch_nested_numpify(torch_nested_detach(logits.softmax(-1))) + pred = torch_nested_numpify(torch_nested_detach(logits.argmax(-1))) + logits = torch_nested_numpify(torch_nested_detach(logits)) res = { OutputKeys.PREDICTIONS: pred, OutputKeys.PROBABILITIES: probs, OutputKeys.LOGITS: logits } return res + + +@MODELS.register_module( + Tasks.sentence_similarity, module_name=Models.structbert) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.structbert) +@MODELS.register_module(Tasks.nli, module_name=Models.structbert) +@MODELS.register_module( + Tasks.zero_shot_classification, module_name=Models.structbert) +class SbertForSequenceClassification(SequenceClassificationBase, + SbertPreTrainedModel): + base_model_prefix: str = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, model_dir): + if hasattr(config, 'base_model_prefix'): + SbertForSequenceClassification.base_model_prefix = 
config.base_model_prefix + super().__init__(config, model_dir) + + def build_base_model(self): + from .structbert import SbertModel + return SbertModel(self.config, add_pooling_layer=True) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + labels=None, + **kwargs): + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + labels=labels) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(SbertPreTrainedModel, + SbertForSequenceClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) + + +@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.veco) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.veco) +@MODELS.register_module(Tasks.nli, module_name=Models.veco) +class VecoForSequenceClassification(TorchModel, + VecoForSequenceClassificationTransform): + + def __init__(self, config, model_dir): + super().__init__(model_dir) + VecoForSequenceClassificationTransform.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + **kwargs): + return VecoForSequenceClassificationTransform.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + labels=labels) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(VecoForSequenceClassificationTransform, + VecoForSequenceClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/models/nlp/space/__init__.py b/modelscope/models/nlp/space/__init__.py new file mode 100644 index 00000000..45f856c1 --- /dev/null +++ b/modelscope/models/nlp/space/__init__.py @@ -0,0 +1,28 @@ +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .model import SpaceGenerator + from .model import SpaceModelBase, SpaceTokenizer, SpaceConfig + from .space_for_dialog_intent_prediction import SpaceForDialogIntent + from .space_for_dialog_modeling import SpaceForDialogModeling + from .space_for_dialog_state_tracking import SpaceForDialogStateTracking +else: + _import_structure = { + 'model': + ['SpaceGenerator', 'SpaceModelBase', 'SpaceTokenizer', 'SpaceConfig'], + 'space_for_dialog_intent_prediction': ['SpaceForDialogIntent'], + 'space_for_dialog_modeling': ['SpaceForDialogModeling'], + 'space_for_dialog_state_tracking': ['SpaceForDialogStateTracking'], + } + + import sys + + 
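+    # Replace this package's module object with a lazy wrapper so the
+    # submodules listed above are only imported on first attribute access.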
sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/space/model/__init__.py b/modelscope/models/nlp/space/model/__init__.py new file mode 100644 index 00000000..24641f06 --- /dev/null +++ b/modelscope/models/nlp/space/model/__init__.py @@ -0,0 +1,10 @@ +from .configuration_space import SpaceConfig +from .gen_unified_transformer import GenUnifiedTransformer +from .generator import Generator as SpaceGenerator +from .intent_unified_transformer import IntentUnifiedTransformer +from .model_base import SpaceModelBase +from .modeling_space import (SpaceForDST, SpaceForMaskedLM, + SpaceForPreTraining, SpaceModel) +from .tokenization_space import (BasicTokenizer, SpaceTokenizer, + WordpieceTokenizer) +from .unified_transformer import UnifiedTransformer diff --git a/modelscope/models/nlp/space/model/configuration_space.py b/modelscope/models/nlp/space/model/configuration_space.py new file mode 100644 index 00000000..0da2d629 --- /dev/null +++ b/modelscope/models/nlp/space/model/configuration_space.py @@ -0,0 +1,32 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors. +# Copyright 2020 The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Space configuration, mainly copied from :class:`~transformers.configuration_xlm_roberta` """ + +from modelscope.models.nlp.structbert import SbertConfig +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class SpaceConfig(SbertConfig): + """ + This class overrides [`SbertConfig`]. Please check the superclass for the appropriate + documentation alongside usage examples. 
+ """ + + model_type = 'space' diff --git a/modelscope/models/nlp/backbones/space/model/gen_unified_transformer.py b/modelscope/models/nlp/space/model/gen_unified_transformer.py similarity index 100% rename from modelscope/models/nlp/backbones/space/model/gen_unified_transformer.py rename to modelscope/models/nlp/space/model/gen_unified_transformer.py diff --git a/modelscope/models/nlp/backbones/space/model/generator.py b/modelscope/models/nlp/space/model/generator.py similarity index 100% rename from modelscope/models/nlp/backbones/space/model/generator.py rename to modelscope/models/nlp/space/model/generator.py diff --git a/modelscope/models/nlp/backbones/space/model/intent_unified_transformer.py b/modelscope/models/nlp/space/model/intent_unified_transformer.py similarity index 100% rename from modelscope/models/nlp/backbones/space/model/intent_unified_transformer.py rename to modelscope/models/nlp/space/model/intent_unified_transformer.py diff --git a/modelscope/models/nlp/backbones/space/model/model_base.py b/modelscope/models/nlp/space/model/model_base.py similarity index 100% rename from modelscope/models/nlp/backbones/space/model/model_base.py rename to modelscope/models/nlp/space/model/model_base.py diff --git a/modelscope/models/nlp/space/model/modeling_space.py b/modelscope/models/nlp/space/model/modeling_space.py new file mode 100644 index 00000000..f093cbc5 --- /dev/null +++ b/modelscope/models/nlp/space/model/modeling_space.py @@ -0,0 +1,268 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Space model. mainly copied from :module:`~transformers.modeling_xlm_roberta`""" + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.file_utils import add_start_docstrings + +from modelscope.models.nlp.structbert.modeling_sbert import ( + SbertForMaskedLM, SbertModel, SbertPreTrainedModel) +from .configuration_space import SpaceConfig + +SPACE_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`SpaceConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. 
+""" + + +@add_start_docstrings( + 'The bare Space Model transformer outputting raw hidden-states without any specific head on top. ' + 'It is identical with the Bert Model from Transformers', + SPACE_START_DOCSTRING, +) +class SpaceModel(SbertModel): + """ + This class overrides [`SbertModel`]. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = SpaceConfig + + +@add_start_docstrings( + """ + Space Model transformer with Dialog state tracking heads on top (a inform projection + layer with a dialog state layer and a set of slots including history infromation from + previous dialog) e.g. for multiwoz2.2 tasks. + """, + SPACE_START_DOCSTRING, +) +class SpaceForDST(SbertPreTrainedModel): + + def __init__(self, config): + super(SpaceForDST, self).__init__(config) + self.slot_list = config.dst_slot_list + self.class_types = config.dst_class_types + self.class_labels = config.dst_class_labels + self.token_loss_for_nonpointable = config.dst_token_loss_for_nonpointable + self.refer_loss_for_nonpointable = config.dst_refer_loss_for_nonpointable + self.class_aux_feats_inform = config.dst_class_aux_feats_inform + self.class_aux_feats_ds = config.dst_class_aux_feats_ds + self.class_loss_ratio = config.dst_class_loss_ratio + + # Only use refer loss if refer class is present in dataset. + if 'refer' in self.class_types: + self.refer_index = self.class_types.index('refer') + else: + self.refer_index = -1 + + self.bert = SpaceModel(config) + self.dropout = nn.Dropout(config.dst_dropout_rate) + self.dropout_heads = nn.Dropout(config.dst_heads_dropout_rate) + + if self.class_aux_feats_inform: + self.add_module( + 'inform_projection', + nn.Linear(len(self.slot_list), len(self.slot_list))) + if self.class_aux_feats_ds: + self.add_module( + 'ds_projection', + nn.Linear(len(self.slot_list), len(self.slot_list))) + + aux_dims = len(self.slot_list) * ( + self.class_aux_feats_inform + self.class_aux_feats_ds + ) # second term is 0, 1 or 2 + + for slot in self.slot_list: + self.add_module( + 'class_' + slot, + nn.Linear(config.hidden_size + aux_dims, self.class_labels)) + self.add_module('token_' + slot, nn.Linear(config.hidden_size, 2)) + self.add_module( + 'refer_' + slot, + nn.Linear(config.hidden_size + aux_dims, + len(self.slot_list) + 1)) + + self.init_weights() + + def forward(self, + input_ids, + input_mask=None, + segment_ids=None, + position_ids=None, + head_mask=None, + start_pos=None, + end_pos=None, + inform_slot_id=None, + refer_id=None, + class_label_id=None, + diag_state=None): + outputs = self.bert( + input_ids, + attention_mask=input_mask, + token_type_ids=segment_ids, + position_ids=position_ids, + head_mask=head_mask) + + sequence_output = outputs[0] + pooled_output = outputs[1] + + sequence_output = self.dropout(sequence_output) + pooled_output = self.dropout(pooled_output) + + # TODO: establish proper format in labels already? 
+ if inform_slot_id is not None: + inform_labels = torch.stack(list(inform_slot_id.values()), + 1).float() + if diag_state is not None: + diag_state_labels = torch.clamp( + torch.stack(list(diag_state.values()), 1).float(), 0.0, 1.0) + + total_loss = 0 + per_slot_per_example_loss = {} + per_slot_class_logits = {} + per_slot_start_logits = {} + per_slot_end_logits = {} + per_slot_refer_logits = {} + for slot in self.slot_list: + if self.class_aux_feats_inform and self.class_aux_feats_ds: + pooled_output_aux = torch.cat( + (pooled_output, self.inform_projection(inform_labels), + self.ds_projection(diag_state_labels)), 1) + elif self.class_aux_feats_inform: + pooled_output_aux = torch.cat( + (pooled_output, self.inform_projection(inform_labels)), 1) + elif self.class_aux_feats_ds: + pooled_output_aux = torch.cat( + (pooled_output, self.ds_projection(diag_state_labels)), 1) + else: + pooled_output_aux = pooled_output + class_logits = self.dropout_heads( + getattr(self, 'class_' + slot)(pooled_output_aux)) + + token_logits = self.dropout_heads( + getattr(self, 'token_' + slot)(sequence_output)) + start_logits, end_logits = token_logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + refer_logits = self.dropout_heads( + getattr(self, 'refer_' + slot)(pooled_output_aux)) + + per_slot_class_logits[slot] = class_logits + per_slot_start_logits[slot] = start_logits + per_slot_end_logits[slot] = end_logits + per_slot_refer_logits[slot] = refer_logits + + # If there are no labels, don't compute loss + if class_label_id is not None and start_pos is not None and end_pos is not None and refer_id is not None: + # If we are on multi-GPU, split add a dimension + if len(start_pos[slot].size()) > 1: + start_pos[slot] = start_pos[slot].squeeze(-1) + if len(end_pos[slot].size()) > 1: + end_pos[slot] = end_pos[slot].squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) # This is a single index + start_pos[slot].clamp_(0, ignored_index) + end_pos[slot].clamp_(0, ignored_index) + + class_loss_fct = CrossEntropyLoss(reduction='none') + token_loss_fct = CrossEntropyLoss( + reduction='none', ignore_index=ignored_index) + refer_loss_fct = CrossEntropyLoss(reduction='none') + + start_loss = token_loss_fct(start_logits, start_pos[slot]) + end_loss = token_loss_fct(end_logits, end_pos[slot]) + token_loss = (start_loss + end_loss) / 2.0 + + token_is_pointable = (start_pos[slot] > 0).float() + if not self.token_loss_for_nonpointable: + token_loss *= token_is_pointable + + refer_loss = refer_loss_fct(refer_logits, refer_id[slot]) + token_is_referrable = torch.eq(class_label_id[slot], + self.refer_index).float() + if not self.refer_loss_for_nonpointable: + refer_loss *= token_is_referrable + + class_loss = class_loss_fct(class_logits, class_label_id[slot]) + + if self.refer_index > -1: + per_example_loss = (self.class_loss_ratio) * class_loss + ( + (1 - self.class_loss_ratio) / 2) * token_loss + ( + (1 - self.class_loss_ratio) / 2) * refer_loss + else: + per_example_loss = self.class_loss_ratio * class_loss + ( + 1 - self.class_loss_ratio) * token_loss + + total_loss += per_example_loss.sum() + per_slot_per_example_loss[slot] = per_example_loss + + # add hidden states and attention if they are here + outputs = (total_loss, ) + ( + per_slot_per_example_loss, + per_slot_class_logits, + per_slot_start_logits, + per_slot_end_logits, + per_slot_refer_logits, + ) + outputs[2:] + + return 
outputs
+
+
+@add_start_docstrings(
+    'The Space Model with a `language modeling` head on top',
+    SPACE_START_DOCSTRING,
+)
+class SpaceForMaskedLM(SbertForMaskedLM):
+    """
+    This class overrides [`SbertForMaskedLM`]. Please check the superclass for the
+    appropriate documentation alongside usage examples.
+    """
+
+    config_class = SpaceConfig
+
+
+@add_start_docstrings(
+    """
+    Space Model with only one head on top as done during the pretraining: a `masked language modeling` head.
+    """,
+    SPACE_START_DOCSTRING,
+)
+class SpaceForPreTraining(SbertPreTrainedModel):
+
+    def __init__(self, model_name_or_path: str):
+        super(SpaceForPreTraining, self).__init__()
+        self.bert_model = SpaceForMaskedLM.from_pretrained(model_name_or_path)
+
+    def forward(self, input_ids: torch.Tensor, mlm_labels: torch.Tensor):
+        outputs = self.bert_model(input_ids, masked_lm_labels=mlm_labels)
+        return outputs[0]
diff --git a/modelscope/models/nlp/space/model/tokenization_space.py b/modelscope/models/nlp/space/model/tokenization_space.py
new file mode 100644
index 00000000..84712b7b
--- /dev/null
+++ b/modelscope/models/nlp/space/model/tokenization_space.py
@@ -0,0 +1,29 @@
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+"""Tokenization classes for Space, mainly copied from :module:`~transformers.tokenization_xlm_roberta`"""
+
+from modelscope.models.nlp.structbert import (BasicTokenizer, SbertTokenizer,
+                                              WordpieceTokenizer)
+from modelscope.utils import logger as logging
+
+logger = logging.get_logger(__name__)
+
+
+class SpaceTokenizer(SbertTokenizer):
+    """
+    This class overrides [`SbertTokenizer`]. Please check the superclass for the appropriate
+    documentation alongside usage examples.
+ """ diff --git a/modelscope/models/nlp/backbones/space/model/unified_transformer.py b/modelscope/models/nlp/space/model/unified_transformer.py similarity index 97% rename from modelscope/models/nlp/backbones/space/model/unified_transformer.py rename to modelscope/models/nlp/space/model/unified_transformer.py index f5df954d..b0775541 100644 --- a/modelscope/models/nlp/backbones/space/model/unified_transformer.py +++ b/modelscope/models/nlp/space/model/unified_transformer.py @@ -5,10 +5,9 @@ import torch import torch.nn as nn import torch.nn.functional as F -from modelscope.models.nlp.backbones.space.model.model_base import \ - SpaceModelBase -from modelscope.models.nlp.backbones.space.modules.embedder import Embedder -from modelscope.models.nlp.backbones.space.modules.transformer_block import \ +from modelscope.models.nlp.space.model.model_base import SpaceModelBase +from modelscope.models.nlp.space.modules.embedder import Embedder +from modelscope.models.nlp.space.modules.transformer_block import \ TransformerBlock diff --git a/modelscope/models/nlp/backbones/space/modules/__init__.py b/modelscope/models/nlp/space/modules/__init__.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/__init__.py rename to modelscope/models/nlp/space/modules/__init__.py diff --git a/modelscope/models/nlp/backbones/space/modules/embedder.py b/modelscope/models/nlp/space/modules/embedder.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/embedder.py rename to modelscope/models/nlp/space/modules/embedder.py diff --git a/modelscope/models/nlp/backbones/space/modules/feedforward.py b/modelscope/models/nlp/space/modules/feedforward.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/feedforward.py rename to modelscope/models/nlp/space/modules/feedforward.py diff --git a/modelscope/models/nlp/backbones/space/modules/functions.py b/modelscope/models/nlp/space/modules/functions.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/functions.py rename to modelscope/models/nlp/space/modules/functions.py diff --git a/modelscope/models/nlp/backbones/space/modules/multihead_attention.py b/modelscope/models/nlp/space/modules/multihead_attention.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/multihead_attention.py rename to modelscope/models/nlp/space/modules/multihead_attention.py diff --git a/modelscope/models/nlp/backbones/space/modules/transformer_block.py b/modelscope/models/nlp/space/modules/transformer_block.py similarity index 100% rename from modelscope/models/nlp/backbones/space/modules/transformer_block.py rename to modelscope/models/nlp/space/modules/transformer_block.py diff --git a/modelscope/models/nlp/space_for_dialog_intent_prediction.py b/modelscope/models/nlp/space/space_for_dialog_intent_prediction.py similarity index 97% rename from modelscope/models/nlp/space_for_dialog_intent_prediction.py rename to modelscope/models/nlp/space/space_for_dialog_intent_prediction.py index bd0eb63b..c862fbef 100644 --- a/modelscope/models/nlp/space_for_dialog_intent_prediction.py +++ b/modelscope/models/nlp/space/space_for_dialog_intent_prediction.py @@ -7,7 +7,7 @@ from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS -from modelscope.models.nlp.backbones import SpaceGenerator, SpaceModelBase +from modelscope.models.nlp.space import SpaceGenerator, 
SpaceModelBase from modelscope.preprocessors.space import IntentBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks diff --git a/modelscope/models/nlp/space_for_dialog_modeling.py b/modelscope/models/nlp/space/space_for_dialog_modeling.py similarity index 97% rename from modelscope/models/nlp/space_for_dialog_modeling.py rename to modelscope/models/nlp/space/space_for_dialog_modeling.py index 60713c3d..8b9ed8b3 100644 --- a/modelscope/models/nlp/space_for_dialog_modeling.py +++ b/modelscope/models/nlp/space/space_for_dialog_modeling.py @@ -7,7 +7,7 @@ from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS -from modelscope.models.nlp.backbones import SpaceGenerator, SpaceModelBase +from modelscope.models.nlp.space import SpaceGenerator, SpaceModelBase from modelscope.preprocessors.space import MultiWOZBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks diff --git a/modelscope/models/nlp/space_for_dialog_state_tracking.py b/modelscope/models/nlp/space/space_for_dialog_state_tracking.py similarity index 97% rename from modelscope/models/nlp/space_for_dialog_state_tracking.py rename to modelscope/models/nlp/space/space_for_dialog_state_tracking.py index de5f95ce..ee7356b1 100644 --- a/modelscope/models/nlp/space_for_dialog_state_tracking.py +++ b/modelscope/models/nlp/space/space_for_dialog_state_tracking.py @@ -21,7 +21,7 @@ class SpaceForDialogStateTracking(TorchModel): super().__init__(model_dir, *args, **kwargs) - from sofa.models.space import SpaceConfig, SpaceForDST + from modelscope.models.nlp.space.model import SpaceForDST, SpaceConfig self.model_dir = model_dir self.config = SpaceConfig.from_pretrained(self.model_dir) diff --git a/modelscope/models/nlp/structbert/__init__.py b/modelscope/models/nlp/structbert/__init__.py new file mode 100644 index 00000000..d42db83c --- /dev/null +++ b/modelscope/models/nlp/structbert/__init__.py @@ -0,0 +1,45 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
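+
+# Note (assumed behaviour, added for clarity): as in
+# modelscope/models/nlp/space/__init__.py, this package is exposed through
+# LazyImportModule, so submodules such as configuration_sbert are only imported
+# on first attribute access; the TYPE_CHECKING imports below are for static
+# type checkers only.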
+ +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .configuration_sbert import SbertConfig + from .modeling_sbert import (SbertForMaskedLM, SbertModel, + SbertPreTrainedModel) + from .tokenization_sbert import (BasicTokenizer, SbertTokenizer, + WordpieceTokenizer) + from .tokenization_sbert_fast import SbertTokenizerFast +else: + _import_structure = { + 'configuration_sbert': ['SbertConfig'], + 'modeling_sbert': + ['SbertForMaskedLM', 'SbertModel', 'SbertPreTrainedModel'], + 'tokenization_sbert': + ['BasicTokenizer', 'SbertTokenizer', 'WordpieceTokenizer'], + 'tokenization_sbert_fast': ['SbertTokenizerFast'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/backbones/structbert/adv_utils.py b/modelscope/models/nlp/structbert/adv_utils.py similarity index 96% rename from modelscope/models/nlp/backbones/structbert/adv_utils.py rename to modelscope/models/nlp/structbert/adv_utils.py index 9864148f..44aae85c 100644 --- a/modelscope/models/nlp/backbones/structbert/adv_utils.py +++ b/modelscope/models/nlp/structbert/adv_utils.py @@ -59,7 +59,8 @@ def compute_adv_loss(embedding, """ Calculate the adv loss of the model. :param embedding: Original sentense embedding - :param model: The model or the forward function(including decoder/classifier), accept kwargs as input, output logits + :param model: The model, or the forward function(including decoder/classifier), + accept kwargs as input, output logits :param ori_logits: The original logits outputed from the model function :param ori_loss: The original loss :param adv_grad_factor: This factor will be multipled by the KL loss grad and then the result will be added to @@ -119,7 +120,8 @@ def compute_adv_loss_pair(embedding, """ Calculate the adv loss of the model. This function is used in the pair logits scenerio. :param embedding: Original sentense embedding - :param model: The model or the forward function(including decoder/classifier), accept kwargs as input, output logits + :param model: The model, or the forward function(including decoder/classifier), + accept kwargs as input, output logits :param start_logits: The original start logits outputed from the model function :param end_logits: The original end logits outputed from the model function :param ori_loss: The original loss diff --git a/modelscope/models/nlp/backbones/structbert/configuration_sbert.py b/modelscope/models/nlp/structbert/configuration_sbert.py similarity index 94% rename from modelscope/models/nlp/backbones/structbert/configuration_sbert.py rename to modelscope/models/nlp/structbert/configuration_sbert.py index 878b2216..374d4b62 100644 --- a/modelscope/models/nlp/backbones/structbert/configuration_sbert.py +++ b/modelscope/models/nlp/structbert/configuration_sbert.py @@ -24,11 +24,12 @@ logger = logging.get_logger(__name__) class SbertConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~sofa.models.SbertModel`. + This is the configuration class to store the configuration + of a :class:`~modelscope.models.nlp.structbert.SbertModel`. It is used to instantiate a SBERT model according to the specified arguments. - Configuration objects inherit from :class:`~sofa.utils.PretrainedConfig` and can be used to control the model - outputs. 
Read the documentation from :class:`~sofa.utils.PretrainedConfig` for more information. + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: @@ -99,11 +100,13 @@ class SbertConfig(PretrainedConfig): type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, + pad_token_id=0, position_embedding_type='absolute', use_cache=True, classifier_dropout=None, **kwargs): - super().__init__(**kwargs) + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers diff --git a/modelscope/models/nlp/structbert/modeling_sbert.py b/modelscope/models/nlp/structbert/modeling_sbert.py new file mode 100755 index 00000000..bbac3c95 --- /dev/null +++ b/modelscope/models/nlp/structbert/modeling_sbert.py @@ -0,0 +1,1964 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch SBERT model. 
mainly copied from :module:`~transformers.modeling_bert`""" + +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.file_utils import (ModelOutput, add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, MaskedLMOutput, + MultipleChoiceModelOutput, NextSentencePredictorOutput, + QuestionAnsweringModelOutput, SequenceClassifierOutput, + TokenClassifierOutput) +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.utils.constant import Fields +from modelscope.utils.logger import get_logger +from .adv_utils import compute_adv_loss, compute_adv_loss_pair +from .configuration_sbert import SbertConfig + +logger = get_logger(__name__) + +_CHECKPOINT_FOR_DOC = 'chinese_sbert-large-std-512' +_CONFIG_FOR_DOC = 'SbertConfig' +_TOKENIZER_FOR_DOC = 'SbertTokenizer' + + +class SbertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros( + self.position_ids.size(), + dtype=torch.long, + device=self.position_ids.device), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + return_inputs_embeds=False): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users + # when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + 
buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + if not return_inputs_embeds: + return embeddings + else: + return embeddings, inputs_embeds + + +class SbertSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
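+        # Summary of the four cases handled below (descriptive comment, added
+        # for clarity):
+        #   1. cross-attention with a cache: reuse the cached key/value states
+        #   2. cross-attention without a cache: project encoder_hidden_states
+        #   3. self-attention with a cache: project the current states and
+        #      concatenate them with the cache along the sequence dimension
+        #   4. plain self-attention: project the current hidden states only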
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in SbertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class SbertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class SbertAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.self = SbertSelfAttention(config) + self.output = SbertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class SbertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class SbertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + 
hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class SbertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = SbertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = SbertAttention(config) + self.intermediate = SbertIntermediate(config) + self.output = SbertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention ' + f'layers by setting `config.add_cross_attention=True`') + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class SbertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [SbertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + 
past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class SbertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
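+        # Assumed shapes (descriptive comment): hidden_states is
+        # [batch_size, seq_len, hidden_size], so hidden_states[:, 0] selects
+        # the [CLS] position and yields [batch_size, hidden_size] before the
+        # Linear + Tanh projection.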
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class SbertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class SbertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = SbertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class SbertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = SbertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class SbertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class SbertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = SbertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class SbertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = SbertConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, SbertEncoder): + module.gradient_checkpointing = value + + +@dataclass +class SbertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.BertForPreTraining`. 
+
+    Args:
+        loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
+            Total loss as the sum of the masked language modeling loss and the next sequence prediction
+            (classification) loss.
+        prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
+            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
+            before SoftMax).
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True``
+            is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True``
+            is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    prediction_logits: torch.FloatTensor = None
+    seq_relationship_logits: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+SBERT_START_DOCSTRING = r"""
+
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
+
+    Parameters:
+        config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with
+            all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
+"""
+
+SBERT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
+            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            `What are attention masks? <../glossary.html#attention-mask>`__
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in
+            ``[0, 1]``:
+
+            - 0 corresponds to a `sentence A` token,
+            - 1 corresponds to a `sentence B` token.
+
+            `What are token type IDs? <../glossary.html#token-type-ids>`_
+        position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
+            config.max_position_embeddings - 1]``.
+
+            `What are position IDs? <../glossary.html#position-ids>`_
+        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
+            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+            vectors than the model's internal embedding lookup matrix.
+        output_attentions (:obj:`bool`, `optional`):
+            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+            tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`):
+            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+            more detail.
+        return_dict (:obj:`bool`, `optional`):
+            Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
+"""
+
+
+@dataclass
+class BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding(
+        BaseModelOutputWithPoolingAndCrossAttentions):
+    embedding_output: torch.FloatTensor = None
+    logits: Optional[Union[tuple, torch.FloatTensor]] = None
+    kwargs: dict = None
+
+
+@add_start_docstrings(
+    'The Sbert Model transformer outputting raw hidden-states without any specific head on top.',
+    SBERT_START_DOCSTRING,
+)
+class SbertModel(SbertPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
+    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as a decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
+    set to :obj:`True`. To be used in a Seq2Seq model, the model needs to be initialized with both :obj:`is_decoder`
+    argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
+    input to the forward pass.
+    """
+
+    def __init__(self, config: SbertConfig, add_pooling_layer=True):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = SbertEmbeddings(config)
+        self.encoder = SbertEncoder(config)
+
+        self.pooler = SbertPooler(config) if add_pooling_layer else None
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple + having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
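+
+        Example (a minimal sketch for illustration; the randomly initialized
+        config and the shapes are assumptions, not part of the original docs)::
+
+            >>> import torch
+            >>> from modelscope.models.nlp.structbert import SbertConfig, SbertModel
+            >>> model = SbertModel(SbertConfig(), add_pooling_layer=True)
+            >>> out = model(input_ids=torch.ones(1, 8, dtype=torch.long))
+            >>> shape = out.last_hidden_state.shape  # (1, 8, config.hidden_size)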
+ """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output, orignal_embeds = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + return_inputs_embeds=True, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, + pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) + + return BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + embedding_output=orignal_embeds) + + +@add_start_docstrings( + """ + Sbert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. 
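A minimal usage sketch for the bare encoder above. The import path and config values are illustrative; the point of interest is that, besides the standard output fields, the returned object also carries `embedding_output`, which the task heads below feed into the adversarial loss:

import torch
from modelscope.models.nlp.structbert import SbertConfig, SbertModel  # import path assumed

config = SbertConfig(vocab_size=1000, hidden_size=64, num_hidden_layers=2,
                     num_attention_heads=4, intermediate_size=128)
model = SbertModel(config).eval()

input_ids = torch.randint(0, 1000, (2, 12))
with torch.no_grad():
    out = model(input_ids=input_ids, attention_mask=torch.ones_like(input_ids))

print(out.last_hidden_state.shape)  # (2, 12, 64)
print(out.pooler_output.shape)      # (2, 64)
print(out.embedding_output.shape)   # (2, 12, 64): word embeddings kept for adversarial training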
+ """, + SBERT_START_DOCSTRING, +) +class SbertForPreTraining(SbertPreTrainedModel): + + def __init__(self, config: SbertConfig): + super().__init__(config) + + self.bert = SbertModel(config) + self.cls = SbertPreTrainingHeads(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=SbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, + seq_relationship_score) + outputs[2:-1] + return ((total_loss, ) + + output) if total_loss is not None else output + + return SbertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Sbert Model with a `language modeling` head on top for CLM fine-tuning. 
""", + SBERT_START_DOCSTRING) +class SbertLMHeadModel(SbertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config: SbertConfig): + super().__init__(config) + + if not config.is_decoder: + logger.warning( + 'If you want to use `SbertLMHeadModel` as a standalone, add `is_decoder=True.`' + ) + + self.bert = SbertModel(config, add_pooling_layer=False) + self.cls = SbertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` + with each tuple having 4 tensors of + shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ + Returns: + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, : + -1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + if not return_dict: + output = (prediction_scores, ) + outputs[2:-1] + return ((lm_loss, ) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'past_key_values': past + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple( + past_state.index_select(0, beam_idx) + for past_state in layer_past), ) + return reordered_past + + +@add_start_docstrings( + """Sbert Model with a `language modeling` head on top. 
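In the causal-LM head above, the logits at position t are trained against the token at position t + 1, which is what the shift of both tensors implements. A toy sketch of that shift:

import torch
from torch.nn import CrossEntropyLoss

vocab_size, batch, seq = 50, 2, 5
prediction_scores = torch.randn(batch, seq, vocab_size)
labels = torch.randint(0, vocab_size, (batch, seq))

# next-token prediction: score at position t is matched with the label at position t + 1
shifted_scores = prediction_scores[:, :-1, :].contiguous()
shifted_labels = labels[:, 1:].contiguous()
lm_loss = CrossEntropyLoss()(shifted_scores.view(-1, vocab_size), shifted_labels.view(-1))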
""", + SBERT_START_DOCSTRING) +class SbertForMaskedLM(SbertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config: SbertConfig): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `SbertForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.bert = SbertModel(config) + self.cls = SbertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:-1] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, 'The PAD token should be defined for generation' + attention_mask_zero = attention_mask.new_zeros( + (attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, attention_mask_zero], + dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} + + +@add_start_docstrings( + """Sbert Model 
with a `next sentence prediction (classification)` head on top. """, + SBERT_START_DOCSTRING, +) +class SbertForNextSentencePrediction(SbertPreTrainedModel): + + def __init__(self, config: SbertConfig): + super().__init__(config) + + self.bert = SbertModel(config) + self.cls = SbertOnlyNSPHead(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @replace_return_docstrings( + output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see ``input_ids`` docstring). Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Returns: + + """ + + if 'next_sentence_label' in kwargs: + warnings.warn( + 'The `next_sentence_label` argument is deprecated and will be removed ' + 'in a future version, use `labels` instead.', + FutureWarning, + ) + labels = kwargs.pop('next_sentence_label') + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct( + seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores, ) + outputs[2:-1] + return ((next_sentence_loss, ) + + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Sbert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. 
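A usage sketch for the next-sentence-prediction head above, built from a freshly initialized config rather than a real checkpoint; the import path and config values are illustrative:

import torch
from modelscope.models.nlp.structbert import SbertConfig, SbertForNextSentencePrediction  # path assumed

config = SbertConfig(vocab_size=1000, hidden_size=64, num_hidden_layers=2,
                     num_attention_heads=4, intermediate_size=128)
model = SbertForNextSentencePrediction(config)

input_ids = torch.randint(0, 1000, (2, 10))
token_type_ids = torch.cat([torch.zeros(2, 5, dtype=torch.long),
                            torch.ones(2, 5, dtype=torch.long)], dim=1)
labels = torch.tensor([0, 1])  # 0: sentence B follows A, 1: B is a random sentence

out = model(input_ids=input_ids, token_type_ids=token_type_ids, labels=labels)
print(out.loss.item(), out.logits.shape)  # logits: (2, 2)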
+ """, + SBERT_START_DOCSTRING, +) +class SbertForSequenceClassification(SbertPreTrainedModel): + + def __init__(self, config: SbertConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + self.bert = SbertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.bert(**kwargs) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + outputs['logits'] = logits + outputs.kwargs = kwargs + return outputs + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + return self.compute_loss(outputs, labels, **outputs.kwargs) + + def compute_loss(self, outputs, labels, **kwargs): + logits = outputs.logits + embedding_output = outputs.embedding_output + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + **kwargs) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Sbert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
+ """, + SBERT_START_DOCSTRING, +) +class SbertForMultipleChoice(SbertPreTrainedModel): + + def __init__(self, config: SbertConfig): + super().__init__(config) + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + self.bert = SbertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + def _forward_call(self, num_choices, **kwargs): + outputs = self.bert(**kwargs) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + outputs['logits'] = logits.view(-1, num_choices) + kwargs['num_choices'] = num_choices + outputs.kwargs = kwargs + return outputs + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format( + 'batch_size, num_choices, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See + :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + + num_choices = input_ids.shape[ + 1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view( + -1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view( + -1, + attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view( + -1, + token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view( + -1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), + inputs_embeds.size(-1)) + if inputs_embeds is not None else None) + + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + num_choices=num_choices) + + reshaped_logits = outputs.logits + kwargs = outputs.kwargs + embedding_output = outputs.embedding_output + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=reshaped_logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + **kwargs) + + return MultipleChoiceModelOutput( + loss=loss, + 
logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Sbert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + SBERT_START_DOCSTRING, +) +class SbertForTokenClassification(SbertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config: SbertConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + self.bert = SbertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.bert(**kwargs) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + outputs['logits'] = logits + outputs.kwargs = kwargs + return outputs + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. 
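For reference, the multiple-choice head earlier folds every choice into the batch dimension before the encoder and unfolds the per-choice scores afterwards, so the classifier only ever produces one score per (example, choice) pair. A shape-only sketch with toy sizes:

import torch

batch_size, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 1000, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))    # (8, 16): choices folded into the batch
logits = torch.randn(batch_size * num_choices, 1)          # one score per (example, choice) pair
reshaped_logits = logits.view(-1, num_choices)             # (2, 4): unfolded for the softmax over choices
loss = torch.nn.CrossEntropyLoss()(reshaped_logits, torch.tensor([1, 3]))
print(flat_input_ids.shape, reshaped_logits.shape, loss.item())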
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + logits = outputs.logits + embedding_output = outputs.embedding_output + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + with_attention_mask=attention_mask is not None, + **outputs.kwargs) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Sbert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + SBERT_START_DOCSTRING, +) +class SbertForQuestionAnswering(SbertPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config: SbertConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + self.bert = SbertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.bert(**kwargs) + sequence_output = outputs[0] + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + outputs['logits'] = (start_logits, end_logits) + outputs.kwargs = kwargs + return outputs + + @add_start_docstrings_to_model_forward( + SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. 
+ Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + return self.compute_loss(outputs, start_positions, end_positions, + **outputs.kwargs) + + def compute_loss(self, + outputs, + start_positions=None, + end_positions=None, + **kwargs): + start_logits, end_logits = outputs.logits + embedding_output = outputs.embedding_output + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + if self.config.adv_grad_factor is not None and self.training: + total_loss = compute_adv_loss_pair( + embedding=embedding_output, + model=self._forward_call, + start_logits=start_logits, + end_logits=end_logits, + ori_loss=total_loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + **kwargs) + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/structbert/tokenization_sbert.py b/modelscope/models/nlp/structbert/tokenization_sbert.py new file mode 100644 index 00000000..6db69509 --- /dev/null +++ b/modelscope/models/nlp/structbert/tokenization_sbert.py @@ -0,0 +1,516 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Sbert. mainly copied from :module:`~transformers.tokenization_bert`""" + +import collections +import os +import unicodedata +from typing import List, Optional, Tuple + +from transformers.tokenization_utils import (PreTrainedTokenizer, _is_control, + _is_punctuation, _is_whitespace) + +from modelscope.utils.logger import get_logger + +logger = get_logger(__name__) + +VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} + +PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'chinese_sbert-large-std-512': 512, + 'english_sbert-large-std-512': 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + 'english_sbert-large-std-512': { + 'do_lower_case': True + }, +} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, 'r', encoding='utf-8') as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip('\n') + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class SbertTokenizer(PreTrainedTokenizer): + r""" + Construct a SBERT tokenizer. Based on WordPiece. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to do basic tokenization before WordPiece. + never_split (:obj:`Iterable`, `optional`): + Collection of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. 
+ + This should likely be deactivated for Japanese (see this `issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token='[UNK]', + sep_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + mask_token='[MASK]', + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs): + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained " + 'model use `tokenizer = SbertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`' + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([ + (ids, tok) for tok, ids in self.vocab.items() + ]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer( + vocab=self.vocab, unk_token=self.unk_token) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens): + + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = ' '.join(tokens).replace(' ##', '').strip() + return out_string + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. 
A SBERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SBERT sequence + pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + + sep) * [1] + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, + (filename_prefix + '-' if filename_prefix else '') + + VOCAB_FILES_NAMES['vocab_file']) + else: + vocab_file = (filename_prefix + + '-' if filename_prefix else '') + save_directory + with open(vocab_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted( + self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f'Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive.' 
+ ' Please check that the vocabulary is not corrupted!') + index = token_index + writer.write(token + '\n') + index += 1 + return (vocab_file, ) + + +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + never_split (:obj:`Iterable`, `optional`): + Collection of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this `issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + """ + + def __init__(self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. + + Args: + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + :func:`PreTrainedTokenizer.tokenize`) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union( + set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
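A tiny illustration of the whitespace padding performed by _tokenize_chinese_chars below, restricted to the main CJK Unified Ideographs block for brevity (the real check also covers the extension blocks):

def add_space_around_cjk(text):
    # every CJK codepoint becomes its own whitespace-separated token
    out = []
    for ch in text:
        if 0x4E00 <= ord(ch) <= 0x9FFF:  # main CJK block only, for brevity
            out.extend([' ', ch, ' '])
        else:
            out.append(ch)
    return ''.join(out)

print(add_space_around_cjk('play中文games').split())  # ['play', '中', '文', 'games']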
+ if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(' '.join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize('NFD', text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == 'Mn': + continue + output.append(char) + return ''.join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return [''.join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(' ') + output.append(char) + output.append(' ') + else: + output.append(char) + return ''.join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((0x4E00 <= cp <= 0x9FFF) or (0x3400 <= cp <= 0x4DBF) + or (0x20000 <= cp <= 0x2A6DF) or (0x2A700 <= cp <= 0x2B73F) + or (0x2B740 <= cp <= 0x2B81F) or (0x2B820 <= cp <= 0x2CEAF) + or (0xF900 <= cp <= 0xFAFF) or (0x2F800 <= cp <= 0x2FA1F)): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(' ') + else: + output.append(char) + return ''.join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. 
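A toy run of this greedy longest-match-first procedure, using the WordpieceTokenizer class defined in this file and an illustrative four-entry vocabulary:

vocab = {'un': 0, '##aff': 1, '##able': 2, '[UNK]': 3}
wp = WordpieceTokenizer(vocab=vocab, unk_token='[UNK]')
print(wp.tokenize('unaffable'))    # ['un', '##aff', '##able']
print(wp.tokenize('unknownword'))  # ['[UNK]']: no greedy cover exists in this toy vocabulary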
+ """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = ''.join(chars[start:end]) + if start > 0: + substr = '##' + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens diff --git a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py b/modelscope/models/nlp/structbert/tokenization_sbert_fast.py new file mode 100644 index 00000000..b02039c6 --- /dev/null +++ b/modelscope/models/nlp/structbert/tokenization_sbert_fast.py @@ -0,0 +1,200 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Tokenization classes for Sbert. mainly copied from :module:`~transformers.tokenization_bert_fast`""" + +from typing import List, Optional, Tuple + +import json +import transformers +from tokenizers import normalizers +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + +from modelscope.utils.logger import get_logger +from .tokenization_sbert import SbertTokenizer + +logger = get_logger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'vocab.txt', + 'tokenizer_file': 'tokenizer.json' +} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': {}, + 'tokenizer_file': {}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'chinese_sbert-large-std-512': 512, + 'english_sbert-large-std-512': 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + 'english_sbert-large-std-512': { + 'do_lower_case': True + }, +} + +transformers.SLOW_TO_FAST_CONVERTERS[ + 'SbertTokenizer'] = transformers.SLOW_TO_FAST_CONVERTERS['BertTokenizer'] + + +class SbertTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" SBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. 
+ sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to clean the text before tokenization by removing any control characters and replacing all + whitespaces by the classic one. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this + issue `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + wordpieces_prefix: (:obj:`str`, `optional`, defaults to :obj:`"##"`): + The prefix for subwords. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = SbertTokenizer + + def __init__(self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=True, + unk_token='[UNK]', + sep_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + mask_token='[MASK]', + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + pre_tok_state = json.loads( + self.backend_tokenizer.normalizer.__getstate__()) + if (pre_tok_state.get('lowercase', do_lower_case) != do_lower_case + or pre_tok_state.get('strip_accents', + strip_accents) != strip_accents): + pre_tok_class = getattr(normalizers, pre_tok_state.pop('type')) + pre_tok_state['lowercase'] = do_lower_case + pre_tok_state['strip_accents'] = strip_accents + self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state) + + self.do_lower_case = do_lower_case + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A SBERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. 
+ token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + + if token_ids_1: + output += token_ids_1 + [self.sep_token_id] + + return output + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SBERT sequence + pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + + sep) * [1] + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save( + save_directory, name=filename_prefix) + return tuple(files) diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py new file mode 100644 index 00000000..988f2917 --- /dev/null +++ b/modelscope/models/nlp/task_models/sequence_classification.py @@ -0,0 +1,86 @@ +import os +from typing import Any, Dict + +import json +import numpy as np + +from modelscope.metainfo import TaskModels +from modelscope.models.builder import MODELS +from modelscope.models.nlp.task_models.task_model import \ + SingleBackboneTaskModelBase +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + +__all__ = ['SequenceClassificationModel'] + + +@MODELS.register_module( + Tasks.sentiment_classification, module_name=TaskModels.text_classification) +@MODELS.register_module( + Tasks.text_classification, module_name=TaskModels.text_classification) +class SequenceClassificationModel(SingleBackboneTaskModelBase): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the sequence classification model from the `model_dir` path. + + Args: + model_dir (str): the model path. 
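Taken together, `SbertTokenizerFast.build_inputs_with_special_tokens` and `create_token_type_ids_from_sequences` above produce the usual BERT-style pair layout. A small sketch with hypothetical token IDs (101/102 are the conventional [CLS]/[SEP] IDs; real IDs come from the checkpoint's vocabulary):

```python
cls_id, sep_id = 101, 102            # assumed [CLS]/[SEP] IDs, illustrative only
ids_a, ids_b = [7, 8, 9], [4, 5]     # hypothetical token IDs for the two sentences

input_ids = [cls_id] + ids_a + [sep_id] + ids_b + [sep_id]          # [CLS] A [SEP] B [SEP]
token_type_ids = [0] * (len(ids_a) + 2) + [1] * (len(ids_b) + 1)    # 0s over [CLS] A [SEP], 1s over B [SEP]
assert input_ids == [101, 7, 8, 9, 102, 4, 5, 102]
assert token_type_ids == [0, 0, 0, 0, 0, 1, 1, 1]
```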
+ """ + super().__init__(model_dir, *args, **kwargs) + if 'base_model_prefix' in kwargs: + self._base_model_prefix = kwargs['base_model_prefix'] + + backbone_cfg = self.cfg.backbone + head_cfg = self.cfg.head + + # get the num_labels from label_mapping.json + self.id2label = {} + self.label_path = os.path.join(model_dir, 'label_mapping.json') + if os.path.exists(self.label_path): + with open(self.label_path) as f: + self.label_mapping = json.load(f) + self.id2label = { + idx: name + for name, idx in self.label_mapping.items() + } + head_cfg['num_labels'] = len(self.label_mapping) + + self.build_backbone(backbone_cfg) + self.build_head(head_cfg) + + def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + outputs = super().forward(input) + sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + outputs = self.head.forward(pooled_output) + if 'labels' in input: + loss = self.compute_loss(outputs, input['labels']) + outputs.update(loss) + return outputs + + def extract_logits(self, outputs): + return outputs[OutputKeys.LOGITS].cpu().detach() + + def extract_backbone_outputs(self, outputs): + sequence_output = None + pooled_output = None + if hasattr(self.backbone, 'extract_sequence_outputs'): + sequence_output = self.backbone.extract_sequence_outputs(outputs) + if hasattr(self.backbone, 'extract_pooled_outputs'): + pooled_output = self.backbone.extract_pooled_outputs(outputs) + return sequence_output, pooled_output + + def compute_loss(self, outputs, labels): + loss = self.head.compute_loss(outputs, labels) + return loss + + def postprocess(self, input, **kwargs): + logits = self.extract_logits(input) + probs = logits.softmax(-1).numpy() + pred = logits.argmax(-1).numpy() + logits = logits.numpy() + res = { + OutputKeys.PREDICTIONS: pred, + OutputKeys.PROBABILITIES: probs, + OutputKeys.LOGITS: logits + } + return res diff --git a/modelscope/models/nlp/task_model.py b/modelscope/models/nlp/task_models/task_model.py similarity index 98% rename from modelscope/models/nlp/task_model.py rename to modelscope/models/nlp/task_models/task_model.py index e83c6604..104b4c32 100644 --- a/modelscope/models/nlp/task_model.py +++ b/modelscope/models/nlp/task_models/task_model.py @@ -11,8 +11,8 @@ from modelscope.models.base import TorchModel from modelscope.models.builder import build_backbone, build_head from modelscope.utils.config import ConfigDict from modelscope.utils.constant import Fields, Tasks +from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger -from modelscope.utils.utils import if_func_receive_dict_inputs logger = get_logger(__name__) @@ -424,12 +424,15 @@ class SingleBackboneTaskModelBase(BaseTaskModel): def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: """default forward method is the backbone-only forward""" - if if_func_receive_dict_inputs(self.backbone.forward): + if func_receive_dict_inputs(self.backbone.forward): outputs = self.backbone.forward(input) else: outputs = self.backbone.forward(**input) return outputs + def compute_loss(self, outputs: Dict[str, Any], labels): + raise NotImplementedError() + class EncoderDecoderTaskModelBase(BaseTaskModel): """ @@ -472,13 +475,13 @@ class EncoderDecoderTaskModelBase(BaseTaskModel): return getattr(self, self._decoder_prefix) def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - if if_func_receive_dict_inputs(self.encoder_.forward): + if func_receive_dict_inputs(self.encoder_.forward): encoder_outputs = self.encoder_.forward(input) else: 
encoder_outputs = self.encoder_.forward(**input) decoder_inputs = self.project_decoder_inputs_and_mediate( input, encoder_outputs) - if if_func_receive_dict_inputs(self.decoder_.forward): + if func_receive_dict_inputs(self.decoder_.forward): outputs = self.decoder_.forward(decoder_inputs) else: outputs = self.decoder_.forward(**decoder_inputs) diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py new file mode 100644 index 00000000..ebb1eda2 --- /dev/null +++ b/modelscope/models/nlp/token_classification.py @@ -0,0 +1,147 @@ +from abc import abstractmethod +from typing import Dict + +import numpy as np +import torch +from torch import nn + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) +from .structbert import SbertPreTrainedModel + +__all__ = ['SbertForTokenClassification'] + + +class TokenClassification(TorchModel): + + base_model_prefix: str = 'bert' + + def __init__(self, config, model_dir): + super().__init__(model_dir) + self.num_labels = config.num_labels + self.config = config + setattr(self, self.base_model_prefix, self.build_base_model()) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + @abstractmethod + def build_base_model(self): + """Build the backbone model. + + Returns: the backbone instance. + """ + pass + + @property + def base_model(self): + return getattr(self, self.base_model_prefix) + + def compute_loss(self, logits, labels, **kwargs): + """Compute loss. + + For example, if backbone is pretrained model, there will be a 'attention_mask' parameter to skip + useless tokens. + + Args: + logits: The logits from the classifier + labels: The labels + **kwargs: Other input params. + + Returns: Loss. 
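Stepping back to `SequenceClassificationModel` above: the `label_mapping.json` it reads maps label names to indices, and the model inverts that mapping into `id2label` and uses its size as the head's `num_labels`. A minimal sketch of the file and the inversion (the labels are illustrative):

```python
import json

# hypothetical label_mapping.json content: label name -> id
label_mapping = json.loads('{"negative": 0, "positive": 1}')
id2label = {idx: name for name, idx in label_mapping.items()}
num_labels = len(label_mapping)
assert id2label == {0: 'negative', 1: 'positive'} and num_labels == 2
```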
+ + """ + pass + + def forward(self, **kwargs): + labels = None + if OutputKeys.LABEL in kwargs: + labels = kwargs.pop(OutputKeys.LABEL) + elif OutputKeys.LABELS in kwargs: + labels = kwargs.pop(OutputKeys.LABELS) + + outputs = self.base_model(**kwargs) + # base model should return the sequence_output as its first output + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + if labels is not None: + loss = self.compute_loss(logits, labels, **kwargs) + return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} + return {OutputKeys.LOGITS: logits} + + def postprocess(self, input: Dict[str, np.ndarray], + **kwargs) -> Dict[str, np.ndarray]: + logits = input[OutputKeys.LOGITS] + pred = torch.argmax(logits[0], dim=-1) + pred = torch_nested_numpify(torch_nested_detach(pred)) + logits = torch_nested_numpify(torch_nested_detach(logits)) + rst = {OutputKeys.PREDICTIONS: pred, OutputKeys.LOGITS: logits} + return rst + + +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) +@MODELS.register_module( + Tasks.token_classification, module_name=Models.structbert) +class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): + + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config, model_dir): + if hasattr(config, 'base_model_prefix'): + SbertForTokenClassification.base_model_prefix = config.base_model_prefix + super().__init__(config, model_dir) + + def build_base_model(self): + from .structbert import SbertModel + return SbertModel(self.config, add_pooling_layer=False) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + labels=None, + **kwargs): + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + labels=labels) + + def compute_loss(self, logits, labels, attention_mask=None, **kwargs): + loss_fct = nn.CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + return loss_fct(active_logits, active_labels) + else: + return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(SbertPreTrainedModel, + SbertForTokenClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/models/nlp/veco/__init__.py b/modelscope/models/nlp/veco/__init__.py new file mode 100644 index 00000000..0fe786fd --- /dev/null +++ b/modelscope/models/nlp/veco/__init__.py @@ -0,0 +1,43 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .configuration_veco import VecoConfig + from .modeling_veco import (VecoForMaskedLM, VecoForSequenceClassification, + VecoModel) + from .tokenization_veco import VecoTokenizer + from .tokenization_veco_fast import VecoTokenizerFast +else: + _import_structure = { + 'configuration_veco': ['VecoConfig'], + 'modeling_veco': + ['VecoForMaskedLM', 'VecoForSequenceClassification', 'VecoModel'], + 'tokenization_veco': ['VecoTokenizer'], + 'tokenization_veco_fast': ['VecoTokenizerFast'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/veco/configuration_veco.py b/modelscope/models/nlp/veco/configuration_veco.py new file mode 100644 index 00000000..396755dc --- /dev/null +++ b/modelscope/models/nlp/veco/configuration_veco.py @@ -0,0 +1,33 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors. +# Copyright 2020 The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Veco configuration, mainly copied from :class:`~transformers.configuration_xlm_roberta` """ + +from transformers import RobertaConfig + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class VecoConfig(RobertaConfig): + """ + This class overrides [`RobertaConfig`]. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + model_type = 'veco' diff --git a/modelscope/models/nlp/veco/modeling_veco.py b/modelscope/models/nlp/veco/modeling_veco.py new file mode 100644 index 00000000..b519c236 --- /dev/null +++ b/modelscope/models/nlp/veco/modeling_veco.py @@ -0,0 +1,143 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Veco model. mainly copied from :module:`~transformers.modeling_xlm_roberta`""" + +from transformers import (RobertaForMaskedLM, RobertaForMultipleChoice, + RobertaForQuestionAnswering, + RobertaForSequenceClassification, + RobertaForTokenClassification, RobertaModel) +from transformers.file_utils import add_start_docstrings + +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.utils import logger as logging +from modelscope.utils.constant import Fields +from .configuration_veco import VecoConfig + +logger = logging.get_logger(__name__) + +VECO_PRETRAINED_MODEL_ARCHIVE_LIST = [] + +VECO_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. +""" + + +@add_start_docstrings( + 'The bare Veco Model transformer outputting raw hidden-states without any specific head on top.', + VECO_START_DOCSTRING, +) +class VecoModel(RobertaModel): + """ + This class overrides [`RobertaModel`]. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + VECO_START_DOCSTRING, +) +class VecoForSequenceClassification(RobertaForSequenceClassification): + """ + This class overrides [`RobertaForSequenceClassification`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model transformer with a masked language model head on top (a linear layer on top of the + pooled output). + """, + VECO_START_DOCSTRING, +) +class VecoForMaskedLM(RobertaForMaskedLM): + """ + This class overrides [`RobertaForMaskedLM`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model with a multiple choice classification head on top (a linear layer on top of the pooled output and + a softmax) e.g. for RocStories/SWAG tasks. + """, + VECO_START_DOCSTRING, +) +class VecoForMultipleChoice(RobertaForMultipleChoice): + """ + This class overrides [`RobertaForMultipleChoice`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. 
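The Veco classes above are deliberately thin: each head reuses the corresponding Roberta implementation and only overrides `config_class`, so checkpoints with `model_type = 'veco'` load through the standard `from_pretrained` machinery. A usage sketch under assumed inputs (the directory and label count are illustrative):

```python
from modelscope.models.nlp.veco import VecoConfig, VecoForSequenceClassification

# hypothetical local directory containing Veco weights in Hugging Face format
config = VecoConfig.from_pretrained('/path/to/veco', num_labels=3)
model = VecoForSequenceClassification.from_pretrained('/path/to/veco', config=config)
```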
+ """, + VECO_START_DOCSTRING, +) +class VecoForTokenClassification(RobertaForTokenClassification): + """ + This class overrides [`RobertaForTokenClassification`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + +@add_start_docstrings( + """ + Veco Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + VECO_START_DOCSTRING, +) +class VecoForQuestionAnswering(RobertaForQuestionAnswering): + """ + This class overrides [`RobertaForQuestionAnswering`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig diff --git a/modelscope/models/nlp/veco/tokenization_veco.py b/modelscope/models/nlp/veco/tokenization_veco.py new file mode 100644 index 00000000..21711456 --- /dev/null +++ b/modelscope/models/nlp/veco/tokenization_veco.py @@ -0,0 +1,321 @@ +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for Veco. mainly copied from :module:`~transformers.tokenization_xlm_roberta`""" + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm +from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = '▁' + +VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} + +PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} + + +class VecoTokenizer(PreTrainedTokenizer): + """ + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. 
two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. + The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) + can be used, among other things, to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ['input_ids', 'attention_mask'] + + def __init__(self, + vocab_file, + bos_token='', + eos_token='', + sep_token='', + cls_token='', + unk_token='', + pad_token='', + mask_token='', + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs) -> None: + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken( + mask_token, lstrip=True, rstrip=False) if isinstance( + mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file + + # Original fairseq vocab and spm vocab must be "aligned": + # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 + # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- + # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' + # spm | '' | '' | '' | ',' | '.' 
| '▁' | 's' | '▁de' | '-' | '▁a' + + # Mimic fairseq token-to-id alignment for the first 4 token + self.fairseq_tokens_to_ids = { + '': 0, + '': 1, + '': 2, + '': 3 + } + + # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab + self.fairseq_offset = 1 + + self.fairseq_tokens_to_ids[''] = len( + self.sp_model) + self.fairseq_offset + self.fairseq_ids_to_tokens = { + v: k + for k, v in self.fairseq_tokens_to_ids.items() + } + + def __getstate__(self): + state = self.__dict__.copy() + state['sp_model'] = None + state['sp_model_proto'] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, 'sp_model_kwargs'): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An Veco sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ( + [0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. Veco does + not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
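The id bookkeeping above mirrors XLM-R: sentencepiece piece ids are shifted by `fairseq_offset = 1` so ids 0-3 stay reserved for `<s>`, `<pad>`, `</s>`, `<unk>`, and `<mask>` is appended after the sentencepiece vocabulary; a single sequence is laid out as `<s> X </s>`, a pair as `<s> A </s></s> B </s>`, with all-zero token type ids. A minimal sketch of the token-to-id conversion, using a hypothetical sentencepiece table:

```python
fairseq_tokens_to_ids = {'<s>': 0, '<pad>': 1, '</s>': 2, '<unk>': 3}
fairseq_offset = 1

def convert_token_to_id(token, sp_piece_to_id):
    # mirrors VecoTokenizer._convert_token_to_id: reserved ids first, then the
    # sentencepiece id plus the offset; sentencepiece id 0 (unknown) maps to <unk>
    if token in fairseq_tokens_to_ids:
        return fairseq_tokens_to_ids[token]
    spm_id = sp_piece_to_id.get(token, 0)
    return spm_id + fairseq_offset if spm_id else fairseq_tokens_to_ids['<unk>']

toy_sp = {',': 3, '.': 4, '▁de': 8}                      # hypothetical sentencepiece ids
assert convert_token_to_id(',', toy_sp) == 4             # shifted by the offset, as in the table above
assert convert_token_to_id('<s>', toy_sp) == 0           # reserved id
assert convert_token_to_id('never-seen', toy_sp) == 3    # falls back to <unk>
```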
+ + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len( + self.sp_model) + self.fairseq_offset + 1 # Add the token + + def get_vocab(self): + vocab = { + self.convert_ids_to_tokens(i): i + for i in range(self.vocab_size) + } + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + # Need to return unknown token if the SP model returned 0 + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + return out_string + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error( + f'Vocabulary path ({save_directory}) should be a directory') + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + '-' if filename_prefix else '') + + VOCAB_FILES_NAMES['vocab_file']) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file, ) diff --git a/modelscope/models/nlp/veco/tokenization_veco_fast.py b/modelscope/models/nlp/veco/tokenization_veco_fast.py new file mode 100644 index 00000000..3edae0e7 --- /dev/null +++ b/modelscope/models/nlp/veco/tokenization_veco_fast.py @@ -0,0 +1,213 @@ +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Fast Tokenization classes for Veco. 
mainly copied from :module:`~transformers.tokenization_xlm_roberta_fast`""" + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +import transformers +from transformers.file_utils import is_sentencepiece_available +from transformers.tokenization_utils import AddedToken +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + +from modelscope.utils import logger as logging + +if is_sentencepiece_available(): + from .tokenization_veco import VecoTokenizer +else: + VecoTokenizer = None + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'sentencepiece.bpe.model', + 'tokenizer_file': 'tokenizer.json' +} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': {}, + 'tokenizer_file': {}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} + +transformers.SLOW_TO_FAST_CONVERTERS[ + 'VecoTokenizer'] = transformers.SLOW_TO_FAST_CONVERTERS[ + 'XLMRobertaTokenizer'] + + +class VecoTokenizerFast(PreTrainedTokenizerFast): + """ + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. + Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models). + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ['input_ids', 'attention_mask'] + slow_tokenizer_class = VecoTokenizer + + def __init__(self, + vocab_file=None, + tokenizer_file=None, + bos_token='', + eos_token='', + sep_token='', + cls_token='', + unk_token='', + pad_token='', + mask_token='', + **kwargs): + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken( + mask_token, lstrip=True, rstrip=False) if isinstance( + mask_token, str) else mask_token + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs, + ) + + self.vocab_file = vocab_file + self.can_save_slow_tokenizer = False if not self.vocab_file else True + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An Veco sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. Veco does + not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
+ + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow ' + 'tokenizer.') + + if not os.path.isdir(save_directory): + logger.error( + f'Vocabulary path ({save_directory}) should be a directory.') + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + '-' if filename_prefix else '') + + VOCAB_FILES_NAMES['vocab_file']) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file, ) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 8174d054..f6896e4a 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -517,3 +517,10 @@ class MsDataset: def to_hf_dataset(self) -> Dataset: self._hf_ds.reset_format() return self._hf_ds + + @staticmethod + def interleave_datasets(datasets: List[Any], + probabilities: Optional[List[float]] = None, + seed: Optional[int] = None): + from datasets import interleave_datasets + return interleave_datasets(datasets, probabilities, seed) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 0937e441..a82f6ed5 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -9,6 +9,7 @@ class OutputKeys(object): SCORES = 'scores' LABEL = 'label' LABELS = 'labels' + INPUT_IDS = 'input_ids' LABEL_POS = 'label_pos' POSES = 'poses' CAPTION = 'caption' diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index e6a35efc..1111f0d3 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -9,9 +9,8 @@ if TYPE_CHECKING: from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline from .fill_mask_pipeline import FillMaskPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline - from .nli_pipeline import NLIPipeline - from .sentence_similarity_pipeline import SentenceSimilarityPipeline - from .sentiment_classification_pipeline import SentimentClassificationPipeline + from .pair_sentence_classification_pipeline import PairSentenceClassificationPipeline + from .single_sentence_classification_pipeline import SingleSentenceClassificationPipeline from .sequence_classification_pipeline import SequenceClassificationPipeline from .text_generation_pipeline import TextGenerationPipeline from .translation_pipeline import TranslationPipeline @@ -28,10 +27,10 @@ else: 'dialog_modeling_pipeline': ['DialogModelingPipeline'], 'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'], 'fill_mask_pipeline': ['FillMaskPipeline'], - 'nli_pipeline': ['NLIPipeline'], - 'sentence_similarity_pipeline': ['SentenceSimilarityPipeline'], - 'sentiment_classification_pipeline': - ['SentimentClassificationPipeline'], + 'single_sentence_classification_pipeline': + ['SingleSentenceClassificationPipeline'], + 'pair_sentence_classification_pipeline': + ['PairSentenceClassificationPipeline'], 'sequence_classification_pipeline': ['SequenceClassificationPipeline'], 'text_generation_pipeline': ['TextGenerationPipeline'], 'word_segmentation_pipeline': ['WordSegmentationPipeline'], diff 
--git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 27c34817..e4affe40 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -5,11 +5,10 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp.masked_language import MaskedLanguageModelBase from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import FillMaskPreprocessor +from modelscope.preprocessors import FillMaskPreprocessor, Preprocessor from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -21,18 +20,18 @@ _type_map = {'veco': 'roberta', 'sbert': 'bert'} class FillMaskPipeline(Pipeline): def __init__(self, - model: Union[MaskedLanguageModelBase, str], - preprocessor: Optional[FillMaskPreprocessor] = None, - first_sequence='sentense', + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + first_sequence='sentence', **kwargs): """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction Args: - model (MaskedLanguageModelBase): a model instance - preprocessor (FillMaskPreprocessor): a preprocessor instance + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance """ fill_mask_model = model if isinstance( - model, MaskedLanguageModelBase) else Model.from_pretrained(model) + model, Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = FillMaskPreprocessor( @@ -73,7 +72,7 @@ class FillMaskPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return super().forward(inputs, **forward_params) + return self.model(inputs, **forward_params) def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: """process the prediction results @@ -85,8 +84,8 @@ class FillMaskPipeline(Pipeline): Dict[str, str]: the prediction results """ import numpy as np - logits = inputs['logits'].detach().cpu().numpy() - input_ids = inputs['input_ids'].detach().cpu().numpy() + logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() + input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() pred_ids = np.argmax(logits, axis=-1) model_type = self.model.config.model_type process_type = model_type if model_type in self.mask_id else _type_map[ diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 65334144..29c439fc 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -4,11 +4,10 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import TransformerCRFForNamedEntityRecognition from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NERPreprocessor +from modelscope.preprocessors import NERPreprocessor, Preprocessor from modelscope.utils.constant import Tasks __all__ = ['NamedEntityRecognitionPipeline'] @@ -20,13 +19,12 @@ __all__ = ['NamedEntityRecognitionPipeline'] class NamedEntityRecognitionPipeline(Pipeline): 
def __init__(self, - model: Union[TransformerCRFForNamedEntityRecognition, str], - preprocessor: Optional[NERPreprocessor] = None, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, **kwargs): model = model if isinstance(model, - TransformerCRFForNamedEntityRecognition - ) else Model.from_pretrained(model) + Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = NERPreprocessor(model.model_dir) model.eval() diff --git a/modelscope/pipelines/nlp/nli_pipeline.py b/modelscope/pipelines/nlp/nli_pipeline.py deleted file mode 100644 index 200f44e4..00000000 --- a/modelscope/pipelines/nlp/nli_pipeline.py +++ /dev/null @@ -1,73 +0,0 @@ -import uuid -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import SbertForNLI -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NLIPreprocessor -from modelscope.utils.constant import Tasks - -__all__ = ['NLIPipeline'] - - -@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) -class NLIPipeline(Pipeline): - - def __init__(self, - model: Union[SbertForNLI, str], - preprocessor: NLIPreprocessor = None, - first_sequence='first_sequence', - second_sequence='second_sequence', - **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction - - Args: - model (SbertForNLI): a model instance - preprocessor (NLIPreprocessor): a preprocessor instance - """ - assert isinstance(model, str) or isinstance(model, SbertForNLI), \ - 'model must be a single str or SbertForNLI' - model = model if isinstance( - model, SbertForNLI) else Model.from_pretrained(model) - if preprocessor is None: - preprocessor = NLIPreprocessor( - model.model_dir, - first_sequence=first_sequence, - second_sequence=second_sequence) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - assert len(model.id2label) > 0 - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return super().forward(inputs, **forward_params) - - def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs['probabilities'][0] - num_classes = probs.shape[0] - topk = min(topk, num_classes) - top_indices = np.argpartition(probs, -topk)[-topk:] - cls_ids = top_indices[np.argsort(probs[top_indices])] - probs = probs[cls_ids].tolist() - - cls_names = [self.model.id2label[cid] for cid in cls_ids] - - return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py new file mode 100644 index 00000000..0804ec8c --- /dev/null +++ b/modelscope/pipelines/nlp/pair_sentence_classification_pipeline.py @@ -0,0 +1,37 @@ +from typing import Union + +from modelscope.models.base import Model +from ...metainfo import Pipelines +from ...preprocessors import (PairSentenceClassificationPreprocessor, + Preprocessor) +from ...utils.constant import Tasks +from ..builder import PIPELINES +from .sequence_classification_pipeline_base import \ + SequenceClassificationPipelineBase + 
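With the dedicated NLI and sentence-similarity pipelines removed, both tasks are served by this single pair-sentence pipeline, registered under `Tasks.nli` and `Tasks.sentence_similarity` as shown below. A usage sketch under assumptions: the model path is illustrative, and the input form (a sentence tuple mapped to the preprocessor's `first_sequence`/`second_sequence` fields) depends on the preprocessor in use:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# hypothetical model path; any model whose configuration resolves to this pipeline works
nli_pipe = pipeline(task=Tasks.nli, model='/path/to/nli-model')

# assumed input form: a (sentence1, sentence2) pair; a dict keyed by
# first_sequence/second_sequence is the other common form
result = nli_pipe(('A man is eating food.', 'A man is eating.'))
# result -> {'scores': [...], 'labels': [...]} from the shared postprocess
```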
+__all__ = ['PairSentenceClassificationPipeline'] + + +@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) +@PIPELINES.register_module( + Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) +class PairSentenceClassificationPipeline(SequenceClassificationPipelineBase): + + def __init__(self, + model: Union[Model, str], + preprocessor: Preprocessor = None, + first_sequence='first_sequence', + second_sequence='second_sequence', + **kwargs): + """use `model` and `preprocessor` to create a nlp pair sentence classification pipeline for prediction + + Args: + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance + """ + if preprocessor is None: + preprocessor = PairSentenceClassificationPreprocessor( + model.model_dir if isinstance(model, Model) else model, + first_sequence=first_sequence, + second_sequence=second_sequence) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py deleted file mode 100644 index c09e2115..00000000 --- a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py +++ /dev/null @@ -1,73 +0,0 @@ -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import SbertForSentenceSimilarity -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Input, Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SentenceSimilarityPreprocessor -from modelscope.utils.constant import Tasks - -__all__ = ['SentenceSimilarityPipeline'] - - -@PIPELINES.register_module( - Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) -class SentenceSimilarityPipeline(Pipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: SentenceSimilarityPreprocessor = None, - first_sequence='first_sequence', - second_sequence='second_sequence', - **kwargs): - """use `model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction - - Args: - model (SbertForSentenceSimilarity): a model instance - preprocessor (SentenceSimilarityPreprocessor): a preprocessor instance - """ - assert isinstance(model, str) or isinstance(model, SbertForSentenceSimilarity), \ - 'model must be a single str or SbertForSentenceSimilarity' - sc_model = model if isinstance( - model, - SbertForSentenceSimilarity) else Model.from_pretrained(model) - if preprocessor is None: - preprocessor = SentenceSimilarityPreprocessor( - sc_model.model_dir, - first_sequence=first_sequence, - second_sequence=second_sequence) - sc_model.eval() - super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs) - - assert hasattr(self.model, 'id2label'), \ - 'id2label map should be initalizaed in init function.' 
- - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return super().forward(inputs, **forward_params) - - def postprocess(self, inputs: Dict[str, Any], - **postprocess_params) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs['probabilities'][0] - num_classes = probs.shape[0] - top_indices = np.argpartition(probs, -num_classes)[-num_classes:] - cls_ids = top_indices[np.argsort(-probs[top_indices], axis=-1)] - probs = probs[cls_ids].tolist() - cls_names = [self.model.id2label[cid] for cid in cls_ids] - b = 0 - return {OutputKeys.SCORES: probs[b], OutputKeys.LABELS: cls_names[b]} diff --git a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py b/modelscope/pipelines/nlp/sentiment_classification_pipeline.py deleted file mode 100644 index 8e57d77b..00000000 --- a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py +++ /dev/null @@ -1,74 +0,0 @@ -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp import SequenceClassificationModel -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import SentimentClassificationPreprocessor -from modelscope.utils.constant import Tasks - -__all__ = ['SentimentClassificationPipeline'] - - -@PIPELINES.register_module( - Tasks.sentiment_classification, - module_name=Pipelines.sentiment_classification) -class SentimentClassificationPipeline(Pipeline): - - def __init__(self, - model: Union[SequenceClassificationModel, str], - preprocessor: SentimentClassificationPreprocessor = None, - first_sequence='first_sequence', - second_sequence='second_sequence', - **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction - - Args: - model (SequenceClassificationModel): a model instance - preprocessor (SentimentClassificationPreprocessor): a preprocessor instance - """ - assert isinstance(model, str) or isinstance(model, SequenceClassificationModel), \ - 'model must be a single str or SentimentClassification' - model = model if isinstance( - model, - SequenceClassificationModel) else Model.from_pretrained(model) - if preprocessor is None: - preprocessor = SentimentClassificationPreprocessor( - model.model_dir, - first_sequence=first_sequence, - second_sequence=second_sequence) - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - assert len(model.id2label) > 0 - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return super().forward(inputs, **forward_params) - - def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs['probabilities'][0] - num_classes = probs.shape[0] - topk = min(topk, num_classes) - top_indices = np.argpartition(probs, -topk)[-topk:] - cls_ids = top_indices[np.argsort(probs[top_indices])] - probs = probs[cls_ids].tolist() - - cls_names = [self.model.id2label[cid] for cid in cls_ids] - return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git 
a/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py new file mode 100644 index 00000000..ad31bfbd --- /dev/null +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline_base.py @@ -0,0 +1,60 @@ +from typing import Any, Dict, Union + +import numpy as np +import torch + +from modelscope.models.base import Model +from modelscope.outputs import OutputKeys +from ...preprocessors import Preprocessor +from ..base import Pipeline + + +class SequenceClassificationPipelineBase(Pipeline): + + def __init__(self, model: Union[Model, str], preprocessor: Preprocessor, + **kwargs): + """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + + Args: + model (str or Model): a model instance + preprocessor (Preprocessor): a preprocessor instance + """ + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or Model' + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + assert preprocessor is not None + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return self.model(inputs, **forward_params) + + def postprocess(self, + inputs: Dict[str, Any], + topk: int = 5) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + topk (int): The topk probs to take + Returns: + Dict[str, str]: the prediction results + """ + + probs = inputs[OutputKeys.PROBABILITIES][0] + num_classes = probs.shape[0] + topk = min(topk, num_classes) + top_indices = np.argpartition(probs, -topk)[-topk:] + cls_ids = top_indices[np.argsort(probs[top_indices])] + probs = probs[cls_ids].tolist() + + cls_names = [self.id2label[cid] for cid in cls_ids] + return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py new file mode 100644 index 00000000..8e0b4fe0 --- /dev/null +++ b/modelscope/pipelines/nlp/single_sentence_classification_pipeline.py @@ -0,0 +1,35 @@ +from typing import Union + +from ...metainfo import Pipelines +from ...models import Model +from ...preprocessors import (Preprocessor, + SingleSentenceClassificationPreprocessor) +from ...utils.constant import Tasks +from ..builder import PIPELINES +from .sequence_classification_pipeline_base import \ + SequenceClassificationPipelineBase + +__all__ = ['SingleSentenceClassificationPipeline'] + + +@PIPELINES.register_module( + Tasks.sentiment_classification, + module_name=Pipelines.sentiment_classification) +class SingleSentenceClassificationPipeline(SequenceClassificationPipelineBase): + + def __init__(self, + model: Union[Model, str], + preprocessor: Preprocessor = None, + first_sequence='first_sequence', + **kwargs): + """use `model` and `preprocessor` to create a nlp single sentence classification pipeline for prediction + + Args: + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance + """ + 
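The shared `postprocess` in `SequenceClassificationPipelineBase` above reduces the per-sample probabilities to the top-k labels using `argpartition` plus an ascending `argsort`, so the returned scores are ordered low-to-high within the top k. A worked sketch with made-up probabilities and labels:

```python
import numpy as np

probs = np.array([0.1, 0.6, 0.3])                        # hypothetical softmax output for one sample
id2label = {0: 'neutral', 1: 'entailment', 2: 'contradiction'}
topk = 2
top_indices = np.argpartition(probs, -topk)[-topk:]      # indices of the 2 largest probs
cls_ids = top_indices[np.argsort(probs[top_indices])]    # sorted ascending by probability
print(probs[cls_ids].tolist())                           # [0.3, 0.6]
print([id2label[cid] for cid in cls_ids])                # ['contradiction', 'entailment']
```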
if preprocessor is None: + preprocessor = SingleSentenceClassificationPreprocessor( + model.model_dir if isinstance(model, Model) else model, + first_sequence=first_sequence) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index 85a81eba..287c98ff 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -3,7 +3,7 @@ from typing import Any, Dict, Optional, Union import torch from modelscope.metainfo import Pipelines -from modelscope.models.base import TorchModel +from modelscope.models.base import Model from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import TextGenerationPreprocessor @@ -17,7 +17,7 @@ __all__ = ['TextGenerationPipeline'] class TextGenerationPipeline(Pipeline): def __init__(self, - model: Union[TorchModel, str], + model: Union[Model, str], preprocessor: Optional[TextGenerationPreprocessor] = None, **kwargs): """use `model` and `preprocessor` to create a nlp text generation pipeline for prediction @@ -26,8 +26,8 @@ class TextGenerationPipeline(Pipeline): model (PalmForTextGeneration): a model instance preprocessor (TextGenerationPreprocessor): a preprocessor instance """ - model = model if isinstance( - model, TorchModel) else TorchModel.from_pretrained(model) + model = model if isinstance(model, + Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = TextGenerationPreprocessor( model.model_dir, diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py index fdf9be64..dba3fe9f 100644 --- a/modelscope/pipelines/nlp/translation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_pipeline.py @@ -4,11 +4,9 @@ from typing import Any, Dict import numpy as np import tensorflow as tf -from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Pipelines -from modelscope.models.nlp import CsanmtForTranslation from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 73d0c278..06e6a31c 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -4,11 +4,11 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import SbertForTokenClassification from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import TokenClassificationPreprocessor +from modelscope.preprocessors import (Preprocessor, + TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks __all__ = ['WordSegmentationPipeline'] @@ -18,33 +18,35 @@ __all__ = ['WordSegmentationPipeline'] Tasks.word_segmentation, module_name=Pipelines.word_segmentation) class WordSegmentationPipeline(Pipeline): - def __init__( - self, - model: Union[SbertForTokenClassification, str], - 
preprocessor: Optional[TokenClassificationPreprocessor] = None, - **kwargs): + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): """use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction Args: - model (StructBertForTokenClassification): a model instance - preprocessor (TokenClassificationPreprocessor): a preprocessor instance + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance """ - model = model if isinstance( - model, - SbertForTokenClassification) else Model.from_pretrained(model) + model = model if isinstance(model, + Model) else Model.from_pretrained(model) if preprocessor is None: preprocessor = TokenClassificationPreprocessor(model.model_dir) model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.tokenizer = preprocessor.tokenizer - self.config = model.config - assert len(self.config.id2label) > 0 - self.id2label = self.config.id2label + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: + text = inputs.pop(OutputKeys.TEXT) with torch.no_grad(): - return super().forward(inputs, **forward_params) + return { + **self.model(inputs, **forward_params), OutputKeys.TEXT: text + } def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index 642d4870..d0dd2336 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -5,11 +5,11 @@ from scipy.special import softmax from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import SbertForZeroShotClassification from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import ZeroShotClassificationPreprocessor +from modelscope.preprocessors import (Preprocessor, + ZeroShotClassificationPreprocessor) from modelscope.utils.constant import Tasks __all__ = ['ZeroShotClassificationPipeline'] @@ -21,19 +21,18 @@ __all__ = ['ZeroShotClassificationPipeline'] class ZeroShotClassificationPipeline(Pipeline): def __init__(self, - model: Union[SbertForZeroShotClassification, str], - preprocessor: ZeroShotClassificationPreprocessor = None, + model: Union[Model, str], + preprocessor: Preprocessor = None, **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + """use `model` and `preprocessor` to create a nlp zero-shot text classification pipeline for prediction Args: - model (SbertForZeroShotClassification): a model instance - preprocessor (SentimentClassificationPreprocessor): a preprocessor instance + model (Model): a model instance + preprocessor (Preprocessor): a preprocessor instance """ - assert isinstance(model, str) or isinstance(model, SbertForZeroShotClassification), \ - 'model must be a single str or SbertForZeroShotClassification' - model = model if isinstance( - model, - 
SbertForZeroShotClassification) else Model.from_pretrained(model) + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or Model' + model = model if isinstance(model, + Model) else Model.from_pretrained(model) self.entailment_id = 0 self.contradiction_id = 2 if preprocessor is None: @@ -58,7 +57,7 @@ class ZeroShotClassificationPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return super().forward(inputs, **forward_params) + return self.model(inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], @@ -70,7 +69,7 @@ class ZeroShotClassificationPipeline(Pipeline): Returns: Dict[str, Any]: the prediction results """ - logits = inputs['logits'] + logits = inputs[OutputKeys.LOGITS] if multi_label or len(candidate_labels) == 1: logits = logits[..., [self.contradiction_id, self.entailment_id]] scores = softmax(logits, axis=-1)[..., 1] diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 9d991146..c73a6c4f 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -18,11 +18,11 @@ if TYPE_CHECKING: MPlugVisualQuestionAnsweringPreprocessor) from .nlp import (Tokenize, SequenceClassificationPreprocessor, TextGenerationPreprocessor, - TokenClassificationPreprocessor, NLIPreprocessor, - SentimentClassificationPreprocessor, - SentenceSimilarityPreprocessor, FillMaskPreprocessor, - ZeroShotClassificationPreprocessor, NERPreprocessor, - TextErrorCorrectionPreprocessor) + TokenClassificationPreprocessor, + SingleSentenceClassificationPreprocessor, + PairSentenceClassificationPreprocessor, + FillMaskPreprocessor, ZeroShotClassificationPreprocessor, + NERPreprocessor, TextErrorCorrectionPreprocessor) from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor) @@ -46,8 +46,8 @@ else: 'nlp': [ 'Tokenize', 'SequenceClassificationPreprocessor', 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'NLIPreprocessor', 'SentimentClassificationPreprocessor', - 'SentenceSimilarityPreprocessor', 'FillMaskPreprocessor', + 'SingleSentenceClassificationPreprocessor', + 'PairSentenceClassificationPreprocessor', 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', 'TextErrorCorrectionPreprocessor' ], diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index d0142693..6360a907 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
- +import os from abc import ABC, abstractmethod from typing import Any, Dict @@ -10,6 +10,8 @@ class Preprocessor(ABC): def __init__(self, *args, **kwargs): self._mode = ModeKeys.INFERENCE + self.device = int( + os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None pass @abstractmethod diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index a0a7a5b5..f0951f38 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -2,14 +2,14 @@ import os.path as osp import uuid -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Iterable, Optional, Tuple, Union from transformers import AutoTokenizer -from modelscope.metainfo import Preprocessors -from modelscope.models import Model +from modelscope.metainfo import Models, Preprocessors +from modelscope.outputs import OutputKeys from modelscope.utils.constant import Fields, InputFields, ModeKeys -from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.hub import get_model_type, parse_label_mapping from modelscope.utils.type_assert import type_assert from .base import Preprocessor from .builder import PREPROCESSORS @@ -17,8 +17,8 @@ from .builder import PREPROCESSORS __all__ = [ 'Tokenize', 'SequenceClassificationPreprocessor', 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', - 'NLIPreprocessor', 'SentimentClassificationPreprocessor', - 'FillMaskPreprocessor', 'SentenceSimilarityPreprocessor', + 'PairSentenceClassificationPreprocessor', + 'SingleSentenceClassificationPreprocessor', 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor', 'NERPreprocessor', 'TextErrorCorrectionPreprocessor' ] @@ -38,99 +38,6 @@ class Tokenize(Preprocessor): return data -class NLPPreprocessorBase(Preprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.tokenize_kwargs = kwargs - self.tokenizer = self.build_tokenizer(model_dir) - self.label2id = parse_label_mapping(self.model_dir) - - def build_tokenizer(self, model_dir): - from sofa import SbertTokenizer - return SbertTokenizer.from_pretrained(model_dir) - - @type_assert(object, object) - def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data (tuple): [sentence1, sentence2] - sentence1 (str): a sentence - Example: - 'you are so handsome.' - sentence2 (str): a sentence - Example: - 'you are so beautiful.' 
- Returns: - Dict[str, Any]: the preprocessed data - """ - - text_a, text_b = None, None - if isinstance(data, str): - text_a = data - elif isinstance(data, tuple): - assert len(data) == 2 - text_a, text_b = data - elif isinstance(data, dict): - text_a = data.get(self.first_sequence) - text_b = data.get(self.second_sequence, None) - - rst = self.tokenizer(text_a, text_b, **self.tokenize_kwargs) - if self._mode == ModeKeys.TRAIN: - rst = {k: v.squeeze() for k, v in rst.items()} - if self.label2id is not None and 'label' in data: - rst['label'] = self.label2id[str(data['label'])] - return rst - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.nli_tokenizer) -class NLIPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['truncation'] = True - kwargs['padding'] = False - kwargs['return_tensors'] = 'pt' - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, *args, **kwargs) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) -class SentimentClassificationPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['truncation'] = True - kwargs['padding'] = 'max_length' - kwargs['return_tensors'] = 'pt' - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, *args, **kwargs) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) -class SentenceSimilarityPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['truncation'] = True - kwargs['padding'] = False if 'padding' not in kwargs else kwargs[ - 'padding'] - kwargs['return_tensors'] = 'pt' - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, *args, **kwargs) - - @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) class SequenceClassificationPreprocessor(Preprocessor): @@ -197,32 +104,193 @@ class SequenceClassificationPreprocessor(Preprocessor): return rst +class NLPTokenizerPreprocessorBase(Preprocessor): + + def __init__(self, model_dir: str, pair: bool, mode: str, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + + super().__init__(**kwargs) + self.model_dir: str = model_dir + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') + self.pair = pair + self._mode = mode + self.label = kwargs.pop('label', OutputKeys.LABEL) + self.label2id = None + if 'label2id' in kwargs: + self.label2id = kwargs.pop('label2id') + if self.label2id is None: + self.label2id = parse_label_mapping(self.model_dir) + + self.tokenize_kwargs = kwargs + self.tokenizer = self.build_tokenizer(model_dir) + + @property + def id2label(self): + if self.label2id is not None: + return {id: label for label, id in self.label2id.items()} + return None + + def build_tokenizer(self, model_dir): + model_type = get_model_type(model_dir) + if model_type in (Models.structbert, Models.gpt3, Models.palm): + from modelscope.models.nlp.structbert import SbertTokenizerFast + return SbertTokenizerFast.from_pretrained(model_dir) + elif model_type == Models.veco: + from modelscope.models.nlp.veco import VecoTokenizerFast + return VecoTokenizerFast.from_pretrained(model_dir) + else: + return AutoTokenizer.from_pretrained(model_dir) + + def 
__call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + sentence2 (str): a sentence + Example: + 'you are so beautiful.' + Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, labels = self.parse_text_and_label(data) + output = self.tokenizer( + text_a, + text_b, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + self.labels_to_id(labels, output) + return output + + def parse_text_and_label(self, data): + text_a, text_b, labels = None, None, None + if isinstance(data, str): + text_a = data + elif isinstance(data, tuple) or isinstance(data, list): + if len(data) == 3: + text_a, text_b, labels = data + elif len(data) == 2: + if self.pair: + text_a, text_b = data + else: + text_a, labels = data + elif isinstance(data, dict): + text_a = data.get(self.first_sequence) + text_b = data.get(self.second_sequence) + labels = data.get(self.label) + + return text_a, text_b, labels + + def labels_to_id(self, labels, output): + + def label_can_be_mapped(label): + return isinstance(label, str) or isinstance(label, int) + + if labels is not None: + if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \ + and self.label2id is not None: + output[OutputKeys.LABEL] = [ + self.label2id[str(label)] for label in labels + ] + elif label_can_be_mapped(labels) and self.label2id is not None: + output[OutputKeys.LABEL] = self.label2id[str(labels)] + else: + output[OutputKeys.LABEL] = labels + + @PREPROCESSORS.register_module( - Fields.nlp, module_name='bert-seq-cls-tokenizer-finetune') -class SentenceSimilarityFinetunePreprocessor(SentenceSimilarityPreprocessor): - """Sentence similarity preprocessor in the finetune scenario + Fields.nlp, module_name=Preprocessors.nli_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) +class PairSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == 'inference' else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, pair=True, mode=mode, **kwargs) - Mainly added the label mapping procedure. 
- """ - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['padding'] = 'max_length' - super().__init__(model_dir, *args, **kwargs) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) +class SingleSentenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == 'inference' else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) +class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + self.sequence_length = kwargs.pop('sequence_length', 512) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) + + def __call__(self, data: Union[str, Dict], hypothesis_template: str, + candidate_labels: list) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str or dict): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + if isinstance(data, dict): + data = data.get(self.first_sequence) + + pairs = [[data, hypothesis_template.format(label)] + for label in candidate_labels] + + features = self.tokenizer( + pairs, + padding=True, + truncation=True, + max_length=self.sequence_length, + truncation_strategy='only_first', + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None) + return features @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) -class TextGenerationPreprocessor(NLPPreprocessorBase): +class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - def __init__(self, model_dir: str, tokenizer=None, *args, **kwargs): + def __init__(self, + model_dir: str, + tokenizer=None, + mode=ModeKeys.INFERENCE, + **kwargs): self.tokenizer = self.build_tokenizer( model_dir) if tokenizer is None else tokenizer - kwargs['truncation'] = True - kwargs['padding'] = True - kwargs['return_tensors'] = 'pt' - kwargs['return_token_type_ids'] = False + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', True) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + False) kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, *args, **kwargs) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) @staticmethod def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: @@ -240,19 +308,13 @@ class TextGenerationPreprocessor(NLPPreprocessorBase): roberta_tokenizer_dir, do_lower_case=False) return super().build_tokenizer(model_dir) - -@PREPROCESSORS.register_module( - Fields.nlp, module_name='palm-text-gen-tokenizer-finetune') -class TextGenerationFinetunePreprocessor(TextGenerationPreprocessor): - - @type_assert(object, dict) - def __call__(self, data: dict) -> Dict[str, Any]: + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: + if self._mode == 'inference': + return super().__call__(data) src_txt = data['src_txt'] tgt_txt = data['tgt_txt'] src_rst = super().__call__(src_txt) tgt_rst = super().__call__(tgt_txt) - src_rst = {k: v.squeeze() 
for k, v in src_rst.items()} - tgt_rst = {k: v.squeeze() for k, v in tgt_rst.items()} return { 'src': src_rst['input_ids'], @@ -261,87 +323,69 @@ class TextGenerationFinetunePreprocessor(TextGenerationPreprocessor): } -@PREPROCESSORS.register_module(Fields.nlp) -class FillMaskPreprocessor(NLPPreprocessorBase): +@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) +class FillMaskPreprocessor(NLPTokenizerPreprocessorBase): - def __init__(self, model_dir: str, *args, **kwargs): - kwargs['truncation'] = True - kwargs['padding'] = 'max_length' - kwargs['return_tensors'] = 'pt' + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') kwargs['max_length'] = kwargs.pop('sequence_length', 128) - kwargs['return_token_type_ids'] = True - super().__init__(model_dir, *args, **kwargs) - - def build_tokenizer(self, model_dir): - from modelscope.utils.hub import get_model_type - model_type = get_model_type(model_dir) - if model_type in ['sbert', 'structbert', 'bert']: - from sofa import SbertTokenizer - return SbertTokenizer.from_pretrained(model_dir, use_fast=False) - elif model_type == 'veco': - from sofa import VecoTokenizer - return VecoTokenizer.from_pretrained(model_dir, use_fast=False) - else: - # TODO Only support veco & sbert - raise RuntimeError(f'Unsupported model type: {model_type}') + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) @PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) -class TokenClassificationPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - super().__init__(model_dir, *args, **kwargs) - - @type_assert(object, str) - def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: - """process the raw input data + Fields.nlp, + module_name=Preprocessors.word_segment_text_to_label_preprocessor) +class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - if isinstance(data, dict): - data = data[self.first_sequence] - text = data.replace(' ', '').strip() - tokens = [] - for token in text: - token = self.tokenizer.tokenize(token) - tokens.extend(token) - input_ids = self.tokenizer.convert_tokens_to_ids(tokens) - input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids) - attention_mask = [1] * len(input_ids) - token_type_ids = [0] * len(input_ids) + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.label = kwargs.pop('label', OutputKeys.LABELS) + + def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: + data = data.split(' ') + data = list(filter(lambda x: len(x) > 0, data)) + + def produce_train_sample(words): + chars = [] + labels = [] + for word in words: + chars.extend(list(word)) + if len(word) == 1: + labels.append('S-CWS') + else: + labels.extend(['B-CWS'] + ['I-CWS'] * (len(word) - 2) + + ['E-CWS']) + assert len(chars) == len(labels) + return chars, labels + + chars, labels = produce_train_sample(data) return { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'token_type_ids': token_type_ids + self.first_sequence: chars, + self.label: labels, } @PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) -class ZeroShotClassificationPreprocessor(NLPPreprocessorBase): - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) +class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): - Args: - model_dir (str): model path - """ - self.sequence_length = kwargs.pop('sequence_length', 512) - super().__init__(model_dir, *args, **kwargs) + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + kwargs['is_split_into_words'] = kwargs.pop( + 'is_split_into_words', + False if mode == ModeKeys.INFERENCE else True) + self.label_all_tokens = kwargs.pop('label_all_tokens', False) + super().__init__(model_dir, pair=False, mode=mode, **kwargs) - @type_assert(object, str) - def __call__(self, data, hypothesis_template: str, - candidate_labels: list) -> Dict[str, Any]: + def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: """process the raw input data Args: @@ -352,20 +396,74 @@ class ZeroShotClassificationPreprocessor(NLPPreprocessorBase): Returns: Dict[str, Any]: the preprocessed data """ - if isinstance(data, dict): - data = data.get(self.first_sequence) - pairs = [[data, hypothesis_template.format(label)] - for label in candidate_labels] - - features = self.tokenizer( - pairs, - padding=True, - truncation=True, - max_length=self.sequence_length, - return_tensors='pt', - truncation_strategy='only_first') - return features + # preprocess the data for the model input + # if isinstance(data, dict): + # data = data[self.first_sequence] + # text = data.replace(' ', '').strip() + # tokens = [] + # for token in text: + # token = self.tokenizer.tokenize(token) + # tokens.extend(token) + # input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + # input_ids = 
self.tokenizer.build_inputs_with_special_tokens(input_ids) + # attention_mask = [1] * len(input_ids) + # token_type_ids = [0] * len(input_ids) + + # new code to deal with labels + # tokenized_inputs = self.tokenizer(data, truncation=True, is_split_into_words=True) + + text_a = None + labels_list = None + if isinstance(data, str): + text_a = data + elif isinstance(data, dict): + text_a = data.get(self.first_sequence) + labels_list = data.get(self.label) + tokenized_inputs = self.tokenizer( + text_a, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + + if labels_list is not None: + assert self.label2id is not None + # Map that sends B-Xxx label to its I-Xxx counterpart + b_to_i_label = [] + label_enumerate_values = [ + k for k, v in sorted( + self.label2id.items(), key=lambda item: item[1]) + ] + for idx, label in enumerate(label_enumerate_values): + if label.startswith('B-') and label.replace( + 'B-', 'I-') in label_enumerate_values: + b_to_i_label.append( + label_enumerate_values.index( + label.replace('B-', 'I-'))) + else: + b_to_i_label.append(idx) + + label_row = [self.label2id[lb] for lb in labels_list] + word_ids = tokenized_inputs.word_ids() + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + if word_idx is None: + label_ids.append(-100) + elif word_idx != previous_word_idx: + label_ids.append(label_row[word_idx]) + else: + if self.label_all_tokens: + label_ids.append(b_to_i_label[label_row[word_idx]]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + labels = label_ids + tokenized_inputs['labels'] = labels + # new code end + + if self._mode == ModeKeys.INFERENCE: + tokenized_inputs[OutputKeys.TEXT] = text_a + return tokenized_inputs @PREPROCESSORS.register_module( diff --git a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py b/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py index 80036ed1..038ab09b 100644 --- a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py @@ -24,7 +24,7 @@ class DialogStateTrackingPreprocessor(Preprocessor): """ super().__init__(*args, **kwargs) - from sofa.models.space import SpaceConfig, SpaceTokenizer + from modelscope.models.nlp.space import SpaceConfig, SpaceTokenizer self.model_dir: str = model_dir self.config = SpaceConfig.from_pretrained(self.model_dir) self.tokenizer = SpaceTokenizer.from_pretrained(self.model_dir) diff --git a/modelscope/task_datasets/__init__.py b/modelscope/task_datasets/__init__.py index 5f0d9b1e..93e01cb5 100644 --- a/modelscope/task_datasets/__init__.py +++ b/modelscope/task_datasets/__init__.py @@ -7,12 +7,14 @@ if TYPE_CHECKING: from .base import TaskDataset from .builder import TASK_DATASETS, build_task_dataset from .torch_base_dataset import TorchTaskDataset + from .veco_dataset import VecoDataset else: _import_structure = { 'base': ['TaskDataset'], 'builder': ['TASK_DATASETS', 'build_task_dataset'], 'torch_base_dataset': ['TorchTaskDataset'], + 'veco_dataset': ['VecoDataset'], } import sys diff --git a/modelscope/task_datasets/base.py b/modelscope/task_datasets/base.py index a4104ced..39b791b1 100644 --- a/modelscope/task_datasets/base.py +++ b/modelscope/task_datasets/base.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from abc import ABC, abstractmethod -from typing import Any, List, Tuple +from typing import Any, List, Tuple, Union class TaskDataset(ABC): @@ -8,7 +8,7 @@ class TaskDataset(ABC): """ def __init__(self, - datasets: Tuple[Any, List[Any]], + datasets: Union[Any, List[Any]], mode, preprocessor=None, **kwargs): @@ -18,7 +18,7 @@ class TaskDataset(ABC): self._inner_dataset = self.prepare_dataset(datasets) @abstractmethod - def prepare_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any: + def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: """Prepare a dataset. User can process the input datasets in a whole dataset perspective. diff --git a/modelscope/task_datasets/torch_base_dataset.py b/modelscope/task_datasets/torch_base_dataset.py index 5ec9209e..014e4faa 100644 --- a/modelscope/task_datasets/torch_base_dataset.py +++ b/modelscope/task_datasets/torch_base_dataset.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, List, Tuple +from typing import Any, List, Tuple, Union from torch.utils.data import ConcatDataset, Dataset @@ -14,7 +14,7 @@ class TorchTaskDataset(TaskDataset, Dataset): """ def __init__(self, - datasets: Tuple[Any, List[Any]], + datasets: Union[Any, List[Any]], mode, preprocessor=None, **kwargs): @@ -26,7 +26,7 @@ class TorchTaskDataset(TaskDataset, Dataset): def __len__(self): return len(self._inner_dataset) - def prepare_dataset(self, datasets: Tuple[Any, List[Any]]) -> Any: + def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: """Prepare a dataset. User can process the input datasets in a whole dataset perspective. diff --git a/modelscope/task_datasets/veco_dataset.py b/modelscope/task_datasets/veco_dataset.py new file mode 100644 index 00000000..df7c6483 --- /dev/null +++ b/modelscope/task_datasets/veco_dataset.py @@ -0,0 +1,76 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, List, Union + +import numpy as np +from datasets import Dataset, IterableDataset, concatenate_datasets + +from modelscope.metainfo import Models +from modelscope.utils.constant import Tasks +from .builder import TASK_DATASETS +from .torch_base_dataset import TorchTaskDataset + + +@TASK_DATASETS.register_module(module_name=Models.veco, group_key=Tasks.nli) +class VecoDataset(TorchTaskDataset): + + def __init__(self, + datasets: Union[Any, List[Any]], + mode, + preprocessor=None, + **kwargs): + self.seed = kwargs.get('seed', 42) + self.permutation = None + self.datasets = None + super().__init__(datasets, mode, preprocessor, **kwargs) + + def switch_dataset(self, idx): + """Switch dataset in evaluation. + + Veco evaluates dataset one by one. + + Args: + idx: The index of the dataset + """ + if self.mode == 'train': + raise ValueError( + 'Only support switch dataset in the evaluation loop') + if idx >= len(self.datasets): + raise ValueError( + 'Index is bigger than the number of the datasets.') + self._inner_dataset = self.datasets[idx] + + def __getitem__(self, item): + if self.permutation is not None: + item = self.permutation[item] + return super().__getitem__(item) + + def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: + """Compose all the datasets. + + If the mode is 'train', all datasets will be mixed together, if the mode is 'eval', + the datasets will be kept and returns the first one. + + Args: + datasets: The datasets to be composed. + + Returns: The final dataset. 
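+
+        Example (illustrative; `ds_en`/`ds_fr` are hypothetical datasets and `p` a preprocessor):
+
+            # mode='train': the datasets are mixed into one (concatenated and
+            # shuffled/permuted with `seed`)
+            VecoDataset([ds_en, ds_fr], mode='train', preprocessor=p)
+            # mode='eval': only the first dataset is active at a time; VecoTrainer.evaluate
+            # walks through the rest by calling switch_dataset(idx)
+            VecoDataset([ds_en, ds_fr], mode='eval', preprocessor=p)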
+ """ + if not isinstance(datasets, (list, tuple)): + datasets = [datasets] + if self.mode == 'train': + if len(datasets) == 1: + return datasets[0] + elif all([ + isinstance(dataset, (Dataset, IterableDataset)) + for dataset in datasets + ]): + dataset = concatenate_datasets(list(datasets)) + return dataset.shuffle(seed=self.seed) + else: + generator = np.random.default_rng(self.seed) + _len = sum([len(dataset) for dataset in datasets]) + self.permutation = generator.permutation(_len) + return super().prepare_dataset(datasets) + else: + self.datasets = datasets + return self.datasets[0] diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index 350bab61..d802fd8b 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ -4,4 +4,5 @@ from .cv import (ImageInstanceSegmentationTrainer, ImagePortraitEnhancementTrainer) from .multi_modal import CLIPTrainer from .nlp import SequenceClassificationTrainer +from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer from .trainer import EpochBasedTrainer diff --git a/modelscope/trainers/hooks/evaluation_hook.py b/modelscope/trainers/hooks/evaluation_hook.py index aea27f2f..80d8c03c 100644 --- a/modelscope/trainers/hooks/evaluation_hook.py +++ b/modelscope/trainers/hooks/evaluation_hook.py @@ -32,6 +32,7 @@ class EvaluationHook(Hook): def do_evaluate(self, trainer): """Evaluate the results.""" eval_res = trainer.evaluate() + trainer.data_loader = trainer.train_dataloader for name, val in eval_res.items(): trainer.log_buffer.output[name] = val diff --git a/modelscope/trainers/hooks/lr_scheduler_hook.py b/modelscope/trainers/hooks/lr_scheduler_hook.py index cf3a16e7..9a5de392 100644 --- a/modelscope/trainers/hooks/lr_scheduler_hook.py +++ b/modelscope/trainers/hooks/lr_scheduler_hook.py @@ -21,9 +21,6 @@ class LrSchedulerHook(Hook): def __init__(self, by_epoch=True, warmup=None) -> None: super().__init__() self.by_epoch = by_epoch - if not self.by_epoch: - raise ValueError('We only support ``by_epoch=True`` now!') - self.warmup = warmup self.warmup_lr_scheduler = None @@ -49,6 +46,11 @@ class LrSchedulerHook(Hook): return lr def before_train_iter(self, trainer): + if not self.by_epoch: + if self.warmup_lr_scheduler is not None: + self.warmup_lr_scheduler.step() + else: + trainer.lr_scheduler.step() trainer.log_buffer.output[LogKeys.LR] = self._get_log_lr(trainer) def before_train_epoch(self, trainer): diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py new file mode 100644 index 00000000..c8121db6 --- /dev/null +++ b/modelscope/trainers/nlp_trainer.py @@ -0,0 +1,192 @@ +import os +from typing import Callable, Dict, Optional, Tuple, Union + +import torch +from torch import nn +from torch.utils.data import Dataset + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metrics.builder import build_metric +from modelscope.models.base import Model, TorchModel +from modelscope.msdatasets import MsDataset +from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.utils.config import Config, ConfigDict +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ModeKeys, + ModelFile, Tasks) +from .base import TRAINERS +from .trainer import EpochBasedTrainer + + +@TRAINERS.register_module(module_name='NlpEpochBasedTrainer') +class NlpEpochBasedTrainer(EpochBasedTrainer): + + def __init__( + self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + cfg_modify_fn: 
Optional[Callable] = None,
+            arg_parse_fn: Optional[Callable] = None,
+            data_collator: Optional[Callable] = None,
+            train_dataset: Optional[Union[MsDataset, Dataset]] = None,
+            eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
+            preprocessor: Optional[Preprocessor] = None,
+            optimizers: Tuple[torch.optim.Optimizer,
+                              torch.optim.lr_scheduler._LRScheduler] = (None,
+                                                                        None),
+            model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
+            **kwargs):
+        """Add code to adapt the base trainer to NLP models.
+
+        Args:
+            cfg_modify_fn: A function used to modify the cfg read out of the file.
+        """
+
+        if isinstance(model, str):
+            if os.path.exists(model):
+                model_dir = model if os.path.isdir(model) else os.path.dirname(
+                    model)
+            else:
+                model_dir = snapshot_download(model, revision=model_revision)
+            cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
+        else:
+            assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class'
+            model_dir = os.path.dirname(cfg_file)
+
+        self.cfg_modify_fn = cfg_modify_fn
+        self.cfg = self.rebuild_config(Config.from_file(cfg_file))
+        try:
+            labels = self.cfg.dataset.train.labels
+        except AttributeError:
+            labels = None
+
+        self.label2id = None
+        self.num_labels = None
+        if labels is not None and len(labels) > 0:
+            self.label2id = {label: idx for idx, label in enumerate(labels)}
+            self.id2label = {idx: label for idx, label in enumerate(labels)}
+            self.num_labels = len(labels)
+
+        def build_dataset_keys(cfg):
+            if cfg is not None:
+                input_keys = {
+                    'first_sequence': getattr(cfg, 'first_sequence', None),
+                    'second_sequence': getattr(cfg, 'second_sequence', None),
+                    'label': getattr(cfg, 'label', None),
+                }
+            else:
+                input_keys = {}
+
+            return {k: v for k, v in input_keys.items() if v is not None}
+
+        self.train_keys = build_dataset_keys(
+            self.cfg.dataset.train if hasattr(self.cfg, 'dataset')
+            and hasattr(self.cfg.dataset, 'train') else None)
+        # TODO: eval may have special keys, which is not supported yet,
+        # because there is only one preprocessor in the trainer, and it only supports one group of keys.
+        self.eval_keys = self.train_keys
+
+        super().__init__(
+            model=model_dir,
+            cfg_file=cfg_file,
+            arg_parse_fn=arg_parse_fn,
+            data_collator=data_collator,
+            preprocessor=preprocessor,
+            optimizers=optimizers,
+            model_revision=model_revision,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            **kwargs)
+
+    def rebuild_config(self, cfg: Config):
+        if self.cfg_modify_fn is not None:
+            return self.cfg_modify_fn(cfg)
+        return cfg
+
+    def build_model(self) -> Union[nn.Module, TorchModel]:
+        """ Instantiate a pytorch model and return.
+
+        By default, we will create a model using config from configuration file. You can
+        override this method in a subclass.
+
+        """
+        model_args = {} if self.num_labels is None else {
+            'num_labels': self.num_labels
+        }
+        model = Model.from_pretrained(
+            self.model_dir, cfg_dict=self.cfg, **model_args)
+        if not isinstance(model, nn.Module) and hasattr(model, 'model'):
+            return model.model
+        elif isinstance(model, nn.Module):
+            return model
+
+    def build_preprocessor(self) -> Preprocessor:
+        """Build the preprocessor.
+
+        Users can override this method to implement custom logic.
+
+        Returns: The preprocessor instance.
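+
+        A sketch of the ConfigDict assembled below and passed to the preprocessor builder
+        (illustrative; the concrete type and key values are assumptions read from the
+        model's configuration file):
+
+            {
+                **self.cfg.preprocessor,          # e.g. a registered NLP tokenizer preprocessor
+                'model_dir': self.model_dir,
+                'label2id': {'negative': 0, 'positive': 1},  # only when cfg.dataset.train.labels is set
+                'mode': ModeKeys.TRAIN,
+                'first_sequence': 'sentence',     # dataset key names taken from cfg.dataset.train
+                'label': 'label',
+            }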
+ + """ + model_args = {} if self.label2id is None else { + 'label2id': self.label2id + } + cfg = ConfigDict({ + **getattr(self.cfg, 'preprocessor'), + 'model_dir': + self.model_dir, + **model_args, + 'mode': + ModeKeys.TRAIN, + **self.train_keys, + }) + return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) + + +@TRAINERS.register_module(module_name='VecoTrainer') +class VecoTrainer(NlpEpochBasedTrainer): + + def evaluate(self, checkpoint_path=None): + """Veco evaluates the datasets one by one. + + """ + from modelscope.task_datasets import VecoDataset + self.model.eval() + self._mode = ModeKeys.EVAL + metric_values = {} + + if self.eval_dataset is None: + val_data = self.cfg.dataset.val + self.eval_dataset = self.build_dataset( + val_data, mode=ModeKeys.EVAL) + + idx = 0 + dataset_cnt = 1 + if isinstance(self.eval_dataset, VecoDataset): + self.eval_dataset.switch_dataset(idx) + dataset_cnt = len(self.eval_dataset.datasets) + + while True: + self.eval_dataloader = self._build_dataloader_with_dataset( + self.eval_dataset, **self.cfg.evaluation.get('dataloader', {})) + self.data_loader = self.eval_dataloader + + metric_classes = [ + build_metric(metric, default_args={'trainer': self}) + for metric in self.metrics + ] + self.evaluation_loop(self.eval_dataloader, checkpoint_path, + metric_classes) + + for m_idx, metric_cls in enumerate(metric_classes): + if f'eval_dataset[{idx}]' not in metric_values: + metric_values[f'eval_dataset[{idx}]'] = {} + metric_values[f'eval_dataset[{idx}]'][ + self.metrics[m_idx]] = metric_cls.evaluate() + + idx += 1 + if idx < dataset_cnt: + self.eval_dataset.switch_dataset(idx) + else: + break + + return metric_values diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index e83654a2..c5574f32 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -22,7 +22,8 @@ from modelscope.models.base import Model, TorchModel from modelscope.msdatasets.ms_dataset import MsDataset from modelscope.preprocessors import build_preprocessor from modelscope.preprocessors.base import Preprocessor -from modelscope.task_datasets import TorchTaskDataset, build_task_dataset +from modelscope.task_datasets.builder import build_task_dataset +from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.priority import Priority, get_priority from modelscope.trainers.lrscheduler.builder import build_lr_scheduler @@ -30,12 +31,12 @@ from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys, ModelFile, Tasks, TrainerStages) +from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg from modelscope.utils.tensor_utils import torch_default_data_collator from modelscope.utils.torch_utils import (broadcast, create_device, get_dist_info, init_dist) -from modelscope.utils.utils import if_func_receive_dict_inputs from .base import BaseTrainer from .builder import TRAINERS from .default_config import DEFAULT_CONFIG @@ -87,6 +88,7 @@ class EpochBasedTrainer(BaseTrainer): None), model_revision: Optional[str] = DEFAULT_MODEL_REVISION, **kwargs): + if isinstance(model, str): if os.path.exists(model): self.model_dir = model if os.path.isdir( @@ -108,9 +110,9 @@ class EpochBasedTrainer(BaseTrainer): 
self.model = model super().__init__(cfg_file, arg_parse_fn) - # add default config self.cfg.merge_from_dict(self._get_default_config(), force=False) + self.cfg = self.rebuild_config(self.cfg) if 'work_dir' in kwargs: self.work_dir = kwargs['work_dir'] @@ -130,9 +132,9 @@ class EpochBasedTrainer(BaseTrainer): self.device = create_device(device_name == 'cpu') self.train_dataset = self.to_task_dataset( - train_dataset, mode='train', preprocessor=self.preprocessor) + train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor) self.eval_dataset = self.to_task_dataset( - eval_dataset, mode='eval', preprocessor=self.preprocessor) + eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor) self.data_collator = data_collator if data_collator is not None else torch_default_data_collator self.metrics = self.get_metrics() @@ -168,6 +170,14 @@ class EpochBasedTrainer(BaseTrainer): if not is_parallel(self.model) and self._dist: self.model = self.to_parallel(self.model) + def rebuild_config(self, cfg: Config): + """A method used to rebuild the config, any subclass can override this method. + + Returns: The rebuilt config + + """ + return cfg + @property def mode(self): return self._mode @@ -203,7 +213,7 @@ class EpochBasedTrainer(BaseTrainer): return self._max_epochs * len(self.data_loader) def to_task_dataset(self, - datasets: Tuple[Dataset, List[Dataset]], + datasets: Union[Dataset, List[Dataset]], mode: str, preprocessor: Optional[Preprocessor] = None): """Build the task specific dataset processor for this trainer. @@ -229,17 +239,13 @@ class EpochBasedTrainer(BaseTrainer): cfg = ConfigDict( type=self.cfg.task, mode=mode, datasets=datasets) return build_task_dataset(cfg, self.cfg.task) - elif isinstance(datasets, - Dataset) or (isinstance(datasets, List) - and isinstance(datasets[0], Dataset)): + else: cfg = ConfigDict( - type=self.cfg.model.type, mode=mode, datasets=datasets) + type=self.cfg.model.type, + mode=mode, + datasets=datasets, + preprocessor=preprocessor) return build_task_dataset(cfg, self.cfg.task) - else: - raise ValueError( - f'invalid datasets type: {type(datasets)}, ' - f'expected `MsDataset`, `torch.utils.data.Dataset` or list of them.' - ) except Exception: if isinstance(datasets, (List, Tuple)) or preprocessor is not None: return TorchTaskDataset( @@ -262,8 +268,11 @@ class EpochBasedTrainer(BaseTrainer): # TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor # when they are different ones in training and evaluation cfg = ConfigDict({ - **getattr(self.cfg, 'preprocessor'), 'model_dir': - self.model_dir + **getattr(self.cfg, 'preprocessor'), + 'model_dir': + self.model_dir, + 'mode': + ModeKeys.TRAIN, }) return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) @@ -324,6 +333,8 @@ class EpochBasedTrainer(BaseTrainer): **self.cfg.evaluation.get('dataloader', {})) self.data_loader = self.eval_dataloader metric_classes = [build_metric(metric) for metric in self.metrics] + for m in metric_classes: + m.trainer = self metric_values = self.evaluation_loop(self.eval_dataloader, checkpoint_path, metric_classes) @@ -338,10 +349,9 @@ class EpochBasedTrainer(BaseTrainer): """ Instantiate a pytorch model and return. By default, we will create a model using config from configuration file. You can - subclass and override this method in a subclass. + override this method in a subclass. 
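+
+        A simplified override sketch (illustrative; NlpEpochBasedTrainer.build_model in this
+        patch is the full version):
+
+            class MyTrainer(EpochBasedTrainer):
+
+                def build_model(self):
+                    model = Model.from_pretrained(self.model_dir)
+                    # unwrap Model wrappers that keep the actual nn.Module in `.model`
+                    return model.model if hasattr(model, 'model') else model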
""" - # TODO temp implementation, waiting for @zhangzhicheng model = Model.from_pretrained(self.model_dir) if not isinstance(model, nn.Module) and hasattr(model, 'model'): return model.model @@ -412,9 +422,8 @@ class EpochBasedTrainer(BaseTrainer): self._mode = ModeKeys.TRAIN inputs = self.collate_fn(inputs) # call model forward but not __call__ to skip postprocess - if isinstance( - inputs, - Mapping) and not if_func_receive_dict_inputs(model.forward): + if isinstance(inputs, + Mapping) and not func_receive_dict_inputs(model.forward): train_outputs = model.forward(**inputs) else: train_outputs = model.forward(inputs) @@ -495,7 +504,7 @@ class EpochBasedTrainer(BaseTrainer): if self.eval_dataset is None: val_data = self.cfg.dataset.val self.eval_dataset = self.build_dataset( - val_data, mode=ModeKeys.TRAIN) + val_data, mode=ModeKeys.EVAL) batch_size = self.cfg.evaluation.batch_size workers = self.cfg.evaluation.workers @@ -523,7 +532,8 @@ class EpochBasedTrainer(BaseTrainer): ) torch_dataset = dataset.to_torch_dataset( preprocessors=self.preprocessor, ) - return torch_dataset + dataset = self.to_task_dataset(torch_dataset, mode) + return dataset def create_optimizer_and_scheduler(self): """ Create optimizer and lr scheduler diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index c30d1d15..a90a58b6 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -10,9 +10,9 @@ import torch from torch import distributed as dist from tqdm import tqdm +from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, make_tmp_dir) -from modelscope.utils.utils import if_func_receive_dict_inputs def single_gpu_test(model, @@ -37,18 +37,19 @@ def single_gpu_test(model, if data_collate_fn is not None: data = data_collate_fn(data) with torch.no_grad(): - if isinstance(data, - Mapping) and not if_func_receive_dict_inputs( - model.forward): - - result = model(**data) + if isinstance(data, Mapping) and not func_receive_dict_inputs( + model.forward): + result = model.forward(**data) else: - result = model(data) + result = model.forward(data) if metric_classes is not None: for metric_cls in metric_classes: metric_cls.add(result, data) - batch_size = len(result) + if isinstance(data, dict): + batch_size = len(next(iter(data.values()))) + else: + batch_size = len(data) for _ in range(batch_size): pbar.update() @@ -101,16 +102,18 @@ def multi_gpu_test(model, data = data_collate_fn(data) data_list.append(data) with torch.no_grad(): - if isinstance(data, - Mapping) and not if_func_receive_dict_inputs( - model.forward): - result = model(**data) + if isinstance(data, Mapping) and not func_receive_dict_inputs( + model.forward): + result = model.forward(**data) else: - result = model(data) + result = model.forward(data) results.append(result) if rank == 0: - batch_size = len(result) + if isinstance(data, dict): + batch_size = len(next(iter(data.values()))) + else: + batch_size = len(data) batch_size_all = batch_size * world_size count += batch_size_all if count > len(dataset): diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index b7b32c81..b8ee1258 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -16,9 +16,9 @@ from modelscope.fileio.file import LocalStorage from modelscope.metainfo import (Heads, Metrics, Models, Pipelines, Preprocessors, TaskModels, Trainers) from modelscope.utils.constant import 
Fields, Tasks +from modelscope.utils.file_utils import get_default_cache_dir from modelscope.utils.logger import get_logger from modelscope.utils.registry import default_group -from modelscope.utils.utils import get_default_cache_dir logger = get_logger() storage = LocalStorage() diff --git a/modelscope/utils/utils.py b/modelscope/utils/file_utils.py similarity index 96% rename from modelscope/utils/utils.py rename to modelscope/utils/file_utils.py index c2c47092..a04d890f 100644 --- a/modelscope/utils/utils.py +++ b/modelscope/utils/file_utils.py @@ -5,7 +5,7 @@ import os # TODO: remove this api, unify to flattened args -def if_func_receive_dict_inputs(func): +def func_receive_dict_inputs(func): """to decide if a func could recieve dict inputs or not Args: diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 5af67944..6e5326f4 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -98,4 +98,14 @@ def parse_label_mapping(model_dir): label_mapping = json.load(f) label2id = {name: idx for name, idx in label_mapping.items()} + if label2id is None: + config_path = os.path.join(model_dir, ModelFile.CONFIGURATION) + config = Config.from_file(config_path) + if hasattr(config, 'model') and hasattr(config.model, 'label2id'): + label2id = config.model.label2id + if label2id is None: + config_path = os.path.join(model_dir, 'config.json') + config = Config.from_file(config_path) + if hasattr(config, 'label2id'): + label2id = config.label2id return label2id diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index 01b68f78..aca103d2 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -68,7 +68,7 @@ def torch_default_data_collator(features): ) and v is not None and not isinstance(v, str): if isinstance(v, torch.Tensor): batch[k] = torch.stack([f[k] for f in features]) - elif isinstance(v, list): + elif isinstance(v, list) and isinstance(v[0], torch.Tensor): batch[k] = torch.stack([d for f in features for d in f[k]]) else: batch[k] = torch.tensor(np.array([f[k] for f in features])) diff --git a/requirements/nlp.txt b/requirements/nlp.txt index deb6a5bd..c69174fe 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -4,5 +4,5 @@ pai-easynlp # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated rouge_score<=0.0.4 -sofa>=1.0.5 +seqeval spacy>=2.3.5 diff --git a/requirements/runtime.txt b/requirements/runtime.txt index fbf33854..5675f031 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -15,5 +15,5 @@ setuptools tensorboard tokenizers tqdm>=4.64.0 -transformers>=4.10.3 +transformers>=4.12.0 yapf diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/metrics/test_token_classification_metrics.py b/tests/metrics/test_token_classification_metrics.py new file mode 100644 index 00000000..b249b227 --- /dev/null +++ b/tests/metrics/test_token_classification_metrics.py @@ -0,0 +1,44 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
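+#
+# The test below is a self-contained sanity check: it fakes a trainer object carrying a
+# label2id mapping ('B-obj'/'I-obj'/'O'), feeds one hand-crafted batch of logits and
+# labels for 8 tokens into TokenClassificationMetric, and asserts the resulting
+# precision/recall/accuracy values.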
+ +import unittest + +import numpy as np + +from modelscope.metrics.token_classification_metric import \ + TokenClassificationMetric +from modelscope.utils.test_utils import test_level + + +class TestTokenClsMetrics(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_value(self): + metric = TokenClassificationMetric() + + class Trainer: + pass + + metric.trainer = Trainer() + metric.trainer.label2id = { + 'B-obj': 0, + 'I-obj': 1, + 'O': 2, + } + + outputs = { + 'logits': + np.array([[[2.0, 1.0, 0.5], [1.0, 1.5, 1.0], [2.0, 1.0, 3.0], + [2.4, 1.5, 4.0], [2.0, 1.0, 3.0], [2.4, 1.5, 1.7], + [2.0, 1.0, 0.5], [2.4, 1.5, 0.5]]]) + } + inputs = {'labels': np.array([[0, 1, 2, 2, 0, 1, 2, 2]])} + metric.add(outputs, inputs) + ret = metric.evaluate() + self.assertTrue(np.isclose(ret['precision'], 0.25)) + self.assertTrue(np.isclose(ret['recall'], 0.5)) + self.assertTrue(np.isclose(ret['accuracy'], 0.5)) + print(ret) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/models/test_base_torch.py b/tests/models/test_base_torch.py index dcdf79be..c147259b 100644 --- a/tests/models/test_base_torch.py +++ b/tests/models/test_base_torch.py @@ -21,8 +21,8 @@ class TorchBaseTest(unittest.TestCase): self.conv1 = nn.Conv2d(1, 20, 5) self.conv2 = nn.Conv2d(20, 20, 5) - def forward(self, x): - x = F.relu(self.conv1(x)) + def forward(self, input): + x = F.relu(self.conv1(input)) return F.relu(self.conv2(x)) model = MyTorchModel() @@ -41,8 +41,8 @@ class TorchBaseTest(unittest.TestCase): self.conv1 = nn.Conv2d(1, 20, 5) self.conv2 = nn.Conv2d(20, 20, 5) - def forward(self, x): - x = F.relu(self.conv1(x)) + def forward(self, input): + x = F.relu(self.conv1(input)) return F.relu(self.conv2(x)) def postprocess(self, x): diff --git a/tests/pipelines/test_csanmt_translation.py b/tests/pipelines/test_csanmt_translation.py index 449b0cb7..a5c29f16 100644 --- a/tests/pipelines/test_csanmt_translation.py +++ b/tests/pipelines/test_csanmt_translation.py @@ -12,7 +12,7 @@ class TranslationTest(unittest.TestCase): model_id = 'damo/nlp_csanmt_translation' inputs = 'Gut@@ ach : Incre@@ ased safety for pedestri@@ ans' - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline(task=Tasks.translation, model=self.model_id) print(pipeline_ins(input=self.inputs)) diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index b028cfbe..2f57b2d8 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -45,7 +45,7 @@ class FillMaskTest(unittest.TestCase): model_dir = snapshot_download(self.model_id_sbert[language]) preprocessor = FillMaskPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) - model = StructBertForMaskedLM(model_dir) + model = StructBertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) @@ -60,7 +60,7 @@ class FillMaskTest(unittest.TestCase): model_dir = snapshot_download(self.model_id_veco) preprocessor = FillMaskPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) - model = VecoForMaskedLM(model_dir) + model = VecoForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) 
@@ -77,7 +77,7 @@ class FillMaskTest(unittest.TestCase): model_dir = snapshot_download(self.model_id_bert) preprocessor = FillMaskPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) - model = BertForMaskedLM(model_dir) + model = BertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.fill_mask, model=model, preprocessor=preprocessor) diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index 8d5d3dfa..f477fb37 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -3,10 +3,10 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SbertForNLI +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import NLIPipeline -from modelscope.preprocessors import NLIPreprocessor +from modelscope.pipelines.nlp import PairSentenceClassificationPipeline +from modelscope.preprocessors import PairSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -19,9 +19,10 @@ class NLITest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = NLIPreprocessor(cache_path) - model = SbertForNLI(cache_path, tokenizer=tokenizer) - pipeline1 = NLIPipeline(model, preprocessor=tokenizer) + tokenizer = PairSentenceClassificationPreprocessor(cache_path) + model = SbertForSequenceClassification.from_pretrained(cache_path) + pipeline1 = PairSentenceClassificationPipeline( + model, preprocessor=tokenizer) pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}') @@ -33,7 +34,7 @@ class NLITest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = NLIPreprocessor(model.model_dir) + tokenizer = PairSentenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.nli, model=model, preprocessor=tokenizer) print(pipeline_ins(input=(self.sentence1, self.sentence2))) diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 8cfb2c20..7a30d779 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -4,10 +4,10 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SbertForSentenceSimilarity +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SentenceSimilarityPipeline -from modelscope.preprocessors import SentenceSimilarityPreprocessor +from modelscope.pipelines.nlp import PairSentenceClassificationPipeline +from modelscope.preprocessors import PairSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -20,9 +20,10 @@ class SentenceSimilarityTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test 
level') def test_run(self): cache_path = snapshot_download(self.model_id) - tokenizer = SentenceSimilarityPreprocessor(cache_path) - model = SbertForSentenceSimilarity(cache_path, tokenizer=tokenizer) - pipeline1 = SentenceSimilarityPipeline(model, preprocessor=tokenizer) + tokenizer = PairSentenceClassificationPreprocessor(cache_path) + model = SbertForSequenceClassification.from_pretrained(cache_path) + pipeline1 = PairSentenceClassificationPipeline( + model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentence_similarity, model=model, preprocessor=tokenizer) print('test1') @@ -36,7 +37,7 @@ class SentenceSimilarityTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SentenceSimilarityPreprocessor(model.model_dir) + tokenizer = PairSentenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentence_similarity, model=model, diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index 53031e9d..82c068be 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -3,11 +3,10 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import (SbertForSentimentClassification, - SequenceClassificationModel) +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SentimentClassificationPipeline -from modelscope.preprocessors import SentimentClassificationPreprocessor +from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline +from modelscope.preprocessors import SingleSentenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -19,46 +18,52 @@ class SentimentClassificationTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = SentimentClassificationPreprocessor(cache_path) - model = SequenceClassificationModel.from_pretrained( + tokenizer = SingleSentenceClassificationPreprocessor(cache_path) + model = SbertForSequenceClassification.from_pretrained( self.model_id, num_labels=2) - pipeline1 = SentimentClassificationPipeline( + pipeline1 = SingleSentenceClassificationPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentiment_classification, model=model, - preprocessor=tokenizer, - model_revision='beta') + preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\n' f'pipeline1:{pipeline1(input=self.sentence1)}') print() print(f'sentence1: {self.sentence1}\n' f'pipeline1: {pipeline2(input=self.sentence1)}') + self.assertTrue( + isinstance(pipeline1.model, SbertForSequenceClassification)) + self.assertTrue( + isinstance(pipeline2.model, SbertForSequenceClassification)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = SentimentClassificationPreprocessor(model.model_dir) + tokenizer = SingleSentenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.sentiment_classification, model=model, - preprocessor=tokenizer, - 
model_revision='beta') + preprocessor=tokenizer) print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SbertForSequenceClassification)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline( - task=Tasks.sentiment_classification, - model=self.model_id, - model_revision='beta') + task=Tasks.sentiment_classification, model=self.model_id) print(pipeline_ins(input=self.sentence1)) + print(pipeline_ins.model.__class__) + self.assertTrue( + isinstance(pipeline_ins.model, SbertForSequenceClassification)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): - pipeline_ins = pipeline( - task=Tasks.sentiment_classification, model_revision='beta') + pipeline_ins = pipeline(task=Tasks.sentiment_classification) print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SbertForSequenceClassification)) if __name__ == '__main__': diff --git a/tests/pipelines/test_sentiment_classification_task_model.py b/tests/pipelines/test_sentiment_classification_task_model.py new file mode 100644 index 00000000..2808ec84 --- /dev/null +++ b/tests/pipelines/test_sentiment_classification_task_model.py @@ -0,0 +1,70 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp.task_models.sequence_classification import \ + SequenceClassificationModel +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import SingleSentenceClassificationPipeline +from modelscope.preprocessors import SingleSentenceClassificationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class SentimentClassificationTaskModelTest(unittest.TestCase): + model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' + sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音' + + @unittest.skip + def test_run_with_direct_file_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = SingleSentenceClassificationPreprocessor(cache_path) + model = SequenceClassificationModel.from_pretrained( + self.model_id, num_labels=2) + pipeline1 = SingleSentenceClassificationPipeline( + model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.sentiment_classification, + model=model, + preprocessor=tokenizer, + model_revision='beta') + print(f'sentence1: {self.sentence1}\n' + f'pipeline1:{pipeline1(input=self.sentence1)}') + print() + print(f'sentence1: {self.sentence1}\n' + f'pipeline1: {pipeline2(input=self.sentence1)}') + + @unittest.skip + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id, revision='beta') + tokenizer = SingleSentenceClassificationPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.sentiment_classification, + model=model, + preprocessor=tokenizer) + print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SequenceClassificationModel)) + + @unittest.skip + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.sentiment_classification, + model=self.model_id, + model_revision='beta') + print(pipeline_ins(input=self.sentence1)) + self.assertTrue( + isinstance(pipeline_ins.model, SequenceClassificationModel)) + + @unittest.skip + def test_run_with_default_model(self): + 
        pipeline_ins = pipeline(
+            task=Tasks.sentiment_classification, model_revision='beta')
+        print(pipeline_ins(input=self.sentence1))
+        self.assertTrue(
+            isinstance(pipeline_ins.model, SequenceClassificationModel))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py
index fd397de3..c391e0a1 100644
--- a/tests/pipelines/test_text_generation.py
+++ b/tests/pipelines/test_text_generation.py
@@ -39,7 +39,7 @@ class TextGenerationTest(unittest.TestCase):
         for model_id, input in ((self.palm_model_id_zh, self.palm_input_zh),
                                 (self.palm_model_id_en, self.palm_input_en)):
             cache_path = snapshot_download(model_id)
-            model = PalmForTextGeneration(cache_path)
+            model = PalmForTextGeneration.from_pretrained(cache_path)
             preprocessor = TextGenerationPreprocessor(
                 cache_path,
                 model.tokenizer,
diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py
index 5e3571f7..98fab808 100644
--- a/tests/pipelines/test_word_segmentation.py
+++ b/tests/pipelines/test_word_segmentation.py
@@ -20,7 +20,7 @@ class WordSegmentationTest(unittest.TestCase):
     def test_run_by_direct_model_download(self):
         cache_path = snapshot_download(self.model_id)
         tokenizer = TokenClassificationPreprocessor(cache_path)
-        model = SbertForTokenClassification(cache_path, tokenizer=tokenizer)
+        model = SbertForTokenClassification.from_pretrained(cache_path)
         pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer)
         pipeline2 = pipeline(
             Tasks.word_segmentation, model=model, preprocessor=tokenizer)
diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py
index df0098f0..ee0b5bae 100644
--- a/tests/pipelines/test_zero_shot_classification.py
+++ b/tests/pipelines/test_zero_shot_classification.py
@@ -3,7 +3,7 @@ import unittest

 from modelscope.hub.snapshot_download import snapshot_download
 from modelscope.models import Model
-from modelscope.models.nlp import SbertForZeroShotClassification
+from modelscope.models.nlp import SbertForSequenceClassification
 from modelscope.pipelines import pipeline
 from modelscope.pipelines.nlp import ZeroShotClassificationPipeline
 from modelscope.preprocessors import ZeroShotClassificationPreprocessor
@@ -21,7 +21,7 @@ class ZeroShotClassificationTest(unittest.TestCase):
     def test_run_with_direct_file_download(self):
         cache_path = snapshot_download(self.model_id)
         tokenizer = ZeroShotClassificationPreprocessor(cache_path)
-        model = SbertForZeroShotClassification(cache_path, tokenizer=tokenizer)
+        model = SbertForSequenceClassification.from_pretrained(cache_path)
         pipeline1 = ZeroShotClassificationPipeline(
             model, preprocessor=tokenizer)
         pipeline2 = pipeline(
diff --git a/tests/taskdataset/test_veco_dataset.py b/tests/taskdataset/test_veco_dataset.py
new file mode 100644
index 00000000..fc59750d
--- /dev/null
+++ b/tests/taskdataset/test_veco_dataset.py
@@ -0,0 +1,35 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+from modelscope.task_datasets.veco_dataset import VecoDataset
+from modelscope.utils.test_utils import test_level
+
+
+class TestVecoDataset(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_veco_dataset_train(self):
+        from datasets import Dataset
+        d0 = Dataset.from_dict({'a': [0, 1, 2]})
+        d1 = Dataset.from_dict({'a': [10, 11, 12, 13, 14]})
+        d2 = Dataset.from_dict({'a': [21, 22, 23, 24, 25, 26, 27]})
+        dataset = VecoDataset([d0, d1, d2], mode='train')
+        self.assertEqual(len(dataset), 15)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_veco_dataset_eval(self):
+        from datasets import Dataset
+        d0 = Dataset.from_dict({'a': [0, 1, 2]})
+        d1 = Dataset.from_dict({'a': [10, 11, 12, 13, 14]})
+        d2 = Dataset.from_dict({'a': [21, 22, 23, 24, 25, 26, 27]})
+        dataset = VecoDataset([d0, d1, d2], mode='eval')
+        self.assertEqual(len(dataset), 3)
+        dataset.switch_dataset(1)
+        self.assertEqual(len(dataset), 5)
+        dataset.switch_dataset(2)
+        self.assertEqual(len(dataset), 7)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py
index afb887a4..7e057ff0 100644
--- a/tests/trainers/hooks/test_lr_scheduler_hook.py
+++ b/tests/trainers/hooks/test_lr_scheduler_hook.py
@@ -270,6 +270,7 @@ class PlateauLrSchedulerHookTest(unittest.TestCase):
         trainer = build_trainer(trainer_name, kwargs)
         train_dataloader = trainer._build_dataloader_with_dataset(
             trainer.train_dataset, **trainer.cfg.train.get('dataloader', {}))
+        trainer.train_dataloader = train_dataloader
         trainer.data_loader = train_dataloader
         trainer.register_optimizers_hook()
         trainer.register_hook_from_cfg(trainer.cfg.train.hooks)
diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py
new file mode 100644
index 00000000..8e147f92
--- /dev/null
+++ b/tests/trainers/test_finetune_sequence_classification.py
@@ -0,0 +1,244 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
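The finetune tests that follow are driven by the new NlpEpochBasedTrainer and its cfg_modify_fn argument: the callback receives the model's configuration as a Config object and may adjust the task, preprocessor, dataset and train sections before the trainer is built. A minimal sketch of that contract, with placeholder values rather than the exact settings used in the tests below:

# Minimal sketch of the cfg_modify_fn flow assumed by these tests; values are placeholders.
from modelscope.trainers import build_trainer


def cfg_modify_fn(cfg):
    # cfg is the parsed model configuration; mutate it in place and return it.
    cfg.train.max_epochs = 1  # placeholder value
    cfg.train.optimizer.lr = 2e-5
    return cfg


kwargs = dict(
    model='damo/nlp_structbert_backbone_tiny_std',  # model id taken from the tests below
    train_dataset=None,  # a datasets.Dataset in real use
    eval_dataset=None,
    work_dir='/tmp',
    cfg_modify_fn=cfg_modify_fn)
trainer = build_trainer(name='NlpEpochBasedTrainer', default_args=kwargs)
# trainer.train()  # requires real datasets and a downloadable model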
+import os +import shutil +import tempfile +import unittest + +from modelscope.trainers import build_trainer + + +class TestFinetuneSequenceClassification(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def finetune(self, + model_id, + train_dataset, + eval_dataset, + name='NlpEpochBasedTrainer', + cfg_modify_fn=None, + **kwargs): + kwargs = dict( + model=model_id, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=self.tmp_dir, + cfg_modify_fn=cfg_modify_fn, + **kwargs) + + os.environ['LOCAL_RANK'] = '0' + trainer = build_trainer(name=name, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(10): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skip + def test_finetune_afqmc(self): + + def cfg_modify_fn(cfg): + cfg.task = 'sentence-similarity' + cfg['preprocessor'] = {'type': 'sen-sim-tokenizer'} + cfg.train.optimizer.lr = 2e-5 + cfg['dataset'] = { + 'train': { + 'labels': ['0', '1'], + 'first_sequence': 'sentence1', + 'second_sequence': 'sentence2', + 'label': 'label', + } + } + cfg.train.max_epochs = 10 + cfg.train.lr_scheduler = { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': + int(len(dataset['train']) / 32) * cfg.train.max_epochs, + 'options': { + 'by_epoch': False + } + } + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 100 + }] + return cfg + + from datasets import load_dataset + from datasets import DownloadConfig + dc = DownloadConfig() + dc.local_files_only = True + dataset = load_dataset('clue', 'afqmc', download_config=dc) + self.finetune( + model_id='damo/nlp_structbert_backbone_tiny_std', + train_dataset=dataset['train'], + eval_dataset=dataset['validation'], + cfg_modify_fn=cfg_modify_fn) + + @unittest.skip + def test_finetune_tnews(self): + + def cfg_modify_fn(cfg): + # TODO no proper task for tnews + cfg.task = 'nli' + cfg['preprocessor'] = {'type': 'nli-tokenizer'} + cfg.train.optimizer.lr = 2e-5 + cfg['dataset'] = { + 'train': { + 'labels': [ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', + '11', '12', '13', '14' + ], + 'first_sequence': + 'sentence', + 'label': + 'label', + } + } + cfg.train.max_epochs = 5 + cfg.train.lr_scheduler = { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': + int(len(dataset['train']) / 32) * cfg.train.max_epochs, + 'options': { + 'by_epoch': False + } + } + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 100 + }] + return cfg + + from datasets import load_dataset + from datasets import DownloadConfig + dc = DownloadConfig() + dc.local_files_only = True + dataset = load_dataset('clue', 'tnews', download_config=dc) + + self.finetune( + model_id='damo/nlp_structbert_backbone_tiny_std', + train_dataset=dataset['train'], + eval_dataset=dataset['validation'], + cfg_modify_fn=cfg_modify_fn) + + @unittest.skip + def 
test_veco_xnli(self): + from datasets import load_dataset + langs = ['en'] + langs_eval = ['en'] + train_datasets = [] + from datasets import DownloadConfig + dc = DownloadConfig() + dc.local_files_only = True + for lang in langs: + train_datasets.append( + load_dataset('xnli', lang, split='train', download_config=dc)) + eval_datasets = [] + for lang in langs_eval: + eval_datasets.append( + load_dataset( + 'xnli', lang, split='validation', download_config=dc)) + train_len = sum([len(dataset) for dataset in train_datasets]) + labels = ['0', '1', '2'] + + def cfg_modify_fn(cfg): + cfg.task = 'nli' + cfg['preprocessor'] = {'type': 'nli-tokenizer'} + cfg['dataset'] = { + 'train': { + 'first_sequence': 'premise', + 'second_sequence': 'hypothesis', + 'labels': labels, + 'label': 'label', + } + } + cfg['train'] = { + 'work_dir': + '/tmp', + 'max_epochs': + 2, + 'dataloader': { + 'batch_size_per_gpu': 16, + 'workers_per_gpu': 1 + }, + 'optimizer': { + 'type': 'AdamW', + 'lr': 2e-5, + 'options': { + 'cumulative_iters': 8, + } + }, + 'lr_scheduler': { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': int(train_len / 16) * 2, + 'options': { + 'by_epoch': False + } + }, + 'hooks': [{ + 'type': 'CheckpointHook', + 'interval': 1, + 'save_dir': '/root' + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 500 + }] + } + cfg['evaluation'] = { + 'dataloader': { + 'batch_size_per_gpu': 128, + 'workers_per_gpu': 1, + 'shuffle': False + } + } + return cfg + + self.finetune( + 'damo/nlp_veco_fill-mask-large', + train_datasets, + eval_datasets, + name='VecoTrainer', + cfg_modify_fn=cfg_modify_fn) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py new file mode 100644 index 00000000..7449bc69 --- /dev/null +++ b/tests/trainers/test_finetune_token_classificatin.py @@ -0,0 +1,200 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest +from functools import reduce + +from modelscope.trainers import build_trainer +from modelscope.utils.test_utils import test_level + + +class TestFinetuneTokenClassification(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def finetune(self, + model_id, + train_dataset, + eval_dataset, + name='NlpEpochBasedTrainer', + cfg_modify_fn=None, + **kwargs): + kwargs = dict( + model=model_id, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=self.tmp_dir, + cfg_modify_fn=cfg_modify_fn, + **kwargs) + + os.environ['LOCAL_RANK'] = '0' + trainer = build_trainer(name=name, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(10): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skip + def test_token_classification(self): + # WS task + os.system( + f'curl http://dingkun.oss-cn-hangzhou-zmf.aliyuncs.com/atemp/train.txt > {self.tmp_dir}/train.txt' + ) + os.system( + f'curl http://dingkun.oss-cn-hangzhou-zmf.aliyuncs.com/atemp/dev.txt > {self.tmp_dir}/dev.txt' + ) + from datasets import load_dataset + dataset = load_dataset( + 'text', + data_files={ + 'train': f'{self.tmp_dir}/train.txt', + 'test': f'{self.tmp_dir}/dev.txt' + }) + + def split_to_dict(examples): + text, label = examples['text'].split('\t') + return { + 'first_sequence': text.split(' '), + 'labels': label.split(' ') + } + + dataset = dataset.map(split_to_dict, batched=False) + + def reducer(x, y): + x = x.split(' ') if isinstance(x, str) else x + y = y.split(' ') if isinstance(y, str) else y + return x + y + + label_enumerate_values = list( + set(reduce(reducer, dataset['train'][:1000]['labels']))) + label_enumerate_values.sort() + + def cfg_modify_fn(cfg): + cfg.task = 'token-classification' + cfg['preprocessor'] = {'type': 'token-cls-tokenizer'} + cfg['dataset'] = { + 'train': { + 'labels': label_enumerate_values, + 'first_sequence': 'first_sequence', + 'label': 'labels', + } + } + cfg.train.max_epochs = 3 + cfg.train.lr_scheduler = { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': + int(len(dataset['train']) / 32) * cfg.train.max_epochs, + 'options': { + 'by_epoch': False + } + } + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 300 + }] + return cfg + + self.finetune( + 'damo/nlp_structbert_backbone_tiny_std', + dataset['train'], + dataset['test'], + cfg_modify_fn=cfg_modify_fn) + + @unittest.skip + def test_word_segmentation(self): + os.system( + f'curl http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip > {self.tmp_dir}/icwb2-data.zip' + ) + shutil.unpack_archive(f'{self.tmp_dir}/icwb2-data.zip', self.tmp_dir) + from datasets import load_dataset + from modelscope.preprocessors.nlp import WordSegmentationBlankSetToLabelPreprocessor + preprocessor = WordSegmentationBlankSetToLabelPreprocessor() + dataset = load_dataset( + 'text', + data_files=f'{self.tmp_dir}/icwb2-data/training/pku_training.utf8') + + def split_to_dict(examples): + return 
preprocessor(examples['text']) + + dataset = dataset.map(split_to_dict, batched=False) + + def reducer(x, y): + x = x.split(' ') if isinstance(x, str) else x + y = y.split(' ') if isinstance(y, str) else y + return x + y + + label_enumerate_values = list( + set(reduce(reducer, dataset['train'][:1000]['labels']))) + label_enumerate_values.sort() + + train_len = int(len(dataset['train']) * 0.7) + train_dataset = dataset['train'].select(range(train_len)) + dev_dataset = dataset['train'].select( + range(train_len, len(dataset['train']))) + + def cfg_modify_fn(cfg): + cfg.task = 'token-classification' + cfg['dataset'] = { + 'train': { + 'labels': label_enumerate_values, + 'first_sequence': 'first_sequence', + 'label': 'labels', + } + } + cfg['preprocessor'] = {'type': 'token-cls-tokenizer'} + cfg.train.max_epochs = 3 + cfg.train.lr_scheduler = { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': + int(len(train_dataset) / 32) * cfg.train.max_epochs, + 'options': { + 'by_epoch': False + } + } + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 50 + }] + return cfg + + self.finetune( + 'damo/nlp_structbert_backbone_tiny_std', + train_dataset, + dev_dataset, + cfg_modify_fn=cfg_modify_fn) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_text_generation_trainer.py b/tests/trainers/test_text_generation_trainer.py index 7c24bc0a..9c79f2f5 100644 --- a/tests/trainers/test_text_generation_trainer.py +++ b/tests/trainers/test_text_generation_trainer.py @@ -5,8 +5,7 @@ import tempfile import unittest from modelscope.hub.snapshot_download import snapshot_download -from modelscope.models.nlp.palm_for_text_generation import \ - PalmForTextGeneration +from modelscope.models.nlp.palm_v2 import PalmForTextGeneration from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.constant import ModelFile @@ -50,13 +49,21 @@ class TestTextGenerationTrainer(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_trainer(self): + + def cfg_modify_fn(cfg): + cfg.preprocessor.type = 'text-gen-tokenizer' + return cfg + kwargs = dict( model=self.model_id, train_dataset=self.dataset, eval_dataset=self.dataset, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + cfg_modify_fn=cfg_modify_fn, + model_revision='beta') - trainer = build_trainer(default_args=kwargs) + trainer = build_trainer( + name='NlpEpochBasedTrainer', default_args=kwargs) trainer.train() results_files = os.listdir(self.tmp_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) @@ -69,7 +76,7 @@ class TestTextGenerationTrainer(unittest.TestCase): if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) - cache_path = snapshot_download(self.model_id) + cache_path = snapshot_download(self.model_id, revision='beta') model = PalmForTextGeneration.from_pretrained(cache_path) kwargs = dict( cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), @@ -86,6 +93,44 @@ class TestTextGenerationTrainer(unittest.TestCase): for i in range(2): self.assertIn(f'epoch_{i+1}.pth', results_files) + @unittest.skip + def test_finetune_cnndm(self): + from datasets import load_dataset + dataset_dict = load_dataset('ccdv/cnn_dailymail', '3.0.0') + train_dataset = dataset_dict['train'] \ + .rename_columns({'article': 'src_txt', 'highlights': 
'tgt_txt'}) \ + .remove_columns('id') + eval_dataset = dataset_dict['validation'] \ + .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \ + .remove_columns('id') + num_warmup_steps = 2000 + + def noam_lambda(current_step: int): + current_step += 1 + return min(current_step**(-0.5), + current_step * num_warmup_steps**(-1.5)) + + def cfg_modify_fn(cfg): + cfg.train.lr_scheduler = { + 'type': 'LambdaLR', + 'lr_lambda': noam_lambda, + 'options': { + 'by_epoch': False + } + } + return cfg + + kwargs = dict( + model=self.model_id, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=self.tmp_dir, + cfg_modify_fn=cfg_modify_fn, + model_revision='beta') + trainer = build_trainer( + name='NlpEpochBasedTrainer', default_args=kwargs) + trainer.train() + if __name__ == '__main__': unittest.main() diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index 603d6e5b..a2d899ba 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -6,8 +6,8 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Metrics -from modelscope.models.nlp.sbert_for_sequence_classification import \ - SbertTextClassfier +from modelscope.models.nlp.sequence_classification import \ + SbertForSequenceClassification from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.constant import ModelFile @@ -102,7 +102,7 @@ class TestTrainerWithNlp(unittest.TestCase): model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' cache_path = snapshot_download(model_id) - model = SbertTextClassfier.from_pretrained(cache_path) + model = SbertForSequenceClassification.from_pretrained(cache_path) kwargs = dict( cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), model=model,